/*
* Copyright 2011 Marek Pilecky
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.mefi.jkuuza.parser;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jsoup.select.Selector;
/**
* Holds methods wrapping Jsoup functions for extracting values from HTML elements.
* Annotated methods carry additional information, which can be used in the GUI.
*
* @author Marek Pilecky
*/
public class ContentExtractor extends ContentHelper {

    /**
     * Matches the {@code charset} parameter inside a Content-Type value,
     * ignoring letter case, surrounding whitespace and optional quotes.
     * Group 1 holds the charset name.
     */
    private static final Pattern CHARSET_PARAMETER =
            Pattern.compile("\\bcharset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /**
     * Creates an extractor without assigning a document. The inherited
     * {@code doc} field is not set here, so it has to be provided some other
     * way before the extraction methods are used.
     */
    public ContentExtractor() {
    }

    /**
     * Creates an extractor working on the given document.
     *
     * @param doc Jsoup Document; must not be null
     */
    public ContentExtractor(Document doc) {
        this.doc = doc;
    }

    /**
     * Gets title from page
     *
     * @return title of page
     */
    public String getTitle() {
        return doc.title();
    }

    /**
     * Returns description from meta tag. The whole document is searched,
     * whereas {@link #hasMetaDescription()} looks only into the head.
     *
     * @return value of the content attribute of the first description meta
     *         tag, or an empty string if there is no such tag or attribute
     */
    public String getMetaDescription() {
        return firstContentOf("meta[name=description]");
    }

    /**
     * Checks, if header of page contains meta tag with description
     *
     * @return true if the first description meta tag in the head has
     *         a content attribute
     */
    public boolean hasMetaDescription() {
        return headHasContent("meta[name=description]");
    }

    /**
     * Returns keywords from meta tag. The whole document is searched,
     * whereas {@link #hasMetaKeywords()} looks only into the head.
     *
     * @return value of the content attribute of the first keywords meta
     *         tag, or an empty string if there is no such tag or attribute
     */
    public String getMetaKeywords() {
        return firstContentOf("meta[name=keywords]");
    }

    /**
     * Checks, if header of page contains meta tag with keywords
     *
     * @return true if the first keywords meta tag in the head has
     *         a content attribute
     */
    public boolean hasMetaKeywords() {
        return headHasContent("meta[name=keywords]");
    }

    /**
     * Returns charset from the content-type meta tag, e.g. {@code UTF-8} for
     * {@code <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">}.
     * The charset parameter is located case-insensitively, so the media type
     * in front of it does not influence the result. The whole document is
     * searched, whereas {@link #hasMetaCharset()} looks only into the head.
     *
     * @return charset name as written in the content attribute, or an empty
     *         string if the tag or its charset parameter is missing
     */
    public String getMetaCharset() {
        String contentType = firstContentOf("meta[http-equiv=content-type]");
        Matcher matcher = CHARSET_PARAMETER.matcher(contentType);
        return matcher.find() ? matcher.group(1) : "";
    }

    /**
     * Checks, if header of page contains meta tag with charset
     *
     * @return true if the first content-type meta tag in the head has
     *         a content attribute
     */
    public boolean hasMetaCharset() {
        return headHasContent("meta[http-equiv=content-type]");
    }

    /**
     * Returns the text of the elements matching the selector, as produced by
     * {@code Elements.text()} on the result of the query.
     *
     * @param selector Jsoup CSS selector
     * @return text of the matched elements, or an empty string if the
     *         selector is null or empty
     */
    public String getValue(String selector) {
        if (selector == null || selector.isEmpty()) {
            return "";
        }
        return doc.select(selector).text();
    }

    /**
     * Returns the text of every element matching the selector, one list
     * entry per element, in the order the query returned them.
     *
     * @param selector Jsoup CSS selector
     * @return texts of the matched elements; an empty list if nothing
     *         matches or the selector is null or empty
     */
    public ArrayList<String> getValuesOf(String selector) {
        ArrayList<String> values = new ArrayList<String>();
        // Same convention as getValue: a missing selector yields an empty result.
        if (selector == null || selector.isEmpty()) {
            return values;
        }
        for (Element element : doc.select(selector)) {
            values.add(element.text());
        }
        return values;
    }

    /**
     * Reads the content attribute of the first element in the document
     * matching the selector.
     *
     * @param selector Jsoup CSS selector
     * @return attribute value, or an empty string if nothing matches
     */
    private String firstContentOf(String selector) {
        // first() yields null for an empty result, so it must be checked
        // before the attribute is read.
        Element first = doc.select(selector).first();
        return first == null ? "" : first.attr("content");
    }

    /**
     * Tells whether the first element in the document head matching the
     * selector carries a content attribute.
     *
     * @param selector Jsoup CSS selector
     * @return true if such an element exists and has the attribute
     */
    private boolean headHasContent(String selector) {
        Elements matches = doc.head().select(selector);
        return !matches.isEmpty() && matches.first().hasAttr("content");
    }
}