package org.docear.metadata.extractors;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import org.docear.metadata.events.MetaDataListener;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.thoughtworks.xstream.XStream;
import com.thoughtworks.xstream.io.xml.DomDriver;
public abstract class HtmlDataExtractor implements MetaDataExtractor{
protected final static Logger logger = LoggerFactory.getLogger(HtmlDataExtractor.class);
private String userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0";
private String referrer = "http://www.google.com";
private int timeout = 3000;
private boolean followRedirects = true;
protected boolean debuglogging = false;
private String cookieFolder = System.getProperty("user.home");
protected String searchValue = "";
protected int maxResults = 3;
private Map<ExtractorConfigKey, Object> config = new HashMap<ExtractorConfigKey, Object>();
private ArrayList<MetaDataListener> listeners = new ArrayList<MetaDataListener>();
public enum CommonConfigKeys implements ExtractorConfigKey{
SEARCHVALUE,
TIMEOUT,
USERAGENT,
REFERRER,
FOLLOWREDIRECTS,
COOKIE_FOLDER,
MAXRESULTS,
DEBUGLOGGING;
}
public HtmlDataExtractor(){};
public HtmlDataExtractor(Map<ExtractorConfigKey, Object> config) throws MalformedConfigException{
readConfig(config);
}
public HtmlDataExtractor(Map<ExtractorConfigKey, Object> config, MetaDataListener listener) throws MalformedConfigException {
readConfig(config);
this.addListeners(listener);
}
protected void readConfig(Map<ExtractorConfigKey, Object> config) throws MalformedConfigException{
for(Object value : config.values()){
if(value == null){
logger.warn("Null value in config map.");
throw new MalformedConfigException();
}
}
try{
for(ExtractorConfigKey key : config.keySet()){
if(key instanceof CommonConfigKeys){
CommonConfigKeys commonKey = (CommonConfigKeys)key;
switch(commonKey){
case USERAGENT:
this.userAgent = (String) config.get(CommonConfigKeys.USERAGENT);
break;
case REFERRER:
this.referrer = (String) config.get(CommonConfigKeys.REFERRER);
break;
case COOKIE_FOLDER:
this.cookieFolder = (String) config.get(CommonConfigKeys.COOKIE_FOLDER);
break;
case TIMEOUT:
this.timeout = (Integer) config.get(CommonConfigKeys.TIMEOUT);
break;
case FOLLOWREDIRECTS:
this.followRedirects = (Boolean) config.get(CommonConfigKeys.FOLLOWREDIRECTS);
break;
case MAXRESULTS:
this.maxResults = (Integer) config.get(CommonConfigKeys.MAXRESULTS);
break;
case SEARCHVALUE:
this.searchValue = (String) config.get(CommonConfigKeys.SEARCHVALUE);
break;
case DEBUGLOGGING:
this.debuglogging = (Boolean) config.get(CommonConfigKeys.DEBUGLOGGING);
break;
default:
break;
}
}
}
}catch(ClassCastException e){
logger.error("Could not cast config parameter.", e);
throw new MalformedConfigException();
}
}
protected Connection getConnection(String URL) {
return Jsoup.connect(URL)
.ignoreContentType(true)
.userAgent(this.userAgent)
.referrer(this.referrer)
.timeout(this.timeout)
.followRedirects(this.followRedirects);
}
protected boolean saveCookies(Map<String, String> cookies, String cookieFileName) {
boolean correctSaved = true;
String path = getPath(cookieFileName);
XStream xStream = new XStream(new DomDriver());
xStream.alias("map", java.util.Map.class);
String xml = xStream.toXML(cookies);
FileOutputStream fos = null;
try {
fos = new FileOutputStream(path);
fos.write("<?xml version=\"1.0\"?>\n".getBytes("UTF-8"));
byte[] bytes = xml.getBytes("UTF-8");
fos.write(bytes);
} catch(Exception e) {
logger.error("Could not write cookie data to " + path, e);
correctSaved = false;
} finally {
if(fos!=null) {
try{
fos.close();
} catch (IOException e) {
logger.info("Could not write cookie data to " + path);
correctSaved = false;
}
}
}
return correctSaved;
}
protected Map<String, String> readCookies(String cookieFileName) {
String path = getPath(cookieFileName);
if(!new File(path).exists()){
return null;
}
XStream xStream = new XStream(new DomDriver());
xStream.alias("map", java.util.Map.class);
try{
@SuppressWarnings("unchecked")
Map<String,String> cookies = (Map<String,String>)xStream.fromXML(new File(path));
return cookies;
}catch(Exception e){
logger.info("Could not read cookie data from " + path);
}
return null;
}
protected String getPath(String cookieFileName) {
String path = this.cookieFolder;
if(path.endsWith(File.separator)){
path = path + cookieFileName;
}
else{
path = path + File.separator + cookieFileName;
}
return path;
}
public Map<ExtractorConfigKey, Object> getConfig() {
return config;
}
public void setConfig(Map<ExtractorConfigKey, Object> config) throws MalformedConfigException {
this.config = config;
readConfig(config);
}
public ArrayList<MetaDataListener> getListeners() {
return listeners;
}
public boolean removeListener(MetaDataListener listener) {
return this.listeners.remove(listener);
}
public void addListeners(MetaDataListener listener) {
this.listeners.add(listener);
}
}