package org.meaningfulweb.cext.processors;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.log4j.Logger;
import org.jdom.Document;
import org.meaningfulweb.cext.HtmlContentProcessor;
import org.meaningfulweb.opengraph.OpenGraphParser;
import org.meaningfulweb.util.URIUtils;
import org.meaningfulweb.util.http.HttpClientFactory;
import org.meaningfulweb.util.http.HttpClientService;
/**
 * Content processor that chains several sub-processors over one document and
 * merges everything they extract into this processor's own extracted map.
 *
 * <p>The chain, in order, is: OpenGraph extraction, an element/meta-header
 * fallback for fields OpenGraph did not supply, Boilerpipe article extraction,
 * domain-specific image extraction (twitpic), and finally a best-image
 * fallback when no {@code "image"} value has been found. When an image is
 * known, its size is probed over HTTP and stored under
 * {@code "image-content-length"}.
 */
public class MeaningfulwebCompositeProcessor extends HtmlContentProcessor {

  private static final Logger logger = Logger.getLogger(MeaningfulwebCompositeProcessor.class);

  private final OpengraphContentProcessor _opengraphProcessor;
  private final ElementProcessor _elementProcessor;
  private final BoilerpipeArticleProcessor _boilerpipeProcessor;
  private final DomainSpecifiedImageProcessor _domainImgProcessor;
  private final BestImageProcessor _bestimageProcessor;

  /** Meta headers requested from the element processor when still missing. */
  private static final String[] INTERESTED_HEADERS = new String[]{"keywords","description","image"};

  /**
   * Builds the sub-processor chain: OpenGraph with include-all enabled, the
   * element processor with HTML extraction disabled, and twitpic handlers
   * registered for both the bare and the {@code www.} host name.
   */
  public MeaningfulwebCompositeProcessor() {
    _opengraphProcessor = new OpengraphContentProcessor();
    _opengraphProcessor.setIncludeAll(true);

    _elementProcessor = new ElementProcessor();
    _elementProcessor.setExtractHtml(false);

    _boilerpipeProcessor = new BoilerpipeArticleProcessor();

    _domainImgProcessor = new DomainSpecifiedImageProcessor();
    _domainImgProcessor.addExtractionHandler("twitpic.com", new TwitpicExtractionHandler());
    _domainImgProcessor.addExtractionHandler("www.twitpic.com", new TwitpicExtractionHandler());

    _bestimageProcessor = new BestImageProcessor();
  }

  /**
   * Sets the name on this processor and on every sub-processor.
   *
   * @param name the processor name to propagate
   */
  @Override
  public void setName(String name) {
    super.setName(name);
    _opengraphProcessor.setName(name);
    _elementProcessor.setName(name);
    _boilerpipeProcessor.setName(name);
    _domainImgProcessor.setName(name);
    _bestimageProcessor.setName(name);
  }

  /**
   * Sets the metadata map on this processor and on every sub-processor; the
   * same map instance is handed to all of them.
   *
   * @param metadata the metadata map to propagate
   */
  @Override
  public void setMetadata(Map<String, Object> metadata) {
    super.setMetadata(metadata);
    _opengraphProcessor.setMetadata(metadata);
    _elementProcessor.setMetadata(metadata);
    _boilerpipeProcessor.setMetadata(metadata);
    _domainImgProcessor.setMetadata(metadata);
    _bestimageProcessor.setMetadata(metadata);
  }

  /**
   * Runs the sub-processor chain over {@code document} and merges the results
   * into this processor's extracted map.
   *
   * @param document the parsed document to extract from
   * @return the result of the last sub-processor that was invoked: the
   *         best-image processor when it ran (no {@code "image"} had been
   *         extracted before it), otherwise the domain-specific image
   *         processor. Results of the earlier sub-processors only decide
   *         whether their own output is merged.
   */
  @Override
  public boolean processContent(Document document) {
    boolean success;
    Map<String,Object> currentlyExtracted = getExtracted();

    if (_opengraphProcessor.processContent(document)) {
      currentlyExtracted.putAll(_opengraphProcessor.getExtracted());
    }

    extractMissingHeadElements(document, currentlyExtracted);

    if (_boilerpipeProcessor.processContent(document)) {
      currentlyExtracted.putAll(_boilerpipeProcessor.getExtracted());
    }

    success = _domainImgProcessor.processContent(document);
    if (success) {
      currentlyExtracted.putAll(_domainImgProcessor.getExtracted());
    }

    // Only fall back to the best-image heuristic when nothing above found an image.
    if (currentlyExtracted.get("image") == null) {
      success = _bestimageProcessor.processContent(document);
      if (success) {
        currentlyExtracted.putAll(_bestimageProcessor.getExtracted());
      }
    }

    Object imgUrlObj = currentlyExtracted.get("image");
    if (imgUrlObj != null) {
      if (currentlyExtracted.get("image-content-length") == null) {
        if (imgUrlObj instanceof String) {
          probeImageContentLength((String) imgUrlObj, currentlyExtracted);
        } else {
          // A non-String value cannot be fetched; skip the probe instead of
          // failing the whole extraction with a ClassCastException.
          logger.warn("image value is not a String, skipping content-length probe: "
              + imgUrlObj.getClass().getName());
        }
      }
      if (currentlyExtracted.get("fullimage") == null) {
        currentlyExtracted.put("fullimage", imgUrlObj);
      }
    }
    return success;
  }

  /**
   * Asks the element processor for the title and for any interested meta
   * headers that are still absent from {@code extracted}, then merges what it
   * finds.
   *
   * <p>NOTE(review): elements/headers configured here are never cleared, so if
   * this instance is reused across documents the element processor keeps the
   * configuration from an earlier call - confirm whether instances are reused.
   */
  private void extractMissingHeadElements(Document document, Map<String,Object> extracted) {
    if (extracted.get("title") == null) {
      _elementProcessor.setElements(Arrays.asList(new String[]{"title"}));
    }

    Set<String> missingHeaders = new HashSet<String>();
    for (String header : INTERESTED_HEADERS) {
      if (extracted.get(header) == null) {
        missingHeaders.add(header);
      }
    }
    if (missingHeaders.size() > 0) {
      _elementProcessor.setHeaders(missingHeaders);
    }

    boolean hasWork = _elementProcessor.getHeaders().size() > 0
        || _elementProcessor.getElements().size() > 0;
    if (hasWork && _elementProcessor.processContent(document)) {
      mergeElementValues(_elementProcessor.getExtracted(), extracted);
    }
  }

  /**
   * Copies every non-null value from {@code source} into {@code target},
   * HTML-unescaping String values whose key is listed in
   * {@code OpenGraphParser.UNESCAPE_HTML_FIELDS}.
   */
  private void mergeElementValues(Map<String,Object> source, Map<String,Object> target) {
    for (Entry<String,Object> entry : source.entrySet()) {
      String key = entry.getKey();
      Object value = entry.getValue();
      if (value == null) {
        continue;
      }
      if (value instanceof String && OpenGraphParser.UNESCAPE_HTML_FIELDS.contains(key)) {
        value = StringEscapeUtils.unescapeHtml((String) value);
      }
      target.put(key, value);
    }
  }

  /**
   * Best-effort probe of the image size: issues a GET for {@code imgUrl} and
   * stores the entity's reported content length, as a String, under
   * {@code "image-content-length"}. Any failure is logged and otherwise
   * ignored.
   */
  private void probeImageContentLength(String imgUrl, Map<String,Object> extracted) {
    HttpClientService httpClient = HttpClientFactory.getHttpClientService();
    HttpGet httpGet = null;
    try {
      String requestUrl = imgUrl;
      if (!URIUtils.isValidURI(requestUrl)) {
        requestUrl = URIUtils.fixInvalidUri(requestUrl);
      }
      // Constructed inside the try: HttpGet rejects a null or still-malformed
      // URI with an unchecked exception, which must not abort the extraction.
      httpGet = new HttpGet(requestUrl);
      HttpEntity entity = httpClient.doGet(httpGet);
      if (entity != null) {
        extracted.put("image-content-length", String.valueOf(entity.getContentLength()));
      }
    } catch (Exception e) {
      logger.error(e.getMessage(), e);
    } finally {
      // Only the length is needed, so the request is aborted rather than
      // consumed; httpGet is null when construction itself failed.
      if (httpGet != null) {
        httpGet.abort();
      }
    }
  }
}