package org.meaningfulweb.cext.processors;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.jdom.Attribute;
import org.jdom.Content;
import org.jdom.Document;
import org.jdom.Element;
import org.meaningfulweb.cext.HtmlContentProcessor;
import org.meaningfulweb.imgext.ExtractedContents;
import org.meaningfulweb.imgext.ImageFetcher;
import org.meaningfulweb.imgext.ImageFilter;
import org.meaningfulweb.imgext.ImageInfo;
import org.meaningfulweb.imgext.ImageMeta;
import org.meaningfulweb.imgext.ImageSelector;
import org.meaningfulweb.util.URLUtil;
/**
 * Content processor that walks a parsed document, collects metadata for every
 * {@code img} element that has a {@code src} attribute, and asks an
 * {@link ImageSelector} to pick the best one. The chosen image URI is stored
 * under the extracted key {@code "image"} and, when a positive size is
 * reported, its size under {@code "image-content-length"}.
 */
public class BestImageProcessor
  extends HtmlContentProcessor {

  // NOTE(review): the three settings below are only exposed through their
  // accessors; nothing in this class currently applies them while extracting.
  private int imageMinWidth = 0;
  private int imageMinHeight = 0;
  private boolean removeImagesNoWidthHeight = true;

  /** Maximum element nesting depth that {@link #extractFromNodes} descends. */
  private int maxRecurseDepth = 250;

  private final ImageFilter imageFilter = new ImageFilter();
  private final ImageFetcher imageFetcher = new ImageFetcher();
  private final ImageSelector imgSelector = new ImageSelector(imageFilter,
    imageFetcher);

  /**
   * Returns the value of the named attribute, or {@code null} when the
   * element does not carry that attribute.
   */
  private static String attributeValue(Element elem, String attrName) {
    Attribute attr = elem.getAttribute(attrName);
    return (attr == null) ? null : attr.getValue();
  }

  /**
   * Recursively visits {@code node} and its descendants, appending an
   * {@link ImageMeta} to {@code imgMetas} for every {@code img} element that
   * has a {@code src} attribute.
   *
   * @param baseUrl base URL passed to {@link URLUtil#toAbsoluteURL}
   * @param contextUrl URL of the page, passed to {@link URLUtil#toAbsoluteURL}
   * @param level nesting depth of {@code node}; the caller starts at 0
   * @param node the content node to inspect; {@code null} is ignored
   * @param imgMetas accumulator receiving the image metadata in visit order
   */
  private void extractFromNodes(String baseUrl, String contextUrl, int level,
    Content node, List<ImageMeta> imgMetas) {

    // Don't go on forever: extremely deep documents (spider traps) can kill
    // the JVM through stack overflow. A negative limit never triggers, which
    // matches the behavior of the previous equality-based check.
    boolean tooDeep = maxRecurseDepth >= 0 && level >= maxRecurseDepth;
    if (node == null || tooDeep) {
      return;
    }

    if (!(node instanceof Element)) {
      return;
    }
    Element elem = (Element)node;

    if (StringUtils.equalsIgnoreCase(elem.getName(), "img")) {

      // width, height and src are kept verbatim; the descriptive attributes
      // are lower-cased before being stored.
      String width = attributeValue(elem, "width");
      String height = attributeValue(elem, "height");
      String src = attributeValue(elem, "src");
      String title = StringUtils.lowerCase(attributeValue(elem, "title"));
      String alt = StringUtils.lowerCase(attributeValue(elem, "alt"));
      String onclick = StringUtils.lowerCase(attributeValue(elem, "onclick"));

      if (src != null) {
        String url = URLUtil.toAbsoluteURL(baseUrl, contextUrl, src);
        // the current list size doubles as the image's index in the page
        ImageMeta imgInfo = new ImageMeta(imgMetas.size(), alt, title, width,
          height, -1L, url, onclick);
        imgMetas.add(imgInfo);
      }
    }

    List<Content> children = elem.getContent();
    if (children != null && children.size() > 0) {
      // Every child sits exactly one level below this element. The depth is
      // passed as level + 1 (not ++level) so that siblings do not inflate
      // each other's depth and skip past the recursion limit.
      int childLevel = level + 1;
      for (Content child : children) {
        extractFromNodes(baseUrl, contextUrl, childLevel, child, imgMetas);
      }
    }
  }

  /** @return the maximum nesting depth descended while looking for images */
  public int getMaxRecurseDepth() {
    return maxRecurseDepth;
  }

  /**
   * @param maxRecurseDepth the maximum nesting depth to descend; nodes at this
   *        depth or deeper are skipped, a negative value disables the limit
   */
  public void setMaxRecurseDepth(int maxRecurseDepth) {
    this.maxRecurseDepth = maxRecurseDepth;
  }

  /** @return the stored remove-images-without-width/height flag */
  public boolean isRemoveImagesNoWidthHeight() {
    return removeImagesNoWidthHeight;
  }

  /** @param removeImagesNoWidthHeight the flag value to store */
  public void setRemoveImagesNoWidthHeight(boolean removeImagesNoWidthHeight) {
    this.removeImagesNoWidthHeight = removeImagesNoWidthHeight;
  }

  /** @return the stored minimum image width */
  public int getImageMinWidth() {
    return imageMinWidth;
  }

  /** @param imageMinWidth the minimum image width to store */
  public void setImageMinWidth(int imageMinWidth) {
    this.imageMinWidth = imageMinWidth;
  }

  /** @return the stored minimum image height */
  public int getImageMinHeight() {
    return imageMinHeight;
  }

  /** @param imageMinHeight the minimum image height to store */
  public void setImageMinHeight(int imageMinHeight) {
    this.imageMinHeight = imageMinHeight;
  }

  /**
   * Extracts the best image of the document unless an {@code "image"} value
   * has already been extracted.
   *
   * @param document the parsed document to scan for {@code img} elements
   * @return always {@code true}
   */
  @Override
  public boolean processContent(Document document) {

    Map<String,Object> extractedMap = this.getExtracted();

    // image already extracted
    if (extractedMap.containsKey("image")) {
      return true;
    }

    // NOTE(review): assumes the "url" metadata entry is present and is a
    // String; confirm against the code that populates the metadata.
    String url = (String)getMetadata().get("url");
    String protocol = URLUtil.getProtocol(url);
    String host = URLUtil.getHost(url);
    String baseUrl = protocol + "://" + host + "/";

    LinkedList<ImageMeta> imgMetas = new LinkedList<ImageMeta>();
    Element rootElem = document.getRootElement();
    List<Content> contents = rootElem.getContent();
    for (Content child : contents) {
      extractFromNodes(baseUrl, url, 0, child, imgMetas);
    }

    ExtractedContents extracted = new ExtractedContents(baseUrl, imgMetas);
    ImageInfo mediaContentInfo = imgSelector.getBestImage(extracted, baseUrl,
      true, true);
    if (mediaContentInfo != null) {
      addExtractedValue("image", mediaContentInfo.getUri());
      Long size = mediaContentInfo.getSize();
      // only record a size that is known and positive
      if (size != null && size > 0) {
        addExtractedValue("image-content-length", size);
      }
    }

    return true;
  }
}