package org.meaningfulweb.cext.processors;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import org.meaningfulweb.cext.HtmlContentProcessor;
import org.meaningfulweb.util.XMLUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.jdom.Attribute;
import org.jdom.Content;
import org.jdom.Document;
import org.jdom.Element;
/**
 * Content processor that walks a JDOM document and records every
 * <code>img</code> element that passes the configured size and source-name
 * filters. Each accepted image is rendered with {@link XMLUtils#toHtml} and
 * registered through <code>addExtractedValue</code> under the key
 * <code>"html"</code>.
 */
public class ImageProcessor
  extends HtmlContentProcessor {

  /** Sentinel for a width/height attribute that is absent or not an integer. */
  private static final int UNKNOWN_DIMENSION = -1;

  /**
   * Maximum element nesting depth that is traversed. A negative value
   * disables the limit.
   */
  private int maxRecurseDepth = 200;
  private int imageMinWidth = 0;
  private int imageMinHeight = 0;

  /**
   * When true, an image whose width or height is unknown is only accepted if
   * the sentinel value itself satisfies the configured minimum.
   */
  private boolean removeImagesNoWidthHeight = true;

  /** Substrings that cause an image to be skipped when found in its src. */
  private Set<String> imageExclusions = new LinkedHashSet<String>();

  {
    // Default exclusions; currently empty.
    String[] commonImageNames = {};
    imageExclusions.addAll(Arrays.asList(commonImageNames));
  }

  /**
   * Recursively visits <code>node</code> and its descendants, recording every
   * acceptable <code>img</code> element.
   *
   * @param level nesting depth of <code>node</code>, starting at 0 for the
   *        nodes passed in by {@link #processContent(Document)}
   * @param node the node to inspect; non-element and null nodes are ignored
   */
  private void extractFromNodes(int level, Content node) {

    // Bound the recursion so that pathologically deep documents cannot
    // exhaust the stack. A negative limit means "unlimited".
    boolean depthExceeded = maxRecurseDepth >= 0 && level >= maxRecurseDepth;
    if (node == null || depthExceeded || !(node instanceof Element)) {
      return;
    }

    Element elem = (Element)node;
    if (StringUtils.equalsIgnoreCase(elem.getName(), "img")
      && isAcceptableImage(elem)) {
      addExtractedValue("html", XMLUtils.toHtml(elem));
    }

    List<Content> children = elem.getContent();
    if (children != null) {
      // Every child sits exactly one level below its parent; the depth must
      // not grow from one sibling to the next.
      int childLevel = level + 1;
      for (Content child : children) {
        extractFromNodes(childLevel, child);
      }
    }
  }

  /**
   * Decides whether an <code>img</code> element passes the exclusion and
   * minimum-size filters.
   */
  private boolean isAcceptableImage(Element img) {
    Attribute srcAttr = img.getAttribute("src");
    String src = (srcAttr != null) ? srcAttr.getValue() : null;
    if (isExcluded(src)) {
      return false;
    }
    int width = parseDimension(img.getAttribute("width"));
    int height = parseDimension(img.getAttribute("height"));
    return meetsMinimum(width, imageMinWidth)
      && meetsMinimum(height, imageMinHeight);
  }

  /**
   * Parses an attribute value as an integer, returning
   * {@link #UNKNOWN_DIMENSION} when the attribute is missing or its value is
   * not a plain integer.
   */
  private int parseDimension(Attribute attr) {
    if (attr == null) {
      return UNKNOWN_DIMENSION;
    }
    return NumberUtils.toInt(attr.getValue(), UNKNOWN_DIMENSION);
  }

  /**
   * Returns true when <code>value</code> reaches <code>minimum</code>, or when
   * the value is unknown and images without dimensions are being kept.
   */
  private boolean meetsMinimum(int value, int minimum) {
    if (!removeImagesNoWidthHeight && value == UNKNOWN_DIMENSION) {
      return true;
    }
    return value >= minimum;
  }

  /** Returns true when <code>src</code> contains any configured exclusion. */
  private boolean isExcluded(String src) {
    for (String excluded : imageExclusions) {
      // StringUtils.contains is null-safe, so a missing src never matches.
      if (StringUtils.contains(src, excluded)) {
        return true;
      }
    }
    return false;
  }

  /** @return the maximum nesting depth traversed; negative means unlimited */
  public int getMaxRecurseDepth() {
    return maxRecurseDepth;
  }

  /**
   * @param maxRecurseDepth the maximum nesting depth to traverse; a negative
   *        value disables the limit
   */
  public void setMaxRecurseDepth(int maxRecurseDepth) {
    this.maxRecurseDepth = maxRecurseDepth;
  }

  /** @return whether images lacking a usable width or height are filtered */
  public boolean isRemoveImagesNoWidthHeight() {
    return removeImagesNoWidthHeight;
  }

  /**
   * @param removeImagesNoWidthHeight false to accept images whose width or
   *        height is unknown regardless of the configured minimums
   */
  public void setRemoveImagesNoWidthHeight(boolean removeImagesNoWidthHeight) {
    this.removeImagesNoWidthHeight = removeImagesNoWidthHeight;
  }

  /** @return the minimum accepted value of the width attribute */
  public int getImageMinWidth() {
    return imageMinWidth;
  }

  /** @param imageMinWidth the minimum accepted value of the width attribute */
  public void setImageMinWidth(int imageMinWidth) {
    this.imageMinWidth = imageMinWidth;
  }

  /** @return the minimum accepted value of the height attribute */
  public int getImageMinHeight() {
    return imageMinHeight;
  }

  /** @param imageMinHeight the minimum accepted value of the height attribute */
  public void setImageMinHeight(int imageMinHeight) {
    this.imageMinHeight = imageMinHeight;
  }

  /** @return the src substrings that cause an image to be skipped */
  public Collection<String> getImageExclusions() {
    return imageExclusions;
  }

  /**
   * Replaces the exclusion list with a copy of the given collection, keeping
   * its iteration order and dropping duplicates. A null argument is ignored
   * and leaves the current exclusions in place.
   *
   * @param imageExclusions the new src substrings to exclude, or null
   */
  public void setImageExclusions(Collection<String> imageExclusions) {
    if (imageExclusions != null) {
      // Copy rather than alias the caller's collection; this also avoids an
      // unchecked cast.
      this.imageExclusions = new LinkedHashSet<String>(imageExclusions);
    }
  }

  /**
   * Scans everything below the document's root element for images.
   *
   * @param document the parsed document to scan
   * @return always true
   */
  @Override
  public boolean processContent(Document document) {
    Element rootElem = document.getRootElement();
    List<Content> contents = rootElem.getContent();
    for (Content child : contents) {
      extractFromNodes(0, child);
    }
    return true;
  }
}