package org.meaningfulweb.cext.processors;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.meaningfulweb.cext.HtmlContentProcessor;
import org.meaningfulweb.util.XMLUtils;
import org.apache.commons.lang.StringUtils;
import org.jdom.Content;
import org.jdom.Document;
import org.jdom.Element;
public class ParagraphProcessor
extends HtmlContentProcessor {
private int sentenceThreshold = 2;
private int numParagraphs = 0;
private int maxRecurseDepth = 250;
private boolean extractHtml = true;
private boolean extractText = true;
private boolean onlyTitled = false;
private int minWords = 18;
private static Set<String> titleContainers = new HashSet<String>();
static {
String[] title = {"h1", "h2", "h3"};
titleContainers.addAll(Arrays.asList(title));
}
private static Set<String> ignoreContainers = new HashSet<String>();
static {
String[] ignore = {"h1", "h2", "h3", "h4", "h5", "head", "script",
"noscript", "style", "form", "meta", "input", "iframe", "embed", "hr",
"img", "link", "label", "table", "td", "th", "tr", "tbody", "thead",
"tfoot", "col", "colgroup", "ul", "ol"};
ignoreContainers.addAll(Arrays.asList(ignore));
}
private static Set<String> textContainers = new HashSet<String>();
static {
String[] containers = {"p", "span"};
textContainers.addAll(Arrays.asList(containers));
}
private boolean findTitledParagraphBlocks(int level, boolean titleFound,
Content node, List<Content> found) {
// don't go on forever, spider traps can kill JVM through stack overflow
if (level == maxRecurseDepth) {
return titleFound;
}
if (node instanceof Element) {
Element elem = (Element)node;
String name = StringUtils.lowerCase(elem.getName());
// ignore certain containers
if (ignoreContainers.contains(name)) {
return titleFound;
}
else {
List<Content> children = elem.getContent();
for (Content child : children) {
if (child instanceof Element) {
Element childElem = (Element)child;
String childName = StringUtils.lowerCase(childElem.getName());
if (titleContainers.contains(childName)) {
titleFound = true;
}
if (titleFound && textContainers.contains(childName)) {
// get the full text of the node tree, break into sentences and
// count the number of sentences
String contentText = XMLUtils.toText(childElem);
BreakIterator sentenceIt = BreakIterator.getSentenceInstance();
sentenceIt.setText(contentText);
int numSentences = 0;
while (sentenceIt.next() != BreakIterator.DONE) {
numSentences++;
}
String trimmed = StringUtils.trim(contentText);
boolean endsWithPunct = (StringUtils.endsWith(trimmed, ".")
|| StringUtils.endsWith(trimmed, "?")
|| StringUtils.endsWith(trimmed, "!") || StringUtils.endsWith(
trimmed, ")"));
String[] words = StringUtils.split(trimmed);
int numWords = words != null ? words.length : 0;
// if greater than a threshold then add to found
if (numSentences >= sentenceThreshold && endsWithPunct
&& numWords >= minWords) {
found.add(childElem);
}
}
else {
boolean foundATitle = findTitledParagraphBlocks(++level,
titleFound, child, found);
if (foundATitle) {
titleFound = true;
}
}
}
}
}
}
return titleFound;
}
private void findParagraphBlocks(int level, Content node, List<Content> found) {
// don't go on forever, spider traps can kill JVM through stack overflow
if (level == maxRecurseDepth) {
return;
}
if (node instanceof Element) {
Element elem = (Element)node;
String name = StringUtils.lowerCase(elem.getName());
// ignore certain containers
if (ignoreContainers.contains(name)) {
return;
}
else if (textContainers.contains(name)) {
// get the full text of the node tree, break into sentences and count
// the number of sentences
String contentText = XMLUtils.toText((Element)node);
BreakIterator sentenceIt = BreakIterator.getSentenceInstance();
sentenceIt.setText(contentText);
int numSentences = 0;
while (sentenceIt.next() != BreakIterator.DONE) {
numSentences++;
}
// if greater than a theshold then add to found
if (numSentences >= sentenceThreshold) {
found.add(node);
}
}
else {
// not a text container and not ignore, recurse to find text container
List<Content> children = elem.getContent();
if (children != null && children.size() > 0) {
for (Content child : children) {
findParagraphBlocks(++level, child, found);
}
}
}
}
}
@Override
public boolean processContent(Document doc) {
List<Content> found = new ArrayList<Content>();
Element rootElem = doc.getRootElement();
// search for all paragraphs or only titled paragraphs
List<Content> contents = rootElem.getContent();
if (onlyTitled) {
for (Content child : contents) {
findTitledParagraphBlocks(0, false, child, found);
}
}
else {
for (Content child : contents) {
findParagraphBlocks(0, child, found);
}
}
int numFound = found != null ? found.size() : 0;
if (numFound > 0) {
for (int i = 0; i < found.size()
&& (numParagraphs == 0 || i < numParagraphs); i++) {
Content content = found.get(i);
// add the content html
if (extractHtml) {
String contentHtml = XMLUtils.toHtml((Element)content);
if (StringUtils.isNotBlank(contentHtml)) {
addExtractedValue("html", contentHtml);
}
}
// add the content text
if (extractText) {
String contentText = XMLUtils.toText((Element)content);
if (StringUtils.isNotBlank(contentText)) {
addExtractedValue("text", contentText);
}
}
}
}
return true;
}
public boolean isOnlyTitled() {
return onlyTitled;
}
public void setOnlyTitled(boolean onlyTitled) {
this.onlyTitled = onlyTitled;
}
public int getSentenceThreshold() {
return sentenceThreshold;
}
public void setSentenceThreshold(int sentenceThreshold) {
this.sentenceThreshold = sentenceThreshold;
}
public int getNumParagraphs() {
return numParagraphs;
}
public void setNumParagraphs(int numParagraphs) {
this.numParagraphs = numParagraphs;
}
public int getMaxRecurseDepth() {
return maxRecurseDepth;
}
public void setMaxRecurseDepth(int maxRecurseDepth) {
this.maxRecurseDepth = maxRecurseDepth;
}
public boolean isExtractHtml() {
return extractHtml;
}
public void setExtractHtml(boolean extractHtml) {
this.extractHtml = extractHtml;
}
public boolean isExtractText() {
return extractText;
}
public void setExtractText(boolean extractText) {
this.extractText = extractText;
}
public int getMinWords() {
return minWords;
}
public void setMinWords(int minWords) {
this.minWords = minWords;
}
}