package nicetext;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeVisitor;
import java.util.*;
import java.util.regex.Pattern;
/**
 * Heuristic extractor that pulls the main readable text out of a parsed HTML page.
 *
 * <p>The pipeline implemented here is:
 * <ol>
 *   <li>{@link #removeFat(Document)} strips formatting tags, scripts, styles and link-only noise;</li>
 *   <li>{@link #articleFinder(Element)} looks for an element that is explicitly marked as the article;</li>
 *   <li>{@link #flattenDOM(Element)} collects the elements that directly own text;</li>
 *   <li>{@link #calculateBlockSizeRatios(Elements)} blanks out entries that are small compared to the
 *       largest one;</li>
 *   <li>{@link #findClusters(Elements)} groups the surviving entries into clusters, and the cluster with
 *       the best text-to-link score becomes the result.</li>
 * </ol>
 *
 * @author vikasing
 */
public class NTHelper {
    /** Tag names accepted as text containers (for clustering and as parents of kept links). */
    private static final Pattern POSSIBLE_TEXT_NODES = Pattern.compile("p|div|td|h1|h2|h3|article|section|span|tmp|li|font|em");
    /** Inline formatting tags that are unwrapped (tag removed, content kept) before analysis. */
    private static final String[] UNWRAP_TAGS = {"b", "u", "i", "font", "em"};
    /** Tag names that make an element an article candidate on their own. */
    private static final Pattern ARTICLE_NODES = Pattern.compile("article|section|tmp");
    /** id/class names that make an element an article candidate (largest text wins). */
    private static final Pattern MAIN_CLASSES_IDS = Pattern.compile("article|section|post|text|blog|story|mainContent");
    /** id/class names that identify the article container outright (first match wins). */
    private static final Pattern CUSTOM_CLASSES_IDS = Pattern.compile("articleBody|article-text|story__content|article-body|artText|postContent|blogContentContainer|story-body|WNStoryBody|articl_cont|storytext|detail_content");
    /** Number of blanked (null) entries after the last accepted element that closes a cluster. */
    private static final int CL_DIST = 4;
    /** Minimum text length, measured in characters, for an element to be kept or collected. */
    private static final int WORDS_T = 5;
    /** Entries whose text is shorter than this fraction of the longest entry are blanked out. */
    private static final double DEFAULT_RATIO = 0.1;
    /** Article text shorter than this many characters triggers the whole-body fallback. */
    private static final int MIN_ARTICLE_LENGTH = 500;
    /** Links whose text is longer than this many characters are always removed. */
    private static final int MAX_LINK_TEXT_LENGTH = 40;
    /** An element is rejected when link text exceeds this fraction of (own text length + 1). */
    private static final double MAX_LINK_RATIO = 0.8;
    /** Own text that splits on '|' into more than this many parts is treated as a menu line. */
    private static final int MAX_PIPE_PARTS = 3;
    /** A link counts against a cluster when its text splits on spaces into more than this many parts. */
    private static final int MAX_SHORT_LINK_WORDS = 3;

    /**
     * Extracts the main text of a page.
     *
     * <p>The document is modified in place by the clean-up pass. The article element is tried first;
     * if none is found, or it yields fewer than {@value #MIN_ARTICLE_LENGTH} characters, the whole
     * body is used instead.
     *
     * @param document parsed page; it is mutated by this call
     * @return the extracted text, one line per selected element, or an empty string when nothing
     *         qualifies or the document has no body
     */
    public String getText(Document document) {
        removeFat(document);
        Element bodyElement = document.body();
        if (bodyElement == null) {
            // Without a body there is nothing to analyse; avoid dereferencing null below.
            return "";
        }
        Element articleE = articleFinder(bodyElement);
        String text = (articleE == null) ? "" : getText(articleE);
        if (text.length() < MIN_ARTICLE_LENGTH) {
            text = getText(bodyElement);
        }
        return text;
    }

    /**
     * Extracts the best-scoring cluster of text elements below the given root.
     *
     * @param bodyElement root of the subtree to analyse
     * @return text of every element of the winning cluster, each followed by a newline, or an empty
     *         string when no cluster scores above zero
     */
    private String getText(Element bodyElement) {
        Elements flattenElements = new Elements(flattenDOM(bodyElement));
        if (flattenElements.isEmpty()) {
            return "";
        }
        Elements elementsOfInterest = calculateBlockSizeRatios(flattenElements);
        Set<Element> bestCluster = null;
        double maxScore = 0.0;
        for (Set<Element> cluster : findClusters(elementsOfInterest)) {
            double score = scoreCluster(cluster);
            // Strict comparison: on a tie the earlier cluster is kept.
            if (maxScore < score) {
                maxScore = score;
                bestCluster = cluster;
            }
        }
        if (bestCluster == null) {
            return "";
        }
        StringBuilder niceTextBuffer = new StringBuilder();
        for (Element element : bestCluster) {
            niceTextBuffer.append(element.text()).append("\n");
        }
        return niceTextBuffer.toString();
    }

    /**
     * Scores a cluster as its total text length divided by the number of multi-word links in it
     * (or by one when there are none), so link-heavy clusters rank lower.
     */
    private double scoreCluster(Set<Element> cluster) {
        int textSize = 0;
        int wordyLinks = 0;
        for (Element elem : cluster) {
            textSize += elem.text().length();
            if (elem.tagName().equals("a")) {
                if (isWordyLink(elem)) {
                    wordyLinks++;
                }
            } else {
                // Only direct <a> children are inspected, not deeper descendants.
                for (Element child : elem.children()) {
                    if (child.tagName().equals("a") && isWordyLink(child)) {
                        wordyLinks++;
                    }
                }
            }
        }
        return textSize / (double) Math.max(wordyLinks, 1);
    }

    /** Tells whether the link text splits on single spaces into more than three parts. */
    private boolean isWordyLink(Element link) {
        return link.text().split(" ").length > MAX_SHORT_LINK_WORDS;
    }

    /**
     * Locates the element most likely to contain the article.
     *
     * <p>Order of precedence: the first element carrying {@code itemprop=articleBody}; the first
     * element whose id or one of its class names fully matches {@link #CUSTOM_CLASSES_IDS}; otherwise
     * the candidate with the longest text among elements whose tag matches {@link #ARTICLE_NODES} or
     * whose id/class names match {@link #MAIN_CLASSES_IDS}.
     *
     * @param bodyElement root of the search
     * @return the chosen element, or {@code null} when no candidate has any text
     */
    private Element articleFinder(Element bodyElement) {
        Elements marked = bodyElement.select("[itemprop=articleBody]");
        if (!marked.isEmpty()) {
            return marked.get(0);
        }
        Element best = null;
        int maxSize = 0;
        for (Element elem : bodyElement.getAllElements()) {
            // Locale.ROOT keeps the comparison independent of the JVM default locale.
            String tag = elem.tagName().toLowerCase(Locale.ROOT);
            boolean candidate = ARTICLE_NODES.matcher(tag).matches();
            if (!candidate) {
                for (String name : idAndClassNames(elem)) {
                    if (CUSTOM_CLASSES_IDS.matcher(name).matches()) {
                        return elem;
                    }
                    if (MAIN_CLASSES_IDS.matcher(name).matches()) {
                        candidate = true;
                    }
                }
            }
            if (candidate) {
                // text() walks the whole subtree, so it is only computed for real candidates.
                int textLength = elem.text().length();
                if (textLength > maxSize) {
                    maxSize = textLength;
                    best = elem;
                }
            }
        }
        return best;
    }

    /**
     * Collects the element's id (when present) and all of its class names.
     * Class names come from {@code Element.classNames()}, so any whitespace separates them.
     */
    private Set<String> idAndClassNames(Element elem) {
        // Copy so the set returned by jsoup is never modified.
        Set<String> names = new LinkedHashSet<>(elem.classNames());
        String id = elem.id();
        if (!id.isEmpty()) {
            names.add(id);
        }
        return names;
    }

    /**
     * Groups the entries into clusters of accepted elements.
     *
     * <p>An element is accepted when it is not link-dominated (see {@link #checkChildren(Element)})
     * and is either a block element or has a tag matching {@link #POSSIBLE_TEXT_NODES}. Null entries
     * are counted once a cluster has been started; the count restarts at every accepted element, and
     * reaching {@value #CL_DIST} closes the current cluster. Non-null entries that are not accepted
     * are ignored and do not affect the count.
     *
     * @param elements entries in document order; may contain nulls
     * @return the clusters in encounter order, each preserving element order
     */
    private List<Set<Element>> findClusters(Elements elements) {
        List<Set<Element>> clusters = new ArrayList<>();
        Set<Element> current = null;
        int gap = 0;
        for (Element element : elements) {
            if (element == null) {
                if (current != null) {
                    gap++;
                }
            } else if (!checkChildren(element)
                    && (element.isBlock() || POSSIBLE_TEXT_NODES.matcher(element.tagName()).matches())) {
                if (current == null) {
                    current = new LinkedHashSet<>();
                }
                current.add(element);
                gap = 0;
            }
            if (gap == CL_DIST) {
                clusters.add(current);
                current = null;
                gap = 0;
            }
        }
        // A cluster still open here never reached CL_DIST nulls (it would have been closed above);
        // this covers both a single uninterrupted cluster and the trailing one.
        if (current != null) {
            clusters.add(current);
        }
        return clusters;
    }

    /**
     * Tells whether an element is dominated by link text.
     *
     * @param element element to inspect
     * @return {@code true} when the combined text length of its direct {@code <a>} children is more
     *         than 0.8 times (own text length + 1)
     */
    private boolean checkChildren(Element element) {
        int linkTextLength = 0;
        for (Element child : element.children()) {
            if (child.tagName().equals("a")) {
                linkTextLength += child.text().length();
            }
        }
        // The +1 keeps the denominator non-zero for elements without own text.
        double ratio = linkTextLength / (double) (element.ownText().length() + 1);
        return ratio > MAX_LINK_RATIO;
    }

    /**
     * Collects, in traversal order, every element below (and including) the root that owns at least
     * {@value #WORDS_T} characters of text while its parent owns no text at all.
     *
     * @param bodyElement root of the traversal
     * @return insertion-ordered set of the collected elements
     */
    private Set<Element> flattenDOM(Element bodyElement) {
        final Set<Element> flatDOM = new LinkedHashSet<>();
        bodyElement.traverse(new NodeVisitor() {
            @Override
            public void head(Node node, int depth) {
                if (!(node instanceof Element)) {
                    return;
                }
                Element innerElement = (Element) node;
                Element parentElement = innerElement.parent();
                // A missing parent counts as "parent owns no text".
                int parentTextSize = (parentElement == null) ? 0 : parentElement.ownText().length();
                if (parentTextSize == 0 && innerElement.ownText().length() >= WORDS_T) {
                    flatDOM.add(innerElement);
                }
            }

            @Override
            public void tail(Node node, int depth) {
                // Nothing to do when leaving a node.
            }
        });
        return flatDOM;
    }

    /**
     * Blanks out entries whose text is short compared to the longest entry.
     *
     * <p>Each entry's text length is divided by the maximum text length of the list; entries with a
     * ratio below {@value #DEFAULT_RATIO} are replaced by {@code null} in the given list. Null entries
     * already present are treated as having length zero and are left as they are. When no entry has
     * any text the list is returned unchanged.
     *
     * <p>NOTE(review): this relies on {@code Elements.set(index, null)} behaving like a plain list
     * write; recent jsoup releases appear to override {@code set} and may reject null. Confirm
     * against the jsoup version this project builds with.
     *
     * @param mainElements entries to filter; modified in place
     * @return the same {@code mainElements} instance
     */
    public Elements calculateBlockSizeRatios(Elements mainElements) {
        double[] sizes = calculateSize(mainElements);
        double max = findMax(sizes);
        if (max <= 0) {
            // Every entry is empty: the ratios are undefined, so nothing is filtered.
            return mainElements;
        }
        for (int i = 0; i < sizes.length; i++) {
            if (mainElements.get(i) != null && sizes[i] / max < DEFAULT_RATIO) {
                mainElements.set(i, null);
            }
        }
        return mainElements;
    }

    /** Returns the largest value of the array, or 0 when it is empty. */
    private double findMax(double[] values) {
        double max = 0;
        for (double value : values) {
            if (value > max) {
                max = value;
            }
        }
        return max;
    }

    /** Returns the text length of each entry by position; null entries count as 0. */
    private double[] calculateSize(Elements elements) {
        double[] sizes = new double[elements.size()];
        for (int i = 0; i < sizes.length; i++) {
            Element element = elements.get(i);
            sizes[i] = (element == null) ? 0.0 : element.text().length();
        }
        return sizes;
    }

    /**
     * Cleans the document in place before analysis.
     *
     * <p>Steps: unwrap the tags in {@link #UNWRAP_TAGS}; replace every {@code <br>} in the body with
     * a newline text node; then remove from the body
     * <ul>
     *   <li>{@code script}, {@code noscript} and {@code style} elements,</li>
     *   <li>links with more than 40 characters of text, links whose parent tag is not in
     *       {@link #POSSIBLE_TEXT_NODES}, and links whose parent owns no text,</li>
     *   <li>elements with fewer than {@value #WORDS_T} characters of text,</li>
     *   <li>elements whose own text splits on {@code |} into more than three parts.</li>
     * </ul>
     * The body element itself is never removed, so the document stays usable afterwards.
     *
     * @param doc document to clean; modified in place
     */
    private void removeFat(Document doc) {
        for (String unwrapTag : UNWRAP_TAGS) {
            doc.select(unwrapTag).unwrap();
        }
        Element body = doc.body();
        if (body == null) {
            return;
        }
        for (Element lineBreak : body.getElementsByTag("br")) {
            // NOTE(review): the two-argument TextNode constructor is kept from the original code;
            // newer jsoup releases may only provide TextNode(String) - confirm before upgrading.
            lineBreak.replaceWith(new TextNode("\n", null));
        }
        for (Element element : body.getAllElements()) {
            if (element == body) {
                // getAllElements() includes the body; removing it would leave nothing to extract from.
                continue;
            }
            if (isFat(element)) {
                element.remove();
            }
        }
    }

    /** Decides whether the clean-up pass should drop the given (non-body) element. */
    private boolean isFat(Element element) {
        String tagName = element.tagName();
        if (tagName.equalsIgnoreCase("script") || tagName.equalsIgnoreCase("noscript") || tagName.equalsIgnoreCase("style")) {
            return true;
        }
        if (tagName.equalsIgnoreCase("a")) {
            if (element.text().length() > MAX_LINK_TEXT_LENGTH) {
                return true;
            }
            Element parent = element.parent();
            if (parent == null) {
                // Already detached: there is nothing to remove it from.
                return false;
            }
            return !POSSIBLE_TEXT_NODES.matcher(parent.tagName()).matches() || parent.ownText().length() == 0;
        }
        if (element.text().length() < WORDS_T) {
            return true;
        }
        // Several '|' separators in the element's own text are treated as a menu/breadcrumb line.
        return element.ownText().split("\\|").length > MAX_PIPE_PARTS;
    }
}