/*
 * Copyright 2012 Brendan McCarthy (brendan@oddsoftware.net)
 *
 * This file is part of Feedscribe.
 *
 * Feedscribe is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3
 * as published by the Free Software Foundation.
 *
 * Feedscribe is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Feedscribe.  If not, see <http://www.gnu.org/licenses/>.
 */
package net.oddsoftware.android.html;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.oddsoftware.android.feedscribe.Globals;

import org.htmlcleaner.BrowserCompactXmlSerializer;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.CleanerTransformations;
import org.htmlcleaner.ContentNode;
import org.htmlcleaner.DoctypeToken;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.HtmlNode;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.TagNodeVisitor;
import org.htmlcleaner.TagTransformation;
import org.htmlcleaner.XPatherException;

import android.util.Log;

/**
 * Extracts the main article from an HTML page and re-emits it as a small,
 * sanitised HTML document.
 *
 * <p>Typical use: optionally call the setters, call one of the
 * {@code process(...)} overloads, then call {@link #getProcessedArticle()}.
 * Candidate containers are scored by text length, comma count, link density
 * and class/id names; the best-scoring node and its qualifying siblings
 * become the article.
 */
public class Textify
{
    /** Matches a title of the form "foo news: title"; group 1 is everything after the first colon. */
    private static final Pattern TITLE_AFTER_COLON = Pattern.compile(":(.*)");

    protected String mTitle = null;
    private String mAuthor = null;
    private String mPubDate = null;

    /** Content score per node, rebuilt on every process() call. */
    protected HashMap<TagNode, Integer> mTagScores;
    /** Tag name -> attribute names kept on output; tags missing from the map are turned into div. */
    protected HashMap<String, HashSet<String> > mAllowedAttributes;
    /** Tags removed from the output entirely. */
    protected HashSet<String > mBlacklistTags;
    /** Tags renamed on output (key -> value). */
    protected HashMap<String, String> mConvertTags;

    protected Pattern mUnlikelyCandidates;
    protected Pattern mOkMaybeItsACandidates;
    protected Pattern mDivToPElements;
    protected Pattern mPositiveClassNames;
    protected Pattern mNegativeClassNames;
    protected Pattern mSentencePattern;

    protected HtmlCleaner mCleaner;
    protected TagNode mArticleRoot;
    protected boolean mProcessingEnabled;

    private boolean mStripUnlikelyCandidates = true;
    private int mBestScore;
    private boolean mForcePageWidth = false;
    private String mViewport = null;

    /**
     * Builds the heuristics (name patterns, attribute whitelist, tag blacklist)
     * and configures the HtmlCleaner instance used for parsing.
     */
    public Textify()
    {
        mUnlikelyCandidates = Pattern.compile(
                "combx|comment|community|disqus|extra|foot|header|menu|remark|rss|" +
                "shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter",
                Pattern.CASE_INSENSITIVE);

        // header is in here for espnf1 and it should be fine since the header will be boring by other means
        mOkMaybeItsACandidates = Pattern.compile(
                "and|article|body|column|main|shadow|commentary-panel|header",
                Pattern.CASE_INSENSITIVE);

        mDivToPElements = Pattern.compile(
                "a|blockquote|dl|div|img|ol|p|pre|table|ul",
                Pattern.CASE_INSENSITIVE);

        mPositiveClassNames = Pattern.compile(
                "article|body|content|entry|hentry|main|page|pagination|post|text|blog|story|stry|" +
                "datetools",
                Pattern.CASE_INSENSITIVE);

        mNegativeClassNames = Pattern.compile(
                "combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|" +
                "promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|contentheading",
                Pattern.CASE_INSENSITIVE);

        mSentencePattern = Pattern.compile("\\.( |$)");

        // Never null, so cleanArticle()/cleanConditionally() are safe before the first process() call.
        mTagScores = new HashMap<TagNode, Integer>();

        mAllowedAttributes = new HashMap<String, HashSet<String> >();

        // Tags that are kept but stripped of every attribute. They all share
        // one (empty) set instance, exactly as before.
        HashSet<String> emptySet = new HashSet<String>();
        String[] attributeFreeTags = {
            "p", "div", "br", "b", "big", "center", "code", "cite", "del", "dfn", "em",
            "font", // color, face, size
            "u", "i", "ins", "kbd", "pre", "s", "samp", "small", "strong", "sub", "sup",
            "span", "strike", "tt", "var", "abbr", "acronym", "address", "blockquote",
            "q", "wbr", "nobr", "xmp", "hr", "th", "td", "tr", "thead", "tbody", "tfoot",
            "col", "colgroup", "caption", "li", "ul", "ol", "dd", "dl", "dt", "menu", "dir"
        };
        for( int i = 0; i < attributeFreeTags.length; ++i )
        {
            mAllowedAttributes.put(attributeFreeTags[i], emptySet);
        }

        mAllowedAttributes.put("a", new HashSet<String>(Arrays.asList("href")));
        mAllowedAttributes.put("img", new HashSet<String>(
                Arrays.asList("alt", "src", "title", "width", "height", "align", "usemap")));
        mAllowedAttributes.put("bdo", new HashSet<String>(Arrays.asList("dir")));
        mAllowedAttributes.put("map", new HashSet<String>(Arrays.asList("name")));
        mAllowedAttributes.put("area", new HashSet<String>(
                Arrays.asList("shape", "coords", "href", "alt")));
        mAllowedAttributes.put("table", new HashSet<String>(
                Arrays.asList("width", "cellspacing", "cellpadding", "border", "align")));

        mConvertTags = new HashMap<String, String>();
        mConvertTags.put("blink", "span");
        mConvertTags.put("marquee", "span");

        mBlacklistTags = new HashSet<String>(Arrays.asList(
                "meta", "link", "style", "bgsound", "base", "object", "applet", "param",
                "script", "noscript", "basefont", "comment", "server", "iframe", "embed",
                "form", "input", "option", "textarea", "select", "optgroup", "button",
                "label", "fieldset", "legend", "isindex"));

        mCleaner = new HtmlCleaner();
        CleanerProperties props = mCleaner.getProperties();
        props.setOmitComments(true);
        props.setOmitDoctypeDeclaration(false);
        props.setTransResCharsToNCR(true);
        props.setPruneTags("script");
        props.setUseEmptyElementTags(false);

        CleanerTransformations transformations = new CleanerTransformations();
        transformations.addTransformation(new TagTransformation("noscript", "div"));
        mCleaner.setTransformations(transformations);

        mProcessingEnabled = true;
    }

    /**
     * @param processingEnabled when false, process() skips article extraction
     *                          and uses the document body as the article
     */
    public void setProcessingEnabled(boolean processingEnabled)
    {
        mProcessingEnabled = processingEnabled;
    }

    /**
     * @param stripUnlikelyCandidates when true, nodes whose id/class look like
     *                                page furniture are removed before scoring
     */
    public void setStripUnlikelyCandidates(boolean stripUnlikelyCandidates)
    {
        mStripUnlikelyCandidates = stripUnlikelyCandidates;
    }

    /**
     * @param viewport content of the viewport meta tag to emit, or null for none
     */
    public void setViewport(String viewport)
    {
        mViewport = viewport;
    }

    /**
     * @return the score of the best candidate found by the last process() call
     *         (0 when extraction was disabled or nothing was scored)
     */
    public int getArticleScore()
    {
        return mBestScore;
    }

    /**
     * Parses the given HTML and extracts the article from it.
     *
     * @param input HTML source text
     */
    public void process(String input)
    {
        processRoot(mCleaner.clean(input));
    }

    /**
     * Parses HTML from the given stream and extracts the article from it.
     *
     * @param input stream of HTML
     * @throws IOException if reading the stream fails
     */
    public void process(InputStream input) throws IOException
    {
        processRoot(mCleaner.clean(input));
    }

    /**
     * Shared implementation of both process() overloads: resets per-document
     * state, determines the title if none was set, and selects the article root.
     */
    private void processRoot(TagNode root)
    {
        // Reset for both entry points so a reused instance never reports a stale score.
        mBestScore = 0;
        mTagScores = new HashMap<TagNode, Integer>();

        if( mTitle == null )
        {
            mTitle = getArticleTitle(root);
        }

        TagNode articleRoot = null;
        if( mProcessingEnabled )
        {
            preprocess(root);
            articleRoot = getArticleRoot(root);
            if( articleRoot != null )
            {
                cleanArticle(articleRoot);
            }
        }

        // Extraction disabled, or it found nothing: fall back to the body (or the whole document).
        if( articleRoot == null )
        {
            articleRoot = root.findElementByName("body", false);
            if( articleRoot == null )
            {
                articleRoot = root;
            }
        }
        mArticleRoot = articleRoot;
    }

    public void setTitle(String title)
    {
        mTitle = title;
    }

    public void setAuthor(String author)
    {
        mAuthor = author;
    }

    public void setPubDate(String pubDate)
    {
        mPubDate = pubDate;
    }

    public String getTitle()
    {
        return mTitle;
    }

    /*
    private ArrayList<TagNode> findNextPageLink() { return null; }
    private String findBaseUrl(String url) { return url; }
    */

    /**
     * Computes the fraction of a node's non-whitespace text that sits inside links.
     *
     * @return 0.0 when the node contains no links, 1.0 when it has links but no
     *         text, otherwise link characters divided by total characters
     */
    private double getLinkDensity(TagNode node)
    {
        TagNode[] links = node.getElementsByName("a", true);
        if( links.length == 0 )
        {
            return 0.0;
        }

        int textLength = countChars(node.getText());
        if( textLength == 0 )
        {
            return 1.0;
        }

        int linkLength = 0;
        for( int i = 0; i < links.length; ++i )
        {
            linkLength += countChars(links[i].getText());
        }
        return linkLength / (double) textLength;
    }

    /**
     * @return the number of non-whitespace characters in the given text
     */
    private int countChars(CharSequence text)
    {
        int count = 0;
        for( int i = 0; i < text.length(); ++i )
        {
            if( !Character.isWhitespace(text.charAt(i)) )
            {
                ++count;
            }
        }
        return count;
    }

    /**
     * Works out the article title from the document's title tag, falling back
     * to the page's single h1 when the result is implausibly long or short.
     *
     * @param root root of the parsed document
     * @return the title, never null (may be empty)
     */
    private String getArticleTitle(TagNode root)
    {
        String title = "";
        String originalTitle = "";

        // find the <title> tag and process it
        try
        {
            Object[] titles = root.evaluateXPath("/head/title");
            if( titles.length > 0 && titles[0] instanceof TagNode )
            {
                title = ((TagNode) titles[0]).getText().toString();
                originalTitle = title;
            }
        }
        catch( XPatherException exc )
        {
            if( Globals.LOGGING ) Log.e(Globals.LOG_TAG, "getArticleTitle:", exc);
        }

        // some pages carry the title in a non-standard "arttitle" element
        TagNode artTitle = root.findElementByName("arttitle", true);

        // see if title is of the form "title - fred news" or "title | joe news"
        String[] parts = title.split("\\s+[|-]");

        // find the longest part
        String longest = null;
        for( int i = 0; i < parts.length; ++i )
        {
            if( longest == null || parts[i].length() > longest.length() )
            {
                longest = parts[i];
            }
        }
        // override the above in the case of only 2 parts, use the first one
        if( parts.length == 2 )
        {
            longest = parts[0];
        }

        if( artTitle != null )
        {
            title = artTitle.getText().toString();
        }
        else if( parts.length > 1 )
        {
            // a separator was really present
            title = longest;
        }
        else
        {
            // No separator: see if title is of the form "foo news: title".
            // (Previously unreachable because split() nearly always yields one part.)
            String candidate = (longest != null) ? longest : title;
            Matcher matcher = TITLE_AFTER_COLON.matcher(candidate);
            title = matcher.find() ? matcher.group(1) : candidate;
        }

        // check title length, then try and find the one and only h1
        if( title.length() > 150 || title.length() < 15 )
        {
            TagNode[] h1s = root.getElementsByName("h1", true);
            if( h1s.length == 1 )
            {
                title = h1s[0].getText().toString();
            }
        }

        title = title.trim();

        // if it has less than 4 words, use the original, unprocessed title
        if( title.split(" ").length < 4 )
        {
            title = originalTitle;
        }

        return title;
    }

    /**
     * Scores a node by its class and id attributes: -25 for each that matches
     * the negative names, +25 for each that matches the positive names.
     */
    private int getClassWeight(TagNode node)
    {
        return getNameWeight(node.getAttributeByName("class"))
             + getNameWeight(node.getAttributeByName("id"));
    }

    /**
     * @return the weight contributed by a single class or id value (0 for null)
     */
    private int getNameWeight(String value)
    {
        int weight = 0;
        if( value != null )
        {
            if( mNegativeClassNames.matcher(value).find() )
            {
                weight -= 25;
            }
            if( mPositiveClassNames.matcher(value).find() )
            {
                weight += 25;
            }
        }
        return weight;
    }

    /**
     * Adds score to a node's running total. The first time a node is scored it
     * also receives a base value from its tag name and its class weight.
     */
    private void addScore(TagNode node, int score)
    {
        Integer existing = mTagScores.get(node);
        if( existing != null )
        {
            score += existing;
        }
        else
        {
            String name = node.getName();
            if( name.equals("div") )
            {
                score += 5;
            }
            else if( name.equals("pre") || name.equals("td") || name.equals("blockquote") )
            {
                score += 3;
            }
            else if( name.equals("address") || name.equals("ol") || name.equals("ul")
                  || name.equals("dl") || name.equals("dd") || name.equals("dt")
                  || name.equals("li") || name.equals("form") )
            {
                score += -3;
            }
            else if( name.equals("th") || name.equals("h1") || name.equals("h2")
                  || name.equals("h3") || name.equals("h4") || name.equals("h5")
                  || name.equals("h6") )
            {
                score += -5;
            }
            score += getClassWeight(node);
        }
        mTagScores.put(node, score);
    }

    /**
     * @return true when any descendant of node is one of the elements listed in
     *         mDivToPElements. The whole tag name must match: the pattern is a
     *         bare alternation, so find() would also hit any tag merely
     *         containing "a" or "p" (span, small, caption, ...).
     */
    private boolean hasBlockLevelDescendant(TagNode node)
    {
        TagNode[] descendants = node.getAllElements(true);
        for( int i = 0; i < descendants.length; ++i )
        {
            if( mDivToPElements.matcher(descendants[i].getName()).matches() )
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Finds the best-scoring content container, then gathers it and its
     * qualifying siblings under a fresh div.
     *
     * @param root root of the parsed document; nodes are removed from it
     * @return a new div holding the article nodes, or null when no candidate
     *         and no body element could be found
     */
    private TagNode getArticleRoot(TagNode root)
    {
        TagNode[] allElements = root.getAllElements(true);
        ArrayList<TagNode> nodesToScore = new ArrayList<TagNode>();

        for( int i = 0; i < allElements.length; ++i )
        {
            TagNode currentNode = allElements[i];
            String tagName = currentNode.getName();

            if( mStripUnlikelyCandidates )
            {
                String unlikelyMatchString = "" + currentNode.getAttributeByName("id")
                        + currentNode.getAttributeByName("class");

                // see if we are still in the tree (an ancestor may have been removed already)
                TagNode ancestor = currentNode;
                while( ancestor != null && ancestor != root )
                {
                    ancestor = ancestor.getParent();
                }
                if( ancestor != root )
                {
                    continue;
                }

                // chuck out obviously bad nodes
                if( mUnlikelyCandidates.matcher(unlikelyMatchString).find()
                        && !mOkMaybeItsACandidates.matcher(unlikelyMatchString).find()
                        && !tagName.equals("body") )
                {
                    currentNode.removeFromTree();
                    continue;
                }
            }

            if( tagName.equals("p") || tagName.equals("td") || tagName.equals("pre") )
            {
                nodesToScore.add(currentNode);
            }

            // score the li directly if it has nothing interesting inside it
            if( tagName.equals("li") && !hasBlockLevelDescendant(currentNode) )
            {
                nodesToScore.add(currentNode);
            }

            // a div with no block-level content is really a paragraph
            if( tagName.equals("div") && !hasBlockLevelDescendant(currentNode) )
            {
                currentNode.setName("p");
                nodesToScore.add(currentNode);
            }
        }

        // once we get to here, nodesToScore contains everything we want to process for content
        StringBuilder innerText = new StringBuilder();
        for( int i = 0; i < nodesToScore.size(); ++i )
        {
            TagNode node = nodesToScore.get(i);
            TagNode parentNode = node.getParent();
            if( parentNode == null )
            {
                continue;
            }
            TagNode grandParentNode = parentNode.getParent();

            if( node.getTextLength() < 25 )
            {
                continue;
            }

            innerText.setLength(0);
            node.getText(innerText);

            /* Add a point for the paragraph itself as a base. */
            int contentScore = 1;

            /* Add points for any commas within this paragraph */
            contentScore += getCharCount(innerText, ",");

            /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
            contentScore += Math.min(innerText.length() / 100, 3);

            /* try link density - -3 if the whole thing is a link */
            contentScore += (int) (getLinkDensity(node) * -3);

            /* Add the score to the node and its parent. The grandparent gets half. */
            addScore(node, contentScore);
            addScore(parentNode, contentScore);
            if( grandParentNode != null )
            {
                addScore(grandParentNode, contentScore / 2);
            }
        }

        // lets have a look at the score
        TagNode bestNode = null;
        int bestScore = 0;
        for( Map.Entry<TagNode, Integer> entry : mTagScores.entrySet() )
        {
            TagNode node = entry.getKey();
            int score = entry.getValue();
            if( Globals.LOGGING )
            {
                Log.d(Globals.LOG_TAG, "score " + node.getName() + " " + node.getAttributeByName("class")
                        + " " + node.getAttributeByName("id") + " = " + score);
            }
            if( bestNode == null || score > bestScore )
            {
                bestScore = score;
                bestNode = node;
            }
        }
        mBestScore = bestScore;

        if( bestNode == null )
        {
            bestNode = root.findElementByName("body", true);
        }
        if( bestNode == null )
        {
            return null;
        }

        if( Globals.LOGGING ) Log.d(Globals.LOG_TAG, "The best score is " + bestScore + " " + bestNode.getText());

        // now we are going to look at the siblings of the best node to see what the output should be
        int siblingScoreThreshold = (int) Math.max(10, bestScore * 0.2);

        List<?> siblingNodes;
        TagNode bestParent = bestNode.getParent();
        if( bestParent != null )
        {
            siblingNodes = bestParent.getChildren();
        }
        else
        {
            // The best node has no parent (e.g. it is the document root): it is its own only sibling.
            ArrayList<Object> onlyBest = new ArrayList<Object>();
            onlyBest.add(bestNode);
            siblingNodes = onlyBest;
        }

        ArrayList<TagNode> outputNodes = new ArrayList<TagNode>();
        StringBuilder content = new StringBuilder();
        String bestClass = bestNode.getAttributeByName("class");

        for( Object o : siblingNodes )
        {
            if( !(o instanceof TagNode) )
            {
                continue;
            }
            TagNode sibling = (TagNode) o;
            String siblingName = sibling.getName();
            Integer siblingScore = mTagScores.get(sibling);

            // Give a bonus if the sibling and the top candidate have the exact same class
            int bonusScore = 0;
            String siblingClass = sibling.getAttributeByName("class");
            if( siblingClass != null && siblingClass.equals(bestClass) )
            {
                bonusScore += bestScore / 5;
            }

            boolean output = (sibling == bestNode);

            if( siblingScore != null && (siblingScore + bonusScore) >= siblingScoreThreshold )
            {
                output = true;
            }
            else if( siblingName.equals("p") )
            {
                double linkDensity = getLinkDensity(sibling);
                content.setLength(0);
                sibling.getText(content);

                if( content.length() > 80 && linkDensity < 0.25 )
                {
                    output = true;
                }
                else if( content.length() < 80 && linkDensity == 0
                        && mSentencePattern.matcher(content).find() )
                {
                    output = true;
                }
            }

            if( output )
            {
                if( !siblingName.equals("div") && !siblingName.equals("p") )
                {
                    // the node is not a div or p, something trickier: force it to a div
                    sibling.setName("div");
                }
                outputNodes.add(sibling);
            }
        }

        // Move the chosen nodes only after the loop, so the sibling list is not modified while iterating.
        TagNode article = new TagNode("div");
        for( int i = 0; i < outputNodes.size(); ++i )
        {
            TagNode node = outputNodes.get(i);
            node.removeFromTree();
            article.addChild(node);
        }
        return article;
    }

    /**
     * Strips junk (forms, objects, iframes, bad headers, link-heavy containers,
     * empty paragraphs) from an extracted article, in place.
     *
     * @param article the article root; must not be null
     */
    void cleanArticle(TagNode article)
    {
        // TODO - remove consecutive <br> <br> <br>

        /* Clean out junk from the article */
        cleanConditionally(article, "form");
        clean(article, "object");
        clean(article, "h1");

        /*
         * If there is only one h2, they are probably using it as a header and
         * not a subheader, so remove it since we already have a header.
         */
        if( article.getElementListByName("h2", true).size() == 1 )
        {
            clean(article, "h2");
        }
        clean(article, "iframe");

        cleanHeaders(article);

        /* Do these last as the previous stuff may have removed junk that will affect these */
        cleanConditionally(article, "table");
        cleanConditionally(article, "ul");
        cleanConditionally(article, "div");

        /* Remove extra paragraphs */
        TagNode[] articleParagraphs = article.getElementsByName("p", true);
        for( int i = 0; i < articleParagraphs.length; ++i )
        {
            TagNode p = articleParagraphs[i];
            int imgCount = p.getElementListByName("img", true).size();
            int embedCount = p.getElementListByName("embed", true).size();
            int objectCount = p.getElementListByName("object", true).size();

            if( imgCount == 0 && embedCount == 0 && objectCount == 0 && p.getTextLength() == 0 )
            {
                p.removeFromTree();
            }
        }

        // TODO - replace <br><p> with <p> ?
    }

    /**
     * Removes every descendant of root with the given tag name.
     */
    void clean(TagNode root, String tag)
    {
        // TODO - if tag is object or embed, check for youtubeage
        TagNode[] nodes = root.getElementsByName(tag, true);
        for( int i = 0; i < nodes.length; ++i )
        {
            nodes[i].removeFromTree();
        }
    }

    /**
     * Removes h1 and h2 tags if they are junk, i.e. have a negative class
     * weight or a link density above 0.33.
     *
     * @param root subtree to clean
     */
    void cleanHeaders(TagNode root)
    {
        for( int headerIndex = 1; headerIndex < 3; headerIndex++ )
        {
            TagNode[] headers = root.getElementsByName("h" + headerIndex, true);
            for( int i = 0; i < headers.length; ++i )
            {
                TagNode header = headers[i];
                if( getClassWeight(header) < 0 || getLinkDensity(header) > 0.33 )
                {
                    header.removeFromTree();
                }
            }
        }
    }

    /**
     * Removes descendants with the given tag name that look like non-content:
     * negative combined weight, or few commas together with other ominous
     * signs (too many images, inputs, links or embeds, too little text).
     *
     * @param root subtree to clean
     * @param tag  tag name to examine
     */
    void cleanConditionally(TagNode root, String tag)
    {
        TagNode[] tags = root.getElementsByName(tag, true);
        StringBuilder nodeText = new StringBuilder();

        for( int i = 0; i < tags.length; ++i )
        {
            TagNode node = tags[i];
            int weight = getClassWeight(node);

            Integer recordedScore = mTagScores.get(node);
            int contentScore = (recordedScore != null) ? recordedScore : 0;

            nodeText.setLength(0);
            node.getText(nodeText);

            if( weight + contentScore < 0 )
            {
                node.removeFromTree();
                continue;
            }

            if( getCharCount(nodeText, ",") >= 10 )
            {
                // stays, heaps of commas
                continue;
            }

            /*
             * If there are not very many commas, and the number of
             * non-paragraph elements is more than paragraphs or other ominous
             * signs, remove the element.
             */
            int p = node.getElementListByName("p", true).size();
            int img = node.getElementListByName("img", true).size();
            int li = node.getElementListByName("li", true).size() - 100;
            int input = node.getElementListByName("input", true).size();
            int embed = node.getElementListByName("embed", true).size(); // TODO - look for youtubes and keep them
            double linkDensity = getLinkDensity(node);
            int contentLength = nodeText.length();

            boolean toRemove =
                       (img > p && img > 1) // TODO - added this img > 1 check, see how bad this makes things
                    || (li > p && !tag.equals("ul") && !tag.equals("ol"))
                    || (input > p / 3)
                    || (contentLength < 25 && (img == 0 || img > 2))
                    || (weight < 25 && linkDensity > 0.2)
                    || (weight >= 25 && linkDensity > 0.5)
                    || ((embed == 1 && contentLength < 75) || embed > 1);

            // TODO - dirty hack for bbc image galleries
            boolean forceKeep = "galMain".equals(node.getAttributeByName("class"));

            if( toRemove && !forceKeep )
            {
                node.removeFromTree();
            }
        }
    }

    /**
     * @return how many times c occurs in text (occurrences may overlap)
     */
    private int getCharCount(StringBuilder text, String c)
    {
        int count = 0;
        int index = text.indexOf(c, 0);
        while( index != -1 )
        {
            ++count;
            index = text.indexOf(c, index + 1);
        }
        return count;
    }

    /**
     * Wraps the extracted article in a minimal HTML document (title, optional
     * viewport, stylesheet, h1, author, date) and sanitises it: blacklisted
     * tags are removed, unknown tags become div, and attributes outside the
     * whitelist are dropped.
     *
     * @return the serialised document, or an empty string if serialisation fails
     */
    public String getProcessedArticle()
    {
        // Tolerate being called before process(): emit an empty article rather than crash.
        final TagNode articleRoot = (mArticleRoot != null) ? mArticleRoot : new TagNode("div");
        String titleText = (mTitle != null) ? mTitle : "";

        TagNode html = new TagNode("html");
        TagNode head = new TagNode("head");
        TagNode title = new TagNode("title");
        title.addChild(new ContentNode(titleText));
        head.addChild(title);
        html.addChild(head);

        if( mViewport != null )
        {
            TagNode viewport = new TagNode("meta");
            viewport.setAttribute("name", "viewport");
            viewport.setAttribute("content", mViewport);
            head.addChild(viewport);
        }

        TagNode styleSheetNode = new TagNode("style");
        styleSheetNode.setAttribute("type", "text/css");
        styleSheetNode.addChild(new ContentNode("" +
                "DIV { font-family: sans-serif }" +
                "P { font-family: sans-serif }" +
                "H1 { text-align: center; font-family: serif }" +
                "IMG { max-width: 100% ; height: auto } "
        ));
        head.addChild(styleSheetNode);

        if( mForcePageWidth )
        {
            TagNode viewportNode = new TagNode("meta");
            viewportNode.setAttribute("name", "viewport");
            viewportNode.setAttribute("content", "width=device-width");
            head.addChild(viewportNode);
        }

        if( articleRoot.getName().equals("body") )
        {
            articleRoot.setName("div");
        }

        TagNode body = new TagNode("body");
        html.addChild(body);

        TagNode content = new TagNode("div"); // TODO - set class

        TagNode header = new TagNode("h1");
        header.addChild(new ContentNode(titleText));
        content.addChild(header);

        if( mAuthor != null && mAuthor.length() > 0 )
        {
            TagNode authorNode = new TagNode("div");
            authorNode.addChild(new ContentNode(mAuthor));
            content.addChild(authorNode);
        }

        if( mPubDate != null && mPubDate.length() > 0 )
        {
            TagNode dateNode = new TagNode("div");
            dateNode.addChild(new ContentNode(mPubDate));
            content.addChild(dateNode);
        }

        content.addChild(new TagNode("hr"));
        content.addChild(articleRoot);
        body.addChild(content);

        html.setDocType(new DoctypeToken("html", "PUBLIC", "-//W3C//DTD HTML 4.01//EN",
                "http://www.w3.org/TR/html4/strict.dtd"));

        articleRoot.traverse(new TagNodeVisitor()
        {
            public boolean visit(TagNode tagNode, HtmlNode htmlNode)
            {
                if( htmlNode instanceof TagNode )
                {
                    TagNode tag = (TagNode) htmlNode;
                    String tagName = tag.getName();

                    if( mBlacklistTags.contains(tagName) )
                    {
                        tag.removeFromTree();
                    }
                    else if( mConvertTags.containsKey(tagName) )
                    {
                        tagName = mConvertTags.get(tagName);
                        tag.setName(tagName);
                    }

                    Set<String> allowedAttributes = mAllowedAttributes.get(tagName);

                    if( allowedAttributes == null )
                    {
                        // convert unknown tags to div
                        tagName = "div";
                        tag.setName(tagName);
                        tag.getAttributes().clear();
                    }
                    else
                    {
                        // Snapshot the keys so entries can be removed while looping.
                        Map<String, String> attributesMap = tag.getAttributes();
                        String[] attributeNames = attributesMap.keySet().toArray(new String[0]);
                        for( int i = 0; i < attributeNames.length; ++i )
                        {
                            String name = attributeNames[i];
                            if( !allowedAttributes.contains(name) )
                            {
                                attributesMap.remove(name);
                            }
                        }
                    }
                }
                // tells visitor to continue traversing the DOM tree
                return true;
            }
        });

        try
        {
            return new BrowserCompactXmlSerializer(mCleaner.getProperties()).getAsString(html);
        }
        catch( IOException exc )
        {
            if( Globals.LOGGING ) Log.e(Globals.LOG_TAG, "getProcessedArticle:", exc);
            return "";
        }
    }

    /**
     * Replaces runs of {@code <br><br>} in the body with paragraphs for better
     * parsing: content following a double br is moved under a p element that
     * is made by renaming one of the br tags.
     *
     * TODO - see if there are any frames and maybe use those instead,
     * readability looks at how big the frames are on the screen
     *
     * @param root root of the parsed document; modified in place
     */
    void preprocess(TagNode root)
    {
        TagNode body = root.findElementByName("body", true);

        // Scratch state shared by all visits; cleared at the start and end of each one.
        final ArrayList<Object> contentToTransform = new ArrayList<Object>();
        final ArrayList<TagNode> brsToRemove = new ArrayList<TagNode>();

        if( body != null )
        {
            // traverse the whole body and, for every element, rewrite its direct children
            body.traverse(new TagNodeVisitor()
            {
                public boolean visit(TagNode tagNode, HtmlNode htmlNode)
                {
                    if( htmlNode instanceof TagNode )
                    {
                        TagNode tag = (TagNode) htmlNode;

                        // grab all child nodes and see if there are any duplicate <br><br> tags
                        Object[] children = tag.getChildren().toArray();

                        contentToTransform.clear();
                        brsToRemove.clear();

                        int brCount = 0;
                        boolean hasDoneTransform = false;

                        // so we are scanning for <br>(whitespace)<br>(content)
                        // The extra final iteration (child == null) marks the end of the children.
                        for( int i = 0; i < children.length + 1; ++i )
                        {
                            Object child = null;
                            if( i < children.length )
                            {
                                child = children[i];
                            }

                            boolean isBr = false;
                            boolean doTransform = false;

                            if( child instanceof TagNode )
                            {
                                TagNode childTag = (TagNode) child;
                                if( childTag.getName().equals("br") )
                                {
                                    isBr = true;
                                    brsToRemove.add(childTag);

                                    // we have 2 brs previously and then some content, now this br, time to transform
                                    if( brCount > 1 && contentToTransform.size() > 0 )
                                    {
                                        doTransform = true;
                                    }
                                    brCount += 1;
                                }
                            }

                            // if we have found a br previously, see if the next node is junk, if so we keep scanning
                            // otherwise we reset
                            if( brCount == 1 && !isBr && child != null )
                            {
                                boolean reset = true;
                                if( child instanceof ContentNode )
                                {
                                    ContentNode childContent = (ContentNode) child;
                                    String content = childContent.getContent().toString().trim();
                                    if( content.length() == 0 )
                                    {
                                        reset = false;
                                    }
                                }
                                if( reset )
                                {
                                    brCount = 0;
                                    hasDoneTransform = false;
                                    contentToTransform.clear();
                                    brsToRemove.clear();
                                }
                            }

                            if( brCount > 0 && !isBr && child != null )
                            {
                                contentToTransform.add(child);
                            }

                            // end of children: flush trailing content if we already made paragraphs here
                            if( brCount > 1 && child == null && hasDoneTransform )
                            {
                                doTransform = true;
                            }

                            if( doTransform )
                            {
                                hasDoneTransform = true;

                                TagNode newParagraph = null;
                                if( brsToRemove.size() > 0 )
                                {
                                    newParagraph = brsToRemove.get(0);
                                }
                                else // this is the last paragraph, make a new one
                                {
                                    // NOTE(review): brsToRemove looks non-empty whenever doTransform
                                    // is set, so this branch appears unreachable - confirm.
                                    newParagraph = new TagNode("p");
                                    tag.addChild(newParagraph);
                                    hasDoneTransform = false;
                                }

                                // turn silly old <br> into a shiny <p> to stick the new content under
                                newParagraph.setName("p");

                                for( int j = 0; j < contentToTransform.size(); ++j )
                                {
                                    Object transformChild = contentToTransform.get(j);
                                    if( transformChild instanceof TagNode )
                                    {
                                        ((TagNode) transformChild).removeFromTree();
                                    }
                                    tag.removeChild(transformChild);
                                    newParagraph.addChild(transformChild);
                                }

                                // drop the brs in the middle; the first became the paragraph, the last is kept
                                for( int j = 1; j < brsToRemove.size() - 1; ++j )
                                {
                                    brsToRemove.get(j).removeFromTree();
                                }

                                contentToTransform.clear();
                                TagNode savedBr = brsToRemove.get(brsToRemove.size() - 1);
                                brsToRemove.clear();
                                brsToRemove.add(savedBr);

                                // initialise in state as if we have previously seen some brs
                                // so consecutive paragraphs get transformed
                                brCount = 2;
                            }
                        }

                        contentToTransform.clear();
                        brsToRemove.clear();
                    }
                    // tells visitor to continue traversing the DOM tree
                    return true;
                }
            });
        }
    }
}