package ecologylab.bigsemantics.html.documentstructure; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.Set; import java.util.TreeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import ecologylab.bigsemantics.documentparsers.HTMLDOMParser; import ecologylab.bigsemantics.html.DOMParserInterface; import ecologylab.bigsemantics.html.DOMWalkInformationTagger; import ecologylab.bigsemantics.html.HTMLElementDOM; import ecologylab.bigsemantics.html.ImgElement; import ecologylab.bigsemantics.html.ParagraphText; import ecologylab.bigsemantics.html.utils.HTMLNames; import ecologylab.bigsemantics.html.utils.StringBuilderUtils; import ecologylab.bigsemantics.model.text.TermVector; import ecologylab.generic.Debug; import ecologylab.generic.IntSlot; import ecologylab.generic.StringTools; import ecologylab.net.ParsedURL; import ecologylab.serialization.XMLTools; /** * We recognize web pages to index page, index-content page, content page, image-collection page, and low-quality page * to work in forming surrogates. * * The image-collection page has lots of images with very few text. Those images may link to the Image URL or not link to anything else, * so we are looking at the page mime type. * * In the index-content page, images are formed as surrogates if it is linked to the document, and there is informative text nearby * that image. * * @author eunyee * @author andruid */ public class RecognizedDocumentStructure extends Debug implements HTMLNames { static final int PARAGRAPH_COUNT_MINI_ARTICLE_THRESHOLD = 2; static final int PARAGRAPH_COUNT_ARTICLE_THRESHOLD = 5; static final int CHAR_COUNT_ARTICLE_THRESHOLD = 300; ParsedURL purl; public RecognizedDocumentStructure(ParsedURL purl) { this.purl = purl; } /** * This is the case there is no article main, which means high probability to be an index page. * Needs to author informative image and text surrogate in the whole document itself. * * @param articleMain * @param imgNodes * @param totalTxtLeng * @param paraTexts * @param htmlType */ public void generateSurrogates(Node articleMain, ArrayList<ImgElement> imgNodes, int totalTxtLeng, TreeMap<Integer, ParagraphText> paraTexts, DOMParserInterface htmlType) { constructImgSurrogatesForOtherPages( imgNodes, totalTxtLeng, htmlType ); htmlType.setIndexPage(); } /** * Check whether there is an article part in the current document. * Returns the article part if found. * * @param taggedDoc * @return */ public static Node recognizeContentBody(DOMWalkInformationTagger taggedDoc) { Node grandParent = null; Node ggParent = null; /* * grandParent node with the count information. * */ HashMap<Node, IntSlot> grandParentChildCounts = new HashMap<Node, IntSlot>(); /* * grand-grandParent node with the count information. */ HashMap<Node, IntSlot> greatGrandParentChildCounts = new HashMap<Node, IntSlot>(); // get linearlized from TreeMap Collection<ParagraphText> paragrphTextsValues = taggedDoc.getParagraphTextsTMap().values(); for(ParagraphText pt : paragrphTextsValues) { Node parent = pt.getNode().getParentNode(); grandParent = parent.getParentNode(); ggParent = grandParent.getParentNode(); // FIXME: Refactor the below method //identify common grandParent if( grandParentChildCounts.containsKey(grandParent) ) { IntSlot numProgenySoFar = grandParentChildCounts.get(grandParent); numProgenySoFar.value++; } else grandParentChildCounts.put(grandParent, new IntSlot(1)); //identify common great grandParent if( greatGrandParentChildCounts.containsKey(ggParent) ) { IntSlot numProgenySoFar = greatGrandParentChildCounts.get(ggParent); numProgenySoFar.value++; } else greatGrandParentChildCounts.put(ggParent, new IntSlot(1)); } // Object[] paragraphTextsArray = paragrphTextsValues.toArray(); Node articleMainNode = findArticleMainNode(taggedDoc, grandParentChildCounts /*, paragraphTextsArray */); // if no common grandParent, look for common greatGrandParent. if( articleMainNode == null ) { articleMainNode = findArticleMainNode(taggedDoc, greatGrandParentChildCounts /*, paragraphTextsArray */); } return articleMainNode; } /** * identify article sub-tree by locating common ancestor. * (Eunyee's dissertation algorithm) * * @param taggedDoc * @param ancestorChildCounts * @param paragraphTextsArray * @return */ private static Node findArticleMainNode(DOMWalkInformationTagger taggedDoc, HashMap<Node, IntSlot> ancestorChildCounts /* , Object[] paragraphTextsArray */) { Node articleMainNode = null; Set<Node> grandParents = ancestorChildCounts.keySet(); for (Node grandParentNode : grandParents) { IntSlot tint = ancestorChildCounts.get(grandParentNode); // If the majority of the paragraph nodes has the common grandParent node, // we recognize the grandParent node as an articleMain node. if( tint.value >= PARAGRAPH_COUNT_ARTICLE_THRESHOLD ) { articleMainNode = grandParentNode; break; } /* else if( tint.value >= PARAGRAPH_COUNT_MINI_ARTICLE_THRESHOLD ) { int size = pprint.paragraphTexts.size(); ParagraphText pt1 = (ParagraphText) paragraphTextsArray[size-1]; ParagraphText pt2 = (ParagraphText) paragraphTextsArray[size-2]; if( (pt1.ptext.length() > CHAR_COUNT_ARTICLE_THRESHOLD) && (pt2.ptext.length() > CHAR_COUNT_ARTICLE_THRESHOLD)) { articleMainNode = grandParentNode; break; } //else if(pt1.ptext.length()+pt2.ptext.length()>500) //{ // articleMainNode = grandParentNode; //} } */ } return articleMainNode; } /** * 1) Image should be not too small (small images are usually copyrights or icons..) * 2) Image with a link does not tend to be an article image. * It may be informative-image, but it is not an article-related image. * 3) Image ratio can sometimes catch uninformative images. * 4) Textual Features: terms in URL, Alt texts, descriptions, and nearest texts in DOM. * */ //FIXME -- andruid: exactly what sorted order are the paraTexts in? why? protected void associateImageTextSurrogates(DOMParserInterface htmlType, Node articleBody, TreeMap<Integer, ParagraphText> paraTexts) { for (ImgElement imgElement: imgNodesInContentBody) { if (imgElement.isInformativeImage()) { final Node imageNodeNode = imgElement.getNode(); informImgNodes.add(imageNodeNode); StringBuilder extractedCaption = getConnectedText(imageNodeNode, null); // returns null in worst case TermVector captionTV = null; if (extractedCaption != null) { XMLTools.unescapeXML(extractedCaption); captionTV = new TermVector(extractedCaption); } String altText = imgElement.getNonBogusAlt(); TermVector altTextTV = (altText == null) ? null : new TermVector(altText); boolean done = false; // use this instead of break to make sure we get to pt.recycle() while (!done && (paraTexts.size() > 0)) { ParagraphText pt = paraTexts.remove(paraTexts.lastKey()); // get longest remaining paragraph Node textNode = pt.getNode(); if (textNode.getParentNode().getParentNode().equals(articleBody) || textNode.getParentNode().getParentNode().getParentNode().equals(articleBody) ) { // if (pt.hasText()) // should be no longer necessary -- andruid 8/09 // { pt.unescapeXML(); boolean setAltToCaption = false; if ((captionTV != null) || (altText!=null)) { TermVector ptTV = pt.termVector(); // this is a candidate text context double captionDotTextContext = 0; if (captionTV != null) { // imageNode.setAttribute(EXTRACTED_CAPTION, StringTools.toString(extractedCaption)); captionDotTextContext = captionTV.dot(ptTV); } double altDotTextContext = 0; if (altText!=null) { altDotTextContext = altTextTV.dot(ptTV); } // check for common sharp terms between associateText and captionText if ((captionDotTextContext > 0) || (altDotTextContext > 0)) { pt.setImgElementTextContext(imgElement); if (captionDotTextContext > altDotTextContext) { imgElement.setAlt(StringTools.toString(extractedCaption)); imgElement.setExtractedCaption(pt.getBuffy().toString()); setAltToCaption = true; } done = true; } ptTV.clear(); } else { // no alt attribute or extracted caption, so use the first (longest) text context // FIXME -- should we try dot product with title?! pt.setImgElementTextContext(imgElement); done = true; } if (!setAltToCaption && (extractedCaption != null)) imgElement.setExtractedCaption(StringTools.toString(extractedCaption)); // } // if pt.hasText } // if grandParent or greatGrandParent is articleBody pt.recycle(); } // end while (!done && (paraTexts.size() > 0)) if (extractedCaption != null) { StringBuilderUtils.release(extractedCaption); captionTV.clear(); } if (altTextTV != null) altTextTV.clear(); ParsedURL anchorPurl = findAnchorPURL(imgElement); htmlType.constructImageClipping(imgElement, anchorPurl); } // if isInformImage } // for each imageNode in content body } /** * Recognize the image surrogate for the other page based on the link to the other document (checking mime type for the page) * and nearby text whether the text is informative and can be associated with the image for the image+text surrogate. * */ protected void constructImgSurrogatesForOtherPages(ArrayList<ImgElement> imgNodes, int totalTxtLeng, DOMParserInterface htmlType) { for (ImgElement imgElement : imgNodes) { if (HTMLDOMParser.isAd(imgElement.getSrc())) continue; Node imgNodeNode = imgElement.getNode(); //TODO -- can make this search for text context more comprehensive, while making sure to stay out of content body StringBuilder extractedContext = getLongestTxtinSubTree(imgNodeNode.getParentNode().getParentNode(), null); String alt = imgElement.getAlt(); // this if condition checks whether the nearest text to the image is substantial enough to form a surrogate. // TODO needs to check parent Href and Text informativity if (extractedContext != null || alt != null) { boolean useContext = extractedContext != null && (extractedContext.length()>10) && (!StringTools.contains(extractedContext, "advertis")); if ((alt != null && alt.length() > 10) || useContext) { ParsedURL anchorPurl = findAnchorPURL(imgElement); // Check whether the anchor mimetype is not an image. if( (anchorPurl!=null) && !anchorPurl.isImg() ) { // TODO!! ask whether we should add this to the associateText or not. //FIXME! -- push caption text through as StringBuilder! if (useContext) imgElement.setTextContext(extractedContext); htmlType.constructAnchorImageClipping(imgElement, anchorPurl); // htmlType.removeTheContainerFromCandidates(anchorPurl); } } StringBuilderUtils.release(extractedContext); } } } /** * Check whether the image node has the anchor url or not, if so return it as ParsedURL. * @param ina * * @return */ protected ParsedURL findAnchorPURL(HTMLElementDOM ina) { Node aNode = ina.getNode().getParentNode(); ParsedURL result= null; Node aHref = null; if ("a".equals(aNode.getNodeName())) aHref = aNode.getAttributes().getNamedItem("href"); else { aNode = aNode.getParentNode(); aHref = aNode.getAttributes().getNamedItem("href"); } if (aHref != null) { String hrefValue = aHref.getNodeValue(); hrefValue = XMLTools.unescapeXML(hrefValue); result = purl.createFromHTML(hrefValue); } return result; } /** * All the article images that determined informative. */ private ArrayList<Node> informImgNodes = new ArrayList<Node>(); /** * All the image nodes under the sub-tree of the ArticleMain node. */ protected ArrayList<ImgElement> imgNodesInContentBody = new ArrayList<ImgElement>(); /** * Finding image nodes under the content body. * * @param contentBody * @param imgNodes TODO */ public void findImgsInContentBodySubTree(Node contentBody, ArrayList<ImgElement> imgNodes) { StringBuilder buffy = StringBuilderUtils.acquire(); xpath(buffy, contentBody); String contentBodyXpath = buffy.toString(); StringBuilderUtils.release(buffy); int i = imgNodes.size(); while (--i >= 0) { ImgElement imgNode = imgNodes.get(i); String imgXpath = imgNode.xpath(); if (imgXpath.startsWith(contentBodyXpath)) { imgNodes.remove(i); imgNodesInContentBody.add(imgNode); } } } public void xpath(StringBuilder buffy, Node node) { if (node.getParentNode() != null && node.getParentNode().getNodeName() != null) xpath(buffy, node.getParentNode()); thisNodeXPath(buffy, node); } public void thisNodeXPath(StringBuilder buffy, Node node) { buffy.append('/').append(node.getNodeName()); int count = 1; Node prev = node.getPreviousSibling(); while (prev != null) { if (node.getNodeName().equals(prev.getNodeName())) count++; prev = prev.getPreviousSibling(); } if (count > 1) buffy.append('[').append(count).append(']'); } /** * Common method to find a particular html node based on nodeElementString * that adds to either hrefNodesInContentBody or imgNodesInContentBody * @param contentBody * @param nodeElementString * @param nodesInContentBody */ private void htmlNodesInContentBody(Node contentBody, String nodeElementString, ArrayList<ImgElement> nodesInContentBody) { NodeList children = contentBody.getChildNodes(); for (int i=0; i<children.getLength(); i++) { Node contentNode = children.item(i); htmlNodesInContentBody(contentNode, nodeElementString, nodesInContentBody); if( contentNode.getNodeName()!=null && contentNode.getNodeName().equals(nodeElementString) ) { ImgElement ina = new ImgElement(contentNode, purl); nodesInContentBody.add(ina); } } } public static StringBuilder getConnectedText(Node node, StringBuilder textResult) { Node grandParent = node.getParentNode().getParentNode(); StringBuilder result = getLongestTxtinSubTree(grandParent, textResult); if (result == null || result.length() > 5) result = getLongestTxtinSubTree(grandParent.getParentNode(), textResult); return result; } /** * check the texts under the DOM node that is passed as a parameter. * * @param parent node of the image node is passed in to the parameter. */ public static StringBuilder getLongestTxtinSubTree(Node blockNode, StringBuilder textResult) { NodeList children = blockNode.getChildNodes(); for (int i=0; i<children.getLength(); i++) { Node childNode = children.item(i); if( (childNode.getNodeType() != Node.TEXT_NODE) && (childNode.getNodeName()!=null) && (!childNode.getNodeName().equals("script"))) { //Recursive call with the childNode textResult = getLongestTxtinSubTree(childNode, textResult); } else if (childNode.getNodeType() == Node.TEXT_NODE ) { int curLength = (textResult == null) ? 0 : textResult.length(); textResult = StringBuilderUtils.trimAndDecodeUTF8(textResult, childNode, curLength); } } return textResult; } protected boolean checkLinkIn(Node parentNode, Node currentNode) { // System.out.println("Parent Node : " + parentNode.element + " : " + currentNode ); // System.out.println("\nCurrentNode: " + parentNode.element ); Node temp = parentNode.getFirstChild(); Node prevNode = null; while( temp != null ) { /* checkLinkIn(temp, temp); if( temp.element != null ) System.out.println("NODE:" + temp.element); */ if( (prevNode!=null) && (prevNode.getNodeName()!=null) && (prevNode.getNodeName().equals("a")) ) return true; prevNode = temp; temp = temp.getFirstChild(); } return false; } /** * Initial Implementation for PhatSurrogate Implementation. * * @param articleMain */ //TODO -- get rid of this dead code /* protected void printArticleText( TdNode articleMain )//, String paraElement) { if( articleMain!=null && (articleMain.element!=null) && !articleMain.element.equals("script") ) { TdNode temp = articleMain.content(); while( temp != null ) { //System.out.println("\n\n---------- Paragraph HTML Element : " + paraElement ); printArticleText(temp); if( temp.type==TdNode.TextNode ) //&& (temp.parent().element!=null) && temp.parent().element.equals(paraElement)) { // Print Text in ArticleMain Lexer.getString(temp.textarray(), temp.start(), temp.end()-temp.start() ); } temp = temp.next(); } } } */ public ArrayList<ImgElement> getImgNodesInContentBody() { return imgNodesInContentBody; } public ArrayList<Node> getInformImgNodes() { return informImgNodes; } @Override public String toString() { return super.toString() + "[" + purl + "]"; } }