package ecologylab.bigsemantics.documentparsers;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeMap;

import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import ecologylab.bigsemantics.actions.SemanticActionHandler;
import ecologylab.bigsemantics.collecting.SemanticsGlobalScope;
import ecologylab.bigsemantics.html.AElement;
import ecologylab.bigsemantics.html.DOMParserInterface;
import ecologylab.bigsemantics.html.DOMWalkInformationTagger;
import ecologylab.bigsemantics.html.ImgElement;
import ecologylab.bigsemantics.html.ParagraphText;
import ecologylab.bigsemantics.html.documentstructure.AnchorContext;
import ecologylab.bigsemantics.html.documentstructure.ContentPage;
import ecologylab.bigsemantics.html.documentstructure.ImageCollectionPage;
import ecologylab.bigsemantics.html.documentstructure.ImageFeatures;
import ecologylab.bigsemantics.html.documentstructure.IndexPage;
import ecologylab.bigsemantics.html.documentstructure.RecognizedDocumentStructure;
import ecologylab.bigsemantics.html.documentstructure.TextOnlyPage;
import ecologylab.bigsemantics.html.utils.HTMLNames;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.bigsemantics.metadata.builtins.RichDocument;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metametadata.MetaMetadata;
import ecologylab.bigsemantics.metametadata.MetaMetadataCompositeField;
import ecologylab.generic.StringTools;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.XMLTools;

/**
 * Parses HTML into a DOM, recognizes the page's overall document structure (content page,
 * image-collection page, index page, or text-only page), and authors image and text surrogates
 * from the DOM. Also derives {@link AnchorContext}s (anchor text + surrounding text) from the
 * page's anchors, for generating candidate containers to crawl.
 *
 * @author eunyee
 */
public class HTMLDOMImageTextParser extends ParserBase<RichDocument>
    implements DOMParserInterface, HTMLNames
{
  /**
   * Caches the flattened text of an anchor's parent node, so sibling anchors sharing the same
   * parent do not re-walk the subtree. Lazily created in constructAnchorContext(); cleared and
   * released in recycle().
   */
  HashMap<Node, String>    tdNodeAnchorContextStringCache;

  /**
   * Tagger for the whole document. Held in a field (not a local) so that recycle() can release
   * it even when parse() terminates early with an exception.
   */
  DOMWalkInformationTagger taggedDoc;

  /**
   * Extract metadata by walking the meta-metadata tree against the DOM.
   *
   * @param document     the metadata object being populated (also the return value)
   * @param metaMetadata the extraction wrapping to apply
   * @param DOM          the parsed page
   * @param handler      supplies the semantic-action variable map used during extraction
   * @return the same document instance, after extraction
   */
  @Override
  public Document populateMetadata(Document document,
                                   MetaMetadataCompositeField metaMetadata,
                                   org.w3c.dom.Document DOM,
                                   SemanticActionHandler handler)
  {
    recursiveExtraction(metaMetadata, document, DOM, null, handler.getSemanticActionVariableMap());
    return document;
  }

  /**
   * Main parse entry point: walk the DOM collecting text, images, and anchors; recognize the
   * content body (if any); generate surrogates and candidate containers; then run semantic
   * actions via super.parse() when the meta-metadata defines any.
   *
   * @throws IOException propagated from super.parse()
   */
  @Override
  public void parse() throws IOException
  {
    // Assign the FIELD (the original code declared a shadowing local, which left the field
    // permanently null and made recycle()'s cleanup of it dead code). Using the field lets
    // recycle() release the tagger even if an exception interrupts parsing.
    taggedDoc =
        new DOMWalkInformationTagger(getDocumentClosure().getDocument().getLocation(), this);

    // Traverse the DOM tree, collecting paragraph texts, image nodes, and anchor nodes.
    org.w3c.dom.Document dom = getDom();
    taggedDoc.generateCollections(dom);

    Node contentBody = getContentBody(taggedDoc);
    DOMWalkInformationTagger taggedContentNode = walkAndTagDom(contentBody, this);

    extractImageTextSurrogates(taggedDoc, contentBody);

    // Anchor contexts come from the recognized content body when one exists; otherwise from
    // the whole document. fromContentBody records which, for downstream weighting.
    ParsedURL purl = purl(); // this document's purl
    boolean fromContentBody = taggedContentNode != null;
    ArrayList<AnchorContext> anchorContexts = fromContentBody
        ? buildAnchorContexts(taggedContentNode.getAllAnchorNodes(), purl, true)
        : buildAnchorContexts(taggedDoc.getAllAnchorNodes(), purl, false);

    generateCandidateContainersFromContexts(anchorContexts, fromContentBody);
    anchorContexts.clear();

    taggedDoc.recycle();
    taggedDoc = null;
    if (fromContentBody)
      taggedContentNode.recycle();

    // Only invoke the semantic-action machinery when there is something for it to do.
    MetaMetadata metaMetadata = (MetaMetadata) getMetaMetadata();
    if (metaMetadata.getSemanticActions() != null || metaMetadata.hasChildren())
    {
      super.parse();
    }
  }

  /**
   * Walk and tag the subtree rooted at the content body, collecting paragraph texts and image
   * objects (and firing DOMParserInterface callbacks such as closeHref along the way).
   *
   * @param contentBody root of the recognized content body; may be null for non-content pages
   * @param htmlType    callback interface invoked during the walk
   * @return a tagger populated from the content body, or null when contentBody is null
   */
  public DOMWalkInformationTagger walkAndTagDom(Node contentBody, DOMParserInterface htmlType)
  {
    // Content body is null when this is not recognized as a content page.
    if (contentBody == null)
      return null;

    DOMWalkInformationTagger domTagger =
        new DOMWalkInformationTagger(getDocumentClosure().getDocument().getLocation(), htmlType);
    domTagger.generateCollectionsFromRoot(contentBody);
    return domTagger;
  }

  /**
   * Extract image and text surrogates while walking through the DOM.
   * Historically was called as pprint() in JTidy.
   *
   * @param taggedDoc   tagger holding the collections gathered from the whole document
   * @param contentBody recognized content body node, or null
   */
  public void extractImageTextSurrogates(DOMWalkInformationTagger taggedDoc, Node contentBody)
  {
    ArrayList<ImgElement> imgNodes = taggedDoc.getAllImgNodes();
    recognizeDocumentStructureToGenerateSurrogate(taggedDoc, contentBody, imgNodes);
  }

  /**
   * Recognize the content body node of the page, if there is one.
   *
   * @param taggedDoc tagger populated from the whole document
   * @return the content body node, or null when none is recognized
   */
  private Node getContentBody(DOMWalkInformationTagger taggedDoc)
  {
    return RecognizedDocumentStructure.recognizeContentBody(taggedDoc);
  }

  /**
   * Recognize the page type -- based on whether the page has a contentBody node, the total text
   * length in the page, and whether informative images reside in the page -- and generate
   * surrogates accordingly.
   *
   * @param domWalkInfoTagger tagger holding text/image collections for the page
   * @param contentBody       recognized content body node, or null
   * @param imgNodes          all img elements collected from the page
   */
  private void recognizeDocumentStructureToGenerateSurrogate(
      DOMWalkInformationTagger domWalkInfoTagger,
      Node contentBody,
      ArrayList<ImgElement> imgNodes)
  {
    RecognizedDocumentStructure pageCategory = null;

    if (contentBody != null && contentBody.getParentNode() != null)
    {
      pageCategory = new ContentPage(purl());
    }
    else
    {
      final int numImgNodes = imgNodes.size();
      if (numImgNodes > 0 && (domWalkInfoTagger.getTotalTxtLength() / numImgNodes) < 200)
      {
        // Little text per image: high probability of an image-collection page.
        pageCategory = new ImageCollectionPage(purl());
      }
      else if (numImgNodes != 0)
      {
        // Index pages (including index-content pages).
        // FIXME -- should also look at text only pages & especially use link ratio as a feature!!!!
        pageCategory = new IndexPage(purl());
      }
    }

    TreeMap<Integer, ParagraphText> paragraphTextsTMap = domWalkInfoTagger.getParagraphTextsTMap();
    if (pageCategory != null)
    {
      pageCategory.generateSurrogates(contentBody, imgNodes,
                                      domWalkInfoTagger.getTotalTxtLength(),
                                      paragraphTextsTMap, this);
    }

    // No informative images in this document: form surrogates from text only. We cannot tell
    // whether images are informative until after examining all of them, so this is the
    // fall-through case when none was worth displaying.
    if (numExtractedClippings() == 0 && paragraphTextsTMap.size() > 0)
    {
      pageCategory = new TextOnlyPage(purl());
      pageCategory.generateSurrogates(contentBody, imgNodes,
                                      domWalkInfoTagger.getTotalTxtLength(),
                                      paragraphTextsTMap, this);
    }

    if (pageCategory != null)
      setRecognizedDocumentStructure(pageCategory.getClass());
  }

  /**
   * Transform a set of AElements (HTML a) into a set of AnchorContexts. In some cases, an
   * AElement may result in no entry, because the anchor text and anchor context are both empty.
   *
   * @param anchorElements  anchors to transform
   * @param sourcePurl      the purl from which these AElements were extracted
   * @param fromContentBody whether the anchors came from the recognized content body
   * @return the non-null AnchorContexts, possibly empty
   */
  public ArrayList<AnchorContext> buildAnchorContexts(ArrayList<AElement> anchorElements,
                                                      ParsedURL sourcePurl,
                                                      boolean fromContentBody)
  {
    ArrayList<AnchorContext> anchorNodeContexts = new ArrayList<AnchorContext>();
    for (AElement aElement : anchorElements)
    {
      AnchorContext aContext = constructAnchorContext(aElement, sourcePurl, fromContentBody);
      if (aContext != null)
        anchorNodeContexts.add(aContext);
    }
    return anchorNodeContexts;
  }

  /**
   * Given the a element from the HTML, get its anchor text (text between open and close a tags),
   * and its anchor context (surrounding text). If either of these is non-null, return an
   * AnchorContext object.
   *
   * The surrounding text is defined as all the text in the a element's parent node. This
   * definition should perhaps be expanded, for example, by trying grandparent if parent is
   * either null or the same as anchor text.
   *
   * @param aElement        anchor HTMLElement (a href=...)
   * @param sourcePurl      purl of the page the anchor came from
   * @param fromContentBody whether the anchor came from the recognized content body
   * @return AnchorContext object, or null when href is null or both texts are empty
   */
  public AnchorContext constructAnchorContext(AElement aElement,
                                              ParsedURL sourcePurl,
                                              boolean fromContentBody)
  {
    Node anchorNodeNode = aElement.getNode();
    ParsedURL href = aElement.getHref();
    if (href == null)
      return null;

    // Cache parent-node -> anchor-context text, since sibling anchors share a parent.
    // FIXME -- this routine drops all sorts of significant stuff because it does not
    // concatenate across tags.
    Node parent = anchorNodeNode.getParentNode();
    if (tdNodeAnchorContextStringCache == null)
      tdNodeAnchorContextStringCache = new HashMap<Node, String>();
    String anchorContextString = tdNodeAnchorContextStringCache.get(parent);
    if (anchorContextString == null)
    {
      StringBuilder anchorContext = getTextInSubTree(parent, false);
      if (anchorContext != null)
      {
        anchorContextString = StringTools.unescapeAndLowerCaseStringBuilder(anchorContext);
        StringBuilderUtils.release(anchorContext);
        tdNodeAnchorContextStringCache.put(parent, anchorContextString);
      }
    }

    // TODO: provide ability to specify alternate anchorContext
    StringBuilder anchorText = getTextInSubTree(anchorNodeNode, true);
    if (anchorContextString == null && anchorText == null)
      return null;

    String anchorTextString = null;
    if (anchorText != null)
    {
      anchorTextString = StringTools.unescapeAndLowerCaseStringBuilder(anchorText);
      StringBuilderUtils.release(anchorText);
    }
    return new AnchorContext(href, anchorTextString, anchorContextString,
                             sourcePurl, fromContentBody, false);
  }

  /**
   * Convenience entry point for {@link #getTextinSubTree(Node, boolean, StringBuilder)} with a
   * fresh (null) accumulator.
   *
   * @param node    subtree root
   * @param recurse whether to descend into child elements
   * @return accumulated text, or null when none was found
   */
  public static StringBuilder getTextInSubTree(Node node, boolean recurse)
  {
    return getTextinSubTree(node, recurse, null);
  }

  /**
   * Collect the text under <code>node</code>. When <code>recurse</code> is false, only direct
   * text-node children (plus informative img alt text) are collected; when true, child elements
   * other than script are descended into as well.
   *
   * NOTE(fix): the original ordering tested the recursion branch first, guarded only by a
   * non-null node name. In org.w3c.dom a text node's name is the non-null "#text", so text
   * nodes were swallowed by the (fruitless) recursive branch and their text was never
   * collected; likewise img elements were recursed into instead of yielding their alt text.
   * Checks are now ordered by node type: text first, img next, element recursion last.
   *
   * @param node    subtree root
   * @param recurse whether to descend into child elements
   * @param result  accumulator; may be null, in which case one is acquired on first append
   * @return the accumulator (possibly newly acquired), or null when no text was found
   */
  public static StringBuilder getTextinSubTree(Node node, boolean recurse, StringBuilder result)
  {
    NodeList children = node.getChildNodes();
    for (int i = 0; i < children.getLength(); i++)
    {
      Node childNode = children.item(i);
      short nodeType = childNode.getNodeType();
      String nodeName = childNode.getNodeName();

      if (nodeType == Node.TEXT_NODE)
      {
        int length = 0;
        if (result != null)
        {
          result.append(' '); // append space to separate text chunks
          length = result.length();
        }
        result = StringBuilderUtils.trimAndDecodeUTF8(result, childNode, 0, true);
        if (result != null && length == result.length() && length > 0)
          result.setLength(length - 1); // take the space off if nothing was appended
      }
      else if ("img".equalsIgnoreCase(nodeName)) // ignore case: HTML DOMs may report "IMG"
      {
        // Informative alt text stands in for the image.
        Node altAtt = childNode.getAttributes().getNamedItem(ALT);
        String alt = (altAtt != null) ? altAtt.getNodeValue() : null;
        if (!ImageFeatures.altIsBogus(alt))
        {
          if (result == null)
            result = StringBuilderUtils.acquire();
          else
            result.append(' ');
          result.append(alt);
        }
      }
      else if (recurse && nodeType == Node.ELEMENT_NODE && !"script".equalsIgnoreCase(nodeName))
      {
        // Recursive call with the childNode; script contents are never collected.
        result = getTextinSubTree(childNode, true, result);
      }
    }
    if (result != null)
      XMLTools.unescapeXML(result);
    return result;
  }

  /**
   * Release the parent-text cache and the document tagger (the latter matters when parse()
   * terminated early), then delegate to the superclass.
   */
  @Override
  public synchronized void recycle()
  {
    if (this.tdNodeAnchorContextStringCache != null)
    {
      this.tdNodeAnchorContextStringCache.clear();
      this.tdNodeAnchorContextStringCache = null;
    }
    if (taggedDoc != null)
    {
      taggedDoc.recycle();
      taggedDoc = null;
    }
    super.recycle();
  }
}