package ecologylab.bigsemantics.documentparsers;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import ecologylab.bigsemantics.actions.SemanticActionHandler;
import ecologylab.bigsemantics.collecting.SemanticsGlobalScope;
import ecologylab.bigsemantics.html.AElement;
import ecologylab.bigsemantics.html.DOMParserInterface;
import ecologylab.bigsemantics.html.DOMWalkInformationTagger;
import ecologylab.bigsemantics.html.ImgElement;
import ecologylab.bigsemantics.html.ParagraphText;
import ecologylab.bigsemantics.html.documentstructure.AnchorContext;
import ecologylab.bigsemantics.html.documentstructure.ContentPage;
import ecologylab.bigsemantics.html.documentstructure.ImageCollectionPage;
import ecologylab.bigsemantics.html.documentstructure.ImageFeatures;
import ecologylab.bigsemantics.html.documentstructure.IndexPage;
import ecologylab.bigsemantics.html.documentstructure.RecognizedDocumentStructure;
import ecologylab.bigsemantics.html.documentstructure.TextOnlyPage;
import ecologylab.bigsemantics.html.utils.HTMLNames;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.bigsemantics.metadata.builtins.RichDocument;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metametadata.MetaMetadata;
import ecologylab.bigsemantics.metametadata.MetaMetadataCompositeField;
import ecologylab.generic.StringTools;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.XMLTools;
/**
* Parse HTML, create DOM, and author Image and Text surrogates from DOM
*
* @author eunyee
*
*/
public class HTMLDOMImageTextParser
extends ParserBase<RichDocument>
implements DOMParserInterface, HTMLNames
{
/**
 * Cache from an anchor's parent node to the anchor-context string already computed for that
 * parent. Created lazily in constructAnchorContext(); cleared and set to null in recycle().
 */
HashMap<Node, String> tdNodeAnchorContextStringCache;
/**
 * Tagger that recycle() releases when it is non-null.
 * NOTE(review): no assignment to this field is visible in this class -- confirm where it is set.
 */
DOMWalkInformationTagger taggedDoc;
/**
 * Populates the supplied metadata object by delegating to recursiveExtraction() with the
 * given DOM and the handler's semantic action variable map.
 *
 * @param document     metadata object to fill in; it is also the return value
 * @param metaMetadata composite field handed to the extraction
 * @param DOM          DOM tree handed to the extraction
 * @param handler      source of the semantic action variable map
 * @return the same {@code document} instance that was passed in
 */
@Override
public Document populateMetadata(
    Document document,
    MetaMetadataCompositeField metaMetadata,
    org.w3c.dom.Document DOM,
    SemanticActionHandler handler)
{
  recursiveExtraction(
      metaMetadata,
      document,
      DOM,
      null,
      handler.getSemanticActionVariableMap());

  return document;
}
/**
 * Walks the document's DOM, generates image/text surrogates and candidate containers from
 * the anchors that were found, and finally calls super.parse() when the meta-metadata
 * declares semantic actions or has children.
 *
 * Anchors are taken from the recognized content body when there is one, otherwise from the
 * whole document.
 *
 * Both taggers created here are recycled in a finally block, so they are released even when
 * one of the intermediate steps throws.
 *
 * @throws IOException propagated from the steps invoked here (e.g. super.parse())
 */
@Override
public void parse() throws IOException
{
  // Deliberately NOT named taggedDoc: a local of that name would shadow the field.
  DOMWalkInformationTagger documentTagger = null;
  DOMWalkInformationTagger contentTagger = null;
  try
  {
    documentTagger = new DOMWalkInformationTagger(getDocumentClosure().getDocument().getLocation(), this);
    // this call actually traverses the DOM tree
    org.w3c.dom.Document dom = getDom();
    documentTagger.generateCollections(dom);

    Node contentBody = getContentBody(documentTagger);
    // null when no content body was recognized
    contentTagger = walkAndTagDom(contentBody, this);
    extractImageTextSurrogates(documentTagger, contentBody);

    // This document's purl
    ParsedURL purl = purl();

    // Records whether the anchor contexts were produced from the content body
    // or from the entire document.
    boolean fromContentBody = contentTagger != null;
    ArrayList<AnchorContext> anchorContexts;
    if (fromContentBody)
      anchorContexts = buildAnchorContexts(contentTagger.getAllAnchorNodes(), purl, true);
    else
      anchorContexts = buildAnchorContexts(documentTagger.getAllAnchorNodes(), purl, false);

    generateCandidateContainersFromContexts(anchorContexts, fromContentBody);
    anchorContexts.clear();
  }
  finally
  {
    // Same release order as before (document tagger first), now also on the failure path.
    if (documentTagger != null)
      documentTagger.recycle();
    if (contentTagger != null)
      contentTagger.recycle();
  }

  MetaMetadata metaMetadata = (MetaMetadata) getMetaMetadata();
  if (metaMetadata.getSemanticActions() != null || metaMetadata.hasChildren())
  {
    super.parse();
  }
}
/**
 * Creates a tagger for this document's location and lets it generate its collections
 * starting from the given content body node.
 *
 * @param contentBody root node to walk; may be null when the page is not a content page
 * @param htmlType    parser interface handed to the new tagger
 * @return the tagger built from {@code contentBody}, or null when {@code contentBody} is null
 */
public DOMWalkInformationTagger walkAndTagDom(Node contentBody, DOMParserInterface htmlType)
{
  DOMWalkInformationTagger contentTagger = null;

  // Without a content body there is nothing to walk.
  if (contentBody != null)
  {
    ParsedURL location = getDocumentClosure().getDocument().getLocation();
    contentTagger = new DOMWalkInformationTagger(location, htmlType);
    // Gathers the collections for the subtree rooted at the content body.
    contentTagger.generateCollectionsFromRoot(contentBody);
  }

  return contentTagger;
}
/**
 * Generates image and text surrogates from an already tagged document by handing all of its
 * image nodes to the document-structure recognition step.
 *
 * Historically this was called pprint() in JTidy.
 *
 * @param taggedDoc   tagger holding the collections gathered from the DOM walk
 * @param contentBody recognized content body node; passed through unchanged
 */
public void extractImageTextSurrogates(DOMWalkInformationTagger taggedDoc, Node contentBody)
{
  recognizeDocumentStructureToGenerateSurrogate(
      taggedDoc,
      contentBody,
      taggedDoc.getAllImgNodes());
}
/**
 * Asks RecognizedDocumentStructure to recognize the content body of the tagged document.
 *
 * @param taggedDoc tagger whose collections are inspected
 * @return whatever recognizeContentBody() returns for {@code taggedDoc}
 */
private Node getContentBody(DOMWalkInformationTagger taggedDoc)
{
  return RecognizedDocumentStructure.recognizeContentBody(taggedDoc);
}
/**
 * Classifies the page and generates surrogates according to the recognized category.
 *
 * Classification rules, in order:
 * - a content body that has a parent node: ContentPage;
 * - otherwise, at least one image and fewer than 200 text units per image (integer
 *   division of total text length by image count): ImageCollectionPage;
 * - otherwise, at least one image: IndexPage;
 * - otherwise no category yet.
 *
 * If no clippings have been extracted afterwards and paragraph texts exist, the page is
 * re-categorized as TextOnlyPage and surrogates are generated from text alone. The class of
 * the final category, if any, is recorded via setRecognizedDocumentStructure().
 *
 * @param domWalkInfoTagger tagger providing total text length and paragraph texts
 * @param contentBody       recognized content body, or null
 * @param imgNodes          image elements found in the document
 */
private void recognizeDocumentStructureToGenerateSurrogate(DOMWalkInformationTagger domWalkInfoTagger,
    Node contentBody, ArrayList<ImgElement> imgNodes)
{
  RecognizedDocumentStructure pageCategory = null;

  final boolean hasAttachedContentBody = (contentBody != null) && (contentBody.getParentNode() != null);
  if (hasAttachedContentBody)
  {
    pageCategory = new ContentPage(purl());
  }
  else if (!imgNodes.isEmpty())
  {
    final int imageCount = imgNodes.size();
    final boolean littleTextPerImage = (domWalkInfoTagger.getTotalTxtLength() / imageCount) < 200;
    if (littleTextPerImage)
    {
      // High probability to be an image-collection page
      pageCategory = new ImageCollectionPage(purl());
    }
    else
    {
      // Index pages (includes index-content pages)
      //FIXME -- should also look at text only pages & especially use link ratio as a feature!!!!
      pageCategory = new IndexPage(purl());
    }
  }

  final TreeMap<Integer, ParagraphText> paragraphTexts = domWalkInfoTagger.getParagraphTextsTMap();
  if (pageCategory != null)
  {
    pageCategory.generateSurrogates(contentBody, imgNodes, domWalkInfoTagger.getTotalTxtLength(), paragraphTexts, this);
  }

  // No informative images in this document: form the surrogate from text only. Whether the
  // images are informative is not known until all of them have been looked at, so this
  // check runs after the category-specific generation above.
  if ((numExtractedClippings() == 0) && !paragraphTexts.isEmpty())
  {
    pageCategory = new TextOnlyPage(purl());
    pageCategory.generateSurrogates(contentBody, imgNodes, domWalkInfoTagger.getTotalTxtLength(), paragraphTexts, this);
  }

  if (pageCategory == null)
    return;
  setRecognizedDocumentStructure(pageCategory.getClass());
}
/**
 * Converts a list of AElements (HTML a) into a list of AnchorContexts.
 * An element contributes no entry when constructAnchorContext() returns null for it.
 *
 * @param anchorElements  anchor elements to convert
 * @param sourcePurl      the purl from which these elements were extracted
 * @param fromContentBody flag passed through to each created AnchorContext
 *
 * @return a new list holding the non-null contexts, in the order of {@code anchorElements}
 */
public ArrayList<AnchorContext> buildAnchorContexts(ArrayList<AElement> anchorElements, ParsedURL sourcePurl, boolean fromContentBody )
{
  final ArrayList<AnchorContext> contexts = new ArrayList<AnchorContext>();

  for (AElement anchor : anchorElements)
  {
    final AnchorContext context = constructAnchorContext(anchor, sourcePurl, fromContentBody);
    if (context == null)
      continue; // nothing usable for this anchor

    contexts.add(context);
  }

  return contexts;
}
/**
 * Given the a element from the HTML, get its anchor text (text between open and close a tags)
 * and its anchor context (surrounding text). If either of these is not null, return an
 * AnchorContext object.
 *
 * The surrounding text is the text collected (non-recursively) from the a element's parent
 * node; the resulting string is cached per parent node so that sibling anchors reuse it.
 * When the anchor node has no parent, no surrounding text is looked up and the result is
 * built from the anchor text alone.
 * This definition should perhaps be expanded, for example, by trying grandparent if parent
 * is either null or the same as anchor text.
 *
 * @param aElement        Anchor HTMLElement (a href=...)
 * @param sourcePurl      purl of the document the anchor was found in
 * @param fromContentBody flag stored in the created AnchorContext
 *
 * @return AnchorContext object, or null when the element has no href or when both the
 *         anchor text and the anchor context are null.
 */
public AnchorContext constructAnchorContext(AElement aElement, ParsedURL sourcePurl, boolean fromContentBody)
{
  Node anchorNode = aElement.getNode();
  ParsedURL href = aElement.getHref();
  if (href == null)
    return null;

  //FIXME -- this routine drops all sorts of significant stuff because it does not concatenate across tags.
  String anchorContextString = null;
  Node parent = anchorNode.getParentNode();
  // A detached anchor has no parent; collecting text from null would throw.
  if (parent != null)
  {
    if (tdNodeAnchorContextStringCache == null)
      tdNodeAnchorContextStringCache = new HashMap<Node, String>();

    anchorContextString = tdNodeAnchorContextStringCache.get(parent);
    if (anchorContextString == null)
    {
      StringBuilder anchorContext = getTextInSubTree(parent, false);
      if (anchorContext != null)
      {
        anchorContextString = StringTools.unescapeAndLowerCaseStringBuilder(anchorContext);
        StringBuilderUtils.release(anchorContext);
        tdNodeAnchorContextStringCache.put(parent, anchorContextString);
      }
    }
  }

  //TODO: provide ability to specify alternate anchorContext
  StringBuilder anchorText = getTextInSubTree(anchorNode, true);
  if ((anchorContextString == null) && (anchorText == null))
    return null;

  String anchorTextString = null;
  if (anchorText != null)
  {
    anchorTextString = StringTools.unescapeAndLowerCaseStringBuilder(anchorText);
    StringBuilderUtils.release(anchorText);
  }
  return new AnchorContext(href, anchorTextString, anchorContextString, sourcePurl, fromContentBody, false);
}
/**
 * Convenience overload of getTextinSubTree() that starts without an existing buffer.
 *
 * @param node    node whose children are scanned for text
 * @param recurse passed through to getTextinSubTree()
 * @return the result of getTextinSubTree() called with a null buffer
 */
public static StringBuilder getTextInSubTree(Node node, boolean recurse)
{
  final StringBuilder noExistingBuffer = null;
  return getTextinSubTree(node, recurse, noExistingBuffer);
}
/**
 * Collects text from the children of <code>node</code> into <code>result</code>.
 *
 * For each child:
 * - when <code>recurse</code> is true and the child is a named, non-text node other than
 *   "script", the method descends into it;
 * - a text node has its content appended, separated from earlier text by a single space;
 * - an "img" child that was not descended into (i.e. when <code>recurse</code> is false)
 *   contributes its alt attribute, unless ImageFeatures.altIsBogus() rejects it.
 *
 * With <code>recurse</code> false only direct children are considered, so text inside nested
 * elements (for example a nested anchor) is not included.
 *
 * Text nodes are excluded from the descent explicitly: in a W3C DOM their node name is
 * "#text" rather than null, so a name check alone would descend into the (childless) text
 * node and its text would never be collected.
 *
 * @param node    node whose children are scanned; must not be null
 * @param recurse whether to descend into child nodes
 * @param result  buffer to append to, or null to have one obtained on demand
 * @return the buffer holding the collected text (XML-unescaped), or null when
 *         <code>result</code> was null and nothing was collected
 */
public static StringBuilder getTextinSubTree(Node node, boolean recurse, StringBuilder result)
{
  NodeList children = node.getChildNodes();
  for (int i=0; i<children.getLength(); i++)
  {
    Node childNode = children.item(i);
    String childName = childNode.getNodeName();
    boolean isText = childNode.getNodeType() == Node.TEXT_NODE;

    if (recurse && !isText && (childName != null) && (!childName.equals("script")))
    {
      // Recursive call with the childNode
      result = getTextinSubTree(childNode, true, result);
    }
    else if (isText)
    {
      int length = 0;
      if (result != null)
      {
        result.append(' '); // append space to separate text chunks
        length = result.length();
      }
      result = StringBuilderUtils.trimAndDecodeUTF8(result, childNode, 0, true);
      if ((result != null) && (length == result.length()) && (length > 0))
        result.setLength(length - 1); // take the space off if nothing was appended
    }
    else if ("img".equals(childName))
    {
      Node altAtt = childNode.getAttributes().getNamedItem(ALT);
      String alt = (altAtt != null) ? altAtt.getNodeValue() : null;
      if (!ImageFeatures.altIsBogus(alt))
      {
        if (result == null)
          result = StringBuilderUtils.acquire();
        else
          result.append(' ');
        result.append(alt);
      }
    }
  }
  if (result != null)
    XMLTools.unescapeXML(result);
  return result;
}
/**
 * Releases the state held by this parser: empties and drops the anchor-context cache,
 * recycles and drops the tagger field when set, then delegates to super.recycle().
 */
@Override
public synchronized void recycle()
{
  final HashMap<Node, String> contextCache = this.tdNodeAnchorContextStringCache;
  if (contextCache != null)
  {
    contextCache.clear();
    this.tdNodeAnchorContextStringCache = null;
  }

  final DOMWalkInformationTagger tagger = this.taggedDoc;
  if (tagger != null)
  {
    tagger.recycle();
    this.taggedDoc = null;
  }

  super.recycle();
}
}