package ecologylab.bigsemantics.html.standalone; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.Reader; import java.io.StringWriter; import java.util.ArrayList; import java.util.TreeMap; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import ecologylab.bigsemantics.html.AElement; import ecologylab.bigsemantics.html.DOMParserInterface; import ecologylab.bigsemantics.html.DOMWalkInformationTagger; import ecologylab.bigsemantics.html.ImgElement; import ecologylab.bigsemantics.html.ParagraphText; import ecologylab.bigsemantics.html.documentstructure.AnchorContext; import ecologylab.bigsemantics.html.documentstructure.ContentPage; import ecologylab.bigsemantics.html.documentstructure.ImageCollectionPage; import ecologylab.bigsemantics.html.documentstructure.ImageFeatures; import ecologylab.bigsemantics.html.documentstructure.IndexPage; import ecologylab.bigsemantics.html.documentstructure.RecognizedDocumentStructure; import ecologylab.bigsemantics.html.documentstructure.TextOnlyPage; import ecologylab.bigsemantics.html.dom.IDOMProvider; import ecologylab.bigsemantics.html.utils.HTMLNames; import ecologylab.bigsemantics.html.utils.StringBuilderUtils; import ecologylab.generic.StringTools; import ecologylab.net.PURLConnection; import ecologylab.net.ParsedURL; import ecologylab.serialization.XMLTools; /** * WARNING: This code is deprecated in cF and ecologylabSemantics. * It remains only for the purpose of maintaining some standalone classes, which Andruid * imagines are for algorithm verification (and thus, useful.) * <p/> * * Connect to JTidy parser to parse HTML pages for standalone algorithm measurement. * This parsing code integrates with the Image-Text Surrogate extractor code. * * @author eunyee * */ @Deprecated public class OldHTMLDOMParser implements HTMLNames, IDOMProvider { PURLConnection purlConnection; IDOMProvider provider; /** * because Tidy extends Serializable */ private static final long serialVersionUID = 1L; public OldHTMLDOMParser() { super(); } /** * Parse HTML Document, and return the root DOM node * * @param in * @param purl TODO * @param out * @param tidyInterface * @throws IOException */ public org.w3c.dom.Document parse(PURLConnection purlConnection) throws IOException { this.purlConnection = purlConnection; return provider.parseDOM(purlConnection.inputStream(), null); } /** * Extract Image and Text surrogates while walk through DOM * * @param in * @param htmlType * @throws IOException */ public void parse(PURLConnection purlConnection, DOMParserInterface htmlType) throws IOException { Document parsedDoc = parse(purlConnection); DOMWalkInformationTagger taggedDoc = walkAndTagDom(parsedDoc, htmlType); extractImageTextSurrogates(taggedDoc, htmlType); //Now, find hrefs, with their context and generate containers with metadata ArrayList<AnchorContext> anchorContexts = buildAnchorContexts(taggedDoc.getAllAnchorNodes(), purl()); if(htmlType != null) htmlType.generateCandidateContainersFromContexts(anchorContexts, false); anchorContexts.clear(); taggedDoc.recycle(); } /** * This is the walk of the dom that calls print tree, and the parser methods such as closeHref etc. * @param doc * @param htmlType * @return */ public DOMWalkInformationTagger walkAndTagDom(Node rootTdNode, DOMParserInterface htmlType) { // jtidyPrettyOutput.state = StreamIn.FSM_ASCII; // jtidyPrettyOutput.encoding = configuration.CharEncoding; DOMWalkInformationTagger domTagger = new DOMWalkInformationTagger(purlConnection.getPurl(), htmlType); StringWriter writer = new StringWriter(); // walk through the HTML document object. // gather all paragraphText and image objects in the data structure. //FIXME -- get rid of this call and object! domTagger.tagTree(rootTdNode); return domTagger; } /** * Extract Image and Text Surrogates while walk through DOM * * historically was called as pprint() in JTidy. */ public void extractImageTextSurrogates(DOMWalkInformationTagger taggedDoc, DOMParserInterface htmlType) { Node contentBody = RecognizedDocumentStructure.recognizeContentBody(taggedDoc); //System.out.println("\n\ncontentBody = " + contentBody); ArrayList<ImgElement> imgNodes = taggedDoc.getAllImgNodes(); recognizeDocumentStructureToGenerateSurrogate(htmlType, taggedDoc, contentBody, imgNodes); } /** * Recognize the page type based on whether the page has contentBody node or not, * text length in the whole page, and whether the informative images reside in the page. * * Based on the recognized page type, it generates surrogates. * * @param htmlType * @param domWalkInfoTagger * @param contentBody * @param imgNodes */ private void recognizeDocumentStructureToGenerateSurrogate(DOMParserInterface htmlType, DOMWalkInformationTagger domWalkInfoTagger, Node contentBody, ArrayList<ImgElement> imgNodes) { RecognizedDocumentStructure pageCategory = null; if( contentBody!=null ) { // Content Pages pageCategory = new ContentPage(purl()); } else { final int numImgNodes = imgNodes.size(); if( (numImgNodes>0) && ((domWalkInfoTagger.getTotalTxtLength()/numImgNodes)<200) ) { // High probability to be an image-collection page pageCategory = new ImageCollectionPage(purl()); } else if( numImgNodes!=0 ) { // Index Pages (include index-content pages) //FIXME -- should also look at text only pages & especially use link ratio as a feature!!!! pageCategory = new IndexPage(purl()); } } TreeMap<Integer, ParagraphText> paragraphTextsTMap = domWalkInfoTagger.getParagraphTextsTMap(); if (pageCategory != null) { pageCategory.generateSurrogates(contentBody, imgNodes, domWalkInfoTagger.getTotalTxtLength(), paragraphTextsTMap, htmlType); } // No Informative images are in this document. Form surrogate only with text. // We cannot tell whether the images in the pages are informative or not until downloding all, thus this is the case after we // look through all the images in the page and determine no image is worth displaying. if( (htmlType.numExtractedClippings()==0) && (paragraphTextsTMap.size()>0) ) { pageCategory = new TextOnlyPage(purl()); pageCategory.generateSurrogates(contentBody, imgNodes, domWalkInfoTagger.getTotalTxtLength(), paragraphTextsTMap, htmlType); } if (pageCategory != null) htmlType.setRecognizedDocumentStructure(pageCategory.getClass()); } /** * Transform an set of AElements (HTML a) into a set of AnchorContexts. * In some cases, an AElement may result in no entry, because the anchor text and anchor context are both empty. * @param anchorElements * * @return */ public ArrayList<AnchorContext> buildAnchorContexts(ArrayList<AElement> anchorElements, ParsedURL sourcePurl) { ArrayList<AnchorContext> anchorNodeContexts = new ArrayList<AnchorContext>(); for (AElement aElement : anchorElements) { AnchorContext aContext= constructAnchorContext(aElement, sourcePurl); if(aContext!=null) { anchorNodeContexts.add(aContext); } } return anchorNodeContexts; } /** * Given the a element from the HTML, get its anchor text (text between open and close a tags), * and its anchor context (surrounding text). If either of these is not null, then return an * AnchorContext object. * * The surrounding text is defined as all the text in the a element's parent node. * This definition should perhaps be expanded, for example, by trying grandparent if parent * is either null or the same as anchor text. * * @param aElement Anchor HTMLElement (a href=...) * * @return AnchorContext object, or null. */ public AnchorContext constructAnchorContext(AElement aElement, ParsedURL sourcePurl) { Node anchorNodeNode = aElement.getNode(); ParsedURL href = aElement.getHref(); if (href != null) { Node parent = anchorNodeNode.getParentNode(); //FIXME -- this routine drops all sorts of significant stuff because it does not concatenate across tags. StringBuilder anchorContext = getTextInSubTree(parent, false); //TODO: provide ability to specify alternate anchorContext StringBuilder anchorText = getTextInSubTree(anchorNodeNode, true); if ((anchorContext != null) || (anchorText != null)) { String anchorContextString = null; if (anchorContext != null) { XMLTools.unescapeXML(anchorContext); StringTools.toLowerCase(anchorContext); anchorContextString = StringTools.toString(anchorContext); StringBuilderUtils.release(anchorContext); } String anchorTextString = null; if (anchorText != null) { XMLTools.unescapeXML(anchorText); StringTools.toLowerCase(anchorText); anchorTextString = StringTools.toString(anchorText); StringBuilderUtils.release(anchorText); } return new AnchorContext(href, anchorTextString, anchorContextString, sourcePurl, false, false); } } return null; } public static StringBuilder getTextInSubTree(Node node, boolean recurse) { return getTextinSubTree(node, recurse, null); } /** * Non-recursive method to get the text for the <code>node</code> * Collects the text even if the node contains other nodes in between, * specifically the <code>anchor</code>. It does not however include the * text from the anchor node. * @param node * @param te * @return */ //FIXME -- why is text in anchor node not included? public static StringBuilder getTextinSubTree(Node node, boolean recurse, StringBuilder result) { NodeList children = node.getChildNodes(); for (int i=0; i<children.getLength(); i++) { Node childNode = children.item(i); if (recurse && (childNode.getNodeName()!=null) && (!childNode.getNodeName().equals("script"))) { //Recursive call with the childNode result = getTextinSubTree(childNode, true, result); } else if (childNode.getNodeType() == Node.TEXT_NODE ) { int length = 0; if (result != null) { result.append(' '); // append space to separate text chunks length = result.length(); } result = StringBuilderUtils.trimAndDecodeUTF8(result, childNode, 0, true); if ((result != null) && (length == result.length())) result.setLength(length - 1); // take the space off if nothing was appended } else if ("img".equals(childNode.getNodeName())) { Node altAtt = childNode.getAttributes().getNamedItem(ALT); String alt = (altAtt != null) ? altAtt.getNodeValue() : null; if (!ImageFeatures.altIsBogus(alt)) { if (result == null) result = StringBuilderUtils.acquire(); else result.append(' '); result.append(alt); } } } if (result != null) XMLTools.unescapeXML(result); return result; } ParsedURL purl() { return purlConnection.getPurl(); } @Override public void setQuiet(boolean b) { // TODO Auto-generated method stub } @Override public void setShowWarnings(boolean b) { // TODO Auto-generated method stub } @Override public Document parseDOM(InputStream inputStream, OutputStream out) { // TODO Auto-generated method stub return null; } @Override public String xPathTagNamesToLower(String xpath) { // TODO Auto-generated method stub return null; } @Override public Document parseDOM(Reader reader, OutputStream out) throws IOException { // TODO Auto-generated method stub return null; } }