package ecologylab.bigsemantics.documentparsers;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.ArrayList;

import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpressionException;

import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import ecologylab.appframework.types.prefs.PrefBoolean;
import ecologylab.bigsemantics.collecting.SemanticsSite;
import ecologylab.bigsemantics.html.DOMParserInterface;
import ecologylab.bigsemantics.html.ImgElement;
import ecologylab.bigsemantics.html.ParagraphText;
import ecologylab.bigsemantics.html.documentstructure.AnchorContext;
import ecologylab.bigsemantics.html.documentstructure.ImageFeatures;
import ecologylab.bigsemantics.html.documentstructure.LinkType;
import ecologylab.bigsemantics.html.documentstructure.RecognizedDocumentStructure;
import ecologylab.bigsemantics.html.documentstructure.SemanticAnchor;
import ecologylab.bigsemantics.html.dom.IDOMProvider;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.builtins.Image;
import ecologylab.bigsemantics.metadata.builtins.ImageClipping;
import ecologylab.bigsemantics.metadata.builtins.RichDocument;
import ecologylab.bigsemantics.metadata.builtins.TextClipping;
import ecologylab.bigsemantics.model.text.utils.Filter;
import ecologylab.bigsemantics.seeding.SemanticsPrefs;
import ecologylab.generic.HashMapArrayList;
import ecologylab.generic.StringTools;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.XMLTools;

/**
 * Parses an HTML page and creates its DOM.
 *
 * @author eunyee
 */
public abstract class HTMLDOMParser<D extends Document> extends ContainerParser<D>
    implements DOMParserInterface
{

  static final PrefBoolean SHOW_PAGE_STRUCTURE_PREF =
      PrefBoolean.usePrefBoolean("show_page_structure", false);

  public static int MAX_TEXT_CONTEXT_LENGTH = 1500;

  /**
   * Trim the text context to MAX_TEXT_CONTEXT_LENGTH if it is too long.
   *
   * @param textContext
   * @return the trimmed text context
   */
  // FIXME -- call site for this should really be when we build contexts in walkAndTagDOM()
  public static String trimTooLongContext(String textContext)
  {
    if (textContext.length() > MAX_TEXT_CONTEXT_LENGTH)
      return textContext.substring(0, MAX_TEXT_CONTEXT_LENGTH);
    else
      return textContext;
  }

  /** <code>Filter</code> that recognizes junk images from their URLs */
  public static final Filter filter = new Filter(); // url filtering

  /**
   * Root DOM of the current document
   */
  private org.w3c.dom.Document dom;

  protected IDOMProvider provider;

  boolean indexPage = false;

  boolean contentPage = false;

  private boolean isFile = false;

  protected boolean bold;

  protected boolean italic;

  public HTMLDOMParser()
  {
    super();
  }

  public org.w3c.dom.Document getDom() throws IOException
  {
    org.w3c.dom.Document result = this.dom;
    if (result == null)
    {
      result = createDom();
      this.dom = result;
    }
    return result;
  }

  /**
   * @return The DOM parsed from the reader or input stream, or a bogus DOM with an empty root
   *         node.
   * @throws IOException
   */
  private org.w3c.dom.Document createDom() throws IOException
  {
    // long t0 = System.currentTimeMillis();
    provider = getSemanticsScope().constructDOMProvider();
    InputStream inputStream = inputStream();
    Reader reader = reader();
    org.w3c.dom.Document document = null;
    try
    {
      document = reader != null
          ? provider.parseDOM(reader, null)
          : provider.parseDOM(inputStream, null);
      // getLogRecord().setMsDomCreation(System.currentTimeMillis() - t0);
    }
    finally
    {
      if (inputStream != null)
      {
        inputStream.close();
      }
      if (reader != null) // also close the reader, so the reader-based path does not leak it
      {
        reader.close();
      }
    }
    return document;
  }
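  /**
   * Illustrative sketch (not called by the framework): shows the lazy-caching contract of
   * getDom() above. It assumes only what this class guarantees: the first call parses the
   * underlying stream or reader; later calls return the cached DOM instance.
   */
  @SuppressWarnings("unused")
  private void sketchDomCaching() throws IOException
  {
    org.w3c.dom.Document first = getDom();  // first call parses and caches
    org.w3c.dom.Document second = getDom(); // cache hit: same instance, no re-parse
    assert first == second;
  }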
  /**
   * @return The root node of the document, which should be &lt;html&gt;.
   * @throws IOException
   */
  public Node getRootNode() throws IOException
  {
    return getDom();
  }

  /**
   * Andruid says: NEVER override this method when you parse HTML. Instead, override postParse().
   *
   * @throws IOException
   */
  @Override
  abstract public void parse() throws IOException;

  @Override
  public void recycle()
  {
    dom = null;
    provider = null;
    super.recycle();
  }

  /**
   * Called when the parser sees the <code>&lt;title&gt;</code> tag.
   */
  public void setTitle(Node titleNode)
  {
    StringBuilder title = null;
    NodeList children = titleNode.getChildNodes();
    for (int i = 0; i < children.getLength(); i++)
    {
      Node node = children.item(i);
      if (node.getNodeType() == Node.TEXT_NODE)
      {
        title = StringBuilderUtils.trimAndDecodeUTF8(title, node, 0, true);
        if (title != null)
        {
          XMLTools.unescapeXML(title);
          getDocument().hwSetTitle(StringTools.toString(title));
          StringBuilderUtils.release(title);
        }
        break;
      }
    }
  }

  /**
   * Create a TextClipping and add it to the localCollection in the Container.
   */
  public void constructTextClipping(ParagraphText paraText)
  {
    if ((paraText != null) && (paraText.length() > 0))
    {
      StringBuilder buffy = paraText.getBuffy();
      if (buffy.indexOf("@") == -1) // filter out paragraphs with email addresses
      {
        TextClipping textClipping = new TextClipping(getSemanticsScope()
            .getMetaMetadataRepository().getMMByName(getSemanticsScope().TEXT_TAG));
        textClipping.setText(StringTools.toString(buffy));
        textClipping.setSourceDoc(getDocument());
        ((RichDocument) getDocument()).addClipping(textClipping);
      }
    }
  }

  public int numExtractedClippings()
  {
    return ((RichDocument) getDocument()).numClippings();
  }

  /**
   * Aggregates the AnchorContexts by their destination hrefs. For each distinct href, creates a
   * purl, constructs one SemanticAnchor that aggregates the semantics of all AnchorContexts
   * pointing at it, sets the metadata, and adds an outlink from the source document. Hrefs that
   * turn out to be images become ImageClippings instead.
   */
  public void generateCandidateContainersFromContexts(ArrayList<AnchorContext> anchorContexts,
                                                      boolean fromContentBody)
  {
    HashMapArrayList<ParsedURL, ArrayList<AnchorContext>> hashedAnchorContexts =
        new HashMapArrayList<ParsedURL, ArrayList<AnchorContext>>();
    for (AnchorContext anchorContext : anchorContexts)
    {
      ParsedURL destHref = anchorContext.getHref();
      if (destHref.isImg())
      {
        // The href is actually an image. Create a new img element and associate the text to it.
        Image newImage = getSemanticsScope().getOrConstructImage(destHref);
        if (newImage != null)
          newImage.constructClipping(getDocument(), null, null, anchorContext.getAnchorText());
        continue;
      }
      ArrayList<AnchorContext> arrayList = hashedAnchorContexts.get(destHref);
      if (arrayList == null)
      {
        arrayList = new ArrayList<AnchorContext>();
        hashedAnchorContexts.put(destHref, arrayList);
      }
      arrayList.add(anchorContext);
    }
    // Now that we have aggregated the AnchorContexts,
    // we generate one SemanticAnchor per purl, aggregating all the semantics of its set of
    // AnchorContexts.
    for (ParsedURL hrefPurl : hashedAnchorContexts.keySet())
    {
      ArrayList<AnchorContext> anchorContextsPerHref = hashedAnchorContexts.get(hrefPurl);
      SemanticAnchor semanticAnchor =
          new SemanticAnchor(fromContentBody ? LinkType.WILD_CONTENT_BODY : LinkType.WILD,
                             hrefPurl,
                             anchorContextsPerHref,
                             purl(),
                             1);
      handleSemanticAnchor(semanticAnchor, hrefPurl);
    }
  }
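  /**
   * Illustrative sketch (not called by the framework): the group-by-href step above, in
   * isolation. One SemanticAnchor is later made per key of the returned map, aggregating the
   * semantics of every AnchorContext that points at that href. java.util.LinkedHashMap stands in
   * here for ecologylab's HashMapArrayList, assuming both preserve insertion order.
   */
  @SuppressWarnings("unused")
  private static java.util.Map<ParsedURL, ArrayList<AnchorContext>> sketchGroupByHref(
      ArrayList<AnchorContext> anchorContexts)
  {
    java.util.Map<ParsedURL, ArrayList<AnchorContext>> grouped =
        new java.util.LinkedHashMap<ParsedURL, ArrayList<AnchorContext>>();
    for (AnchorContext anchorContext : anchorContexts)
    {
      ParsedURL destHref = anchorContext.getHref();
      ArrayList<AnchorContext> bucket = grouped.get(destHref);
      if (bucket == null)
      {
        bucket = new ArrayList<AnchorContext>();
        grouped.put(destHref, bucket); // first context seen for this href starts a new bucket
      }
      bucket.add(anchorContext);
    }
    return grouped;
  }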
  protected void handleSemanticAnchor(SemanticAnchor semanticAnchor, ParsedURL hrefPurl)
  {
    // FIXME -- should we depend on Seeding here?? or do this in post-processing for
    // CompoundDocumentParserCrawlerResult??
    if (hrefPurl != null && !hrefPurl.isNull() && getSemanticsScope().accept(hrefPurl))
    {
      Document hrefDocument = getSemanticsScope().getOrConstructDocument(hrefPurl);
      if (hrefDocument == null || hrefDocument.isRecycled())
      {
        warning("hrefDocument is null or recycled: " + hrefPurl);
        return; // Should actually raise an exception, but this can happen when a container is
                // not meant to be reincarnated
      }
      Document sourceDocument = getDocument();
      hrefDocument.addSemanticInlink(semanticAnchor, sourceDocument);
      sourceDocument.addCandidateOutlink(hrefDocument);
    }
  }

  public void setRecognizedDocumentStructure(Class<? extends RecognizedDocumentStructure> pageType)
  {
    if (SHOW_PAGE_STRUCTURE_PREF.value())
    {
      RichDocument metadata = (RichDocument) getDocument();
      if (metadata != null)
        metadata.setPageStructure(pageType.getSimpleName());
      else
        error("Can't setPageStructure() because metadata is null :-(");
    }
  }

  @Override
  public void setContent()
  {
    contentPage = true;
  }

  @Override
  public void setIndexPage()
  {
    indexPage = true;
  }

  @Override
  public boolean isIndexPage()
  {
    return indexPage;
  }

  @Override
  public boolean isContentPage()
  {
    return contentPage;
  }

  @Override
  public void removeTheContainerFromCandidates(ParsedURL containerPURL)
  {
    warning("Not Implemented: removeTheContainerFromCandidates(" + containerPURL + ")");
  }

  /**
   * Parser found a bold (or strong) tag or an end bold tag.
   */
  public void setBold(boolean on)
  {
    bold = on;
  }

  /**
   * Parser found an italic (or em) tag or an end italic tag.
   */
  public void setItalic(boolean on)
  {
    italic = on;
  }

  protected ParsedURL buildAndFilterPurl(String urlString)
  {
    ParsedURL result = buildPurl(urlString);
    return (result != null) && filterPurl(result) ? result : null;
  }

  /**
   * Filters the ParsedURL, checking that
   * <li>the semantics scope accepts it
   * <li>its name does not start with "File:"
   * <li>it is crawlable <br>
   * Does less checking when it's drag'n'drop (document == null).
   *
   * @param parsedURL
   * @return true if the ParsedURL passes all the filters
   */
  protected boolean filterPurl(ParsedURL parsedURL)
  {
    Document document = getDocument();
    // FIXME -- should we depend on Seeding here?? or do this in post-processing for
    // CompoundDocumentParserCrawlerResult??
    return parsedURL != null
        && getSemanticsScope().accept(parsedURL)
        && !parsedURL.getName().startsWith("File:")
        && parsedURL.crawlable()
        && (document == null // null-check must come before the dereferences below
            || (!document.isJustCrawl()
                && (parsedURL.isImg() || (isFile && parsedURL.isHTML()) || !isFile)));
  }

  protected ParsedURL buildPurl(String urlString)
  {
    Document sourceDocument = getDocument();
    return sourceDocument.isAnonymous()
        ? ParsedURL.createFromHTML(null, urlString, false)
        : sourceDocument.getLocation().createFromHTML(urlString, isSearchPage());
  }
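  /**
   * Illustrative sketch (not called by the framework): the intended use of buildAndFilterPurl()
   * when harvesting hrefs from the page. The raw href string here is hypothetical.
   */
  @SuppressWarnings("unused")
  private void sketchBuildAndFilter()
  {
    ParsedURL accepted = buildAndFilterPurl("relative/page.html");
    if (accepted != null)
    {
      // accepted resolved against the source document's location, passed the scope's accept()
      // check, is crawlable, and its name does not start with "File:"
    }
  }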
  /**
   * Add an image+text surrogate for this document that was extracted from a different document.
   * FIXME: this currently does the same thing as a surrogate extracted from this document, but we
   * might want to make a special collection for these "anchor surrogates".
   *
   * Really, this should be setting the outlink somehow...
   */
  public ImageClipping constructAnchorImageClipping(ImgElement imgNode, ParsedURL anchorHref)
  {
    RichDocument source = (RichDocument) getDocumentClosure().getDocument();
    ImageClipping clipping = constructImageClipping(getDocument(), source, null, imgNode);
    // CompoundDocument outlink = (CompoundDocument)
    // getSemanticsScope().getOrConstructDocument(anchorHref);
    // clipping.setOutlink(outlink);
    return clipping;
  }

  /**
   * Create image and text surrogates for this HTML document, and add these surrogates to the
   * localCollection in the Container.
   */
  public ImageClipping constructImageClipping(ImgElement imgNode, ParsedURL anchorHref)
  {
    if (getDocumentClosure().location().equals(anchorHref))
      debug("This should be something else here!!");
    debug("PART 1 " + anchorHref);
    Document outlink = getSemanticsScope().getOrConstructDocument(anchorHref);
    Document sourceDocument = getDocument();
    return constructImageClipping(sourceDocument, sourceDocument, outlink, imgNode);
  }

  /**
   * Construct an ImageClipping, associating it properly in the hypermedia graph.
   *
   * @param basisDocument
   *          The CompoundDocument to add the clipping to.
   * @param sourceDocument
   *          The CompoundDocument to be listed as the Clipping's source: the one it is a
   *          surrogate for. Usually the same as basisDocument, but for a surrogate for X found
   *          in Y, use the outlink here instead.
   * @param outlink
   *          The Document to be listed as the Clipping's href destination.
   * @param imgNode
   *          Representation of the source HTML, plus textContext and additional extractedCaption.
   *
   * @return the constructed ImageClipping, or null if the image is uninformative.
   */
  public ImageClipping constructImageClipping(Document basisDocument,
                                              Document sourceDocument,
                                              Document outlink,
                                              ImgElement imgNode)
  {
    ParsedURL srcPurl = imgNode.getSrc();
    ImageClipping result = null;
    if (srcPurl != null)
    {
      int width = imgNode.getWidth();
      int height = imgNode.getHeight();
      int mimeIndex = srcPurl.mediaMimeIndex();
      boolean isMap = imgNode.isMap();
      switch (ImageFeatures.designRole(width, height, mimeIndex, isMap))
      {
      case ImageFeatures.INFORMATIVE:
      case ImageFeatures.UNKNOWN:
        String alt = imgNode.getAlt();
        if (alt != null)
          alt = alt.trim();
        Image image = getSemanticsScope().getOrConstructImage(srcPurl);
        if (image == null)
          return null;
        image.setWidth(width);
        image.setHeight(height);
        result = image.constructClipping(basisDocument, sourceDocument, outlink, alt,
                                         imgNode.getTextContext());
        result.setXpath(imgNode.xpath());
        break;
      case ImageFeatures.UN_INFORMATIVE:
      default:
        getSemanticsScope().getLocalDocumentCollection().registerUninformativeImage(srcPurl);
      }
    }
    return result;
  }

  public static boolean isAd(ParsedURL hrefPurl)
  {
    String lc = hrefPurl.lc();
    return SemanticsPrefs.FILTER_OUT_ADS.value() && filter.matchLc(lc);
  }

  /**
   * @return true if <code>this</code> is a search page, and so needs special parsing of URLs to
   *         unpack nested entries; false in all other (usual) cases.
   */
  public boolean isSearchPage()
  {
    return false;
  }

  protected void findFaviconPath(Document doc, XPath xpath)
  {
    if (dom == null)
    {
      warning("DOM is null (maybe direct binding case); not looking up the favicon by xpath.");
      return;
    }
    String faviconHref = "";
    try
    {
      faviconHref = xpath.evaluate("//link[@rel=\"shortcut icon\"]/@href", dom);
    }
    catch (XPathExpressionException e)
    {
      e.printStackTrace();
    }
    if (faviconHref != null && !faviconHref.isEmpty()) // compare content, not references
    {
      // Found one
      // System.out.println("Got a path: " + faviconHref);
      SemanticsSite site = doc.getSite();
      site.setFaviconPath(faviconHref, doc.getLocation());
    }
    else
    {
      // Did not. Look in the root.
    }
  }

}
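/**
 * Illustrative sketch, not wired into the framework: the favicon lookup in findFaviconPath(),
 * reduced to plain JAXP. Given any parsed w3c DOM, the same XPath expression yields the declared
 * shortcut-icon href, or the empty string if none is declared.
 */
class FaviconXPathSketch
{
  static String faviconHref(org.w3c.dom.Document dom) throws XPathExpressionException
  {
    // a fresh XPath instance; findFaviconPath() receives one from its caller instead
    XPath xpath = javax.xml.xpath.XPathFactory.newInstance().newXPath();
    return xpath.evaluate("//link[@rel=\"shortcut icon\"]/@href", dom);
  }
}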