package ecologylab.bigsemantics.documentparsers; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.HashMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import ecologylab.bigsemantics.collecting.SemanticsGlobalScope; import ecologylab.bigsemantics.html.DOMParserInterface; import ecologylab.bigsemantics.html.ImgElement; import ecologylab.bigsemantics.html.utils.HTMLNames; import ecologylab.bigsemantics.html.utils.StringBuilderUtils; import ecologylab.bigsemantics.metadata.builtins.AnonymousDocument; import ecologylab.bigsemantics.metadata.builtins.Document; import ecologylab.bigsemantics.metadata.builtins.DocumentClosure; import ecologylab.bigsemantics.metadata.builtins.Image; import ecologylab.bigsemantics.metadata.builtins.ImageClipping; import ecologylab.generic.Continuation; import ecologylab.generic.DomTools; import ecologylab.net.ParsedURL; import ecologylab.serialization.XMLTools; public class HTMLFragmentDOMParser extends HTMLDOMParser implements DOMParserInterface, HTMLNames { InputStream fragmentStream; Reader reader; ArrayList<ImageClipping> imageClippings = new ArrayList<ImageClipping>(); ParsedURL containerPurl; Document containerDocument; Document textOutlink; StringBuilder bodyTextBuffy = new StringBuilder(); SpecialImageUrlHandler specialImageUrlHandler = new SpecialImageUrlHandler(); private static HashMap<String, Integer> namesOfBreaklineNodeNames = null; public HTMLFragmentDOMParser(Reader reader, InputStream inputStream) { super(); fragmentStream = inputStream; this.reader = reader; AnonymousDocument anonymousDocument = new AnonymousDocument(); setDocumentClosure(anonymousDocument.getOrConstructClosure()); } @Override public void parse() throws IOException { org.w3c.dom.Document dom = getDom(); //DomTools.prettyPrint(dom); int containerNodeIndex = 0; NodeList bodyNodeList = dom.getElementsByTagName(BODY); if (bodyNodeList.getLength() > 0) { Node bodyNode = bodyNodeList.item(0); parseText(bodyTextBuffy, bodyNode); checkForSimplSourceLocation(bodyNode); checkForMetadata(bodyNode); } parseImages(dom); } private void parseImages(org.w3c.dom.Document dom) { NodeList imgNodeList = dom.getElementsByTagName(IMG); int numImages = imgNodeList.getLength(); if (numImages > 0) { for (int i = 0; i < numImages; i++) { Node imgNode = imgNodeList.item(i); Node parent = imgNode.getParentNode(); Document outlink = null; boolean changeSourceDoc = false; String src = DomTools.getAttribute(imgNode, SRC); src = specialImageUrlHandler.changeImageUrlIfNeeded(src); ParsedURL imgPurl = ImgElement.constructPurl(containerPurl, src); do { if (A.equals(parent.getNodeName())) { String hrefString = DomTools.getAttribute(parent, HREF); if (hrefString != null) { try { if (imgPurl == null) { String srcUrl = specialImageUrlHandler.getImageUrlFromParameters(hrefString); if (srcUrl != null) imgPurl = ImgElement.constructPurl(containerPurl, srcUrl); } if (imgPurl == null) break; StringBuilder newImgHrefBuf = StringBuilderUtils.acquire(); changeSourceDoc = specialImageUrlHandler.changeImageRefUrlAndSourceDocIfNeeded(hrefString, newImgHrefBuf); hrefString = newImgHrefBuf.length() > 0 ? newImgHrefBuf.toString() : hrefString; StringBuilderUtils.release(newImgHrefBuf); } catch (UnsupportedEncodingException e) { error("Image ref URL cannot be decoded because it is using unsupported encoding. " + "We support UTF-8 only."); e.printStackTrace(); } ParsedURL aHref = ImgElement.constructPurl(containerPurl, hrefString); if (aHref != null) outlink = getSemanticsScope().getOrConstructDocument(aHref); } break; } parent = parent.getParentNode(); } while (parent != null); if (imgPurl == null) continue; SemanticsGlobalScope semanticsSessionScope = getSemanticsScope(); Image image = semanticsSessionScope.getOrConstructImage(imgPurl); if (image != null) { String altText = DomTools.getAttribute(imgNode, ALT); final ImageClipping imageClipping = image.constructClipping(containerDocument, null /*outlink*/, altText, null); if (changeSourceDoc) { outlink.queueDownload(new Continuation<DocumentClosure>() { @Override public void callback(DocumentClosure o) { Document downloadedDoc = o.getDocument(); if (downloadedDoc != null && !downloadedDoc.isRecycled()) { imageClipping.setSourceDoc(downloadedDoc); imageClipping.setOutlinks(null); imageClipping.setMetadataChanged(true); } } }); } imageClippings.add(imageClipping); } } } } public void parseText(StringBuilder buffy, Node bodyNode) { //debug("Node:" + bodyNode.getNodeName() + ":" + bodyNode.getNodeValue()); NodeList children = bodyNode.getChildNodes(); boolean addLine = false; // this is outside of the loop below to make it work correctly for (int i = 0; i < children.getLength(); i++) { Node kid = children.item(i); if (A.equals(kid.getNodeName()) && textOutlink == null) // first cut; needs refinement { String hrefString = DomTools.getAttribute(kid, HREF); if (hrefString != null) { ParsedURL aHref = ImgElement.constructPurl(containerPurl, hrefString); if (aHref != null) textOutlink = getSemanticsScope().getOrConstructDocument(aHref); } } if (addLine == false) addLine = shouldBreakLineWithNodeName(kid.getNodeName()); if (kid.getNodeValue() != null) { String v = kid.getNodeValue(); if (kid.getNodeName().equals(HASH_COMMENT)) continue; addWithOneSpaceBetween(buffy, v, false); if (addLine) { buffy.append('\n'); addLine = false; } } else if(shouldBreakLineWithNodeName(kid.getNodeName())) { buffy.append('\n'); } //addWithOneSpaceBetween(buffy, walkDomAddingTextAndAddNewlines(kid), true); parseText(buffy, kid); } } private static void addWithOneSpaceBetween(StringBuilder buffy, String v, boolean newlineOK) { char lastChar = (buffy.length() > 0) ? buffy.charAt(buffy.length() - 1) : ' '; if (lastChar != '\n') buffy.append(' '); if (!newlineOK ) v = v.replaceAll("\\n", " "); v = v.replaceAll("^[\\s]+", ""); v = v.replaceAll("[\\s]+", " "); if (v.length() > 0) buffy.append(v); } /** * @author rhema returns true when a breakline would make sense based on the node name. * * @param nodeName * such as p, div, br * @return */ private static boolean shouldBreakLineWithNodeName(String nodeName) { String name = nodeName.toLowerCase(); if (namesOfBreaklineNodeNames == null) { namesOfBreaklineNodeNames = new HashMap<String, Integer>(); namesOfBreaklineNodeNames.put("p", 1); namesOfBreaklineNodeNames.put("h1", 1); namesOfBreaklineNodeNames.put("h2", 1); namesOfBreaklineNodeNames.put("h3", 1); namesOfBreaklineNodeNames.put("h4", 1); namesOfBreaklineNodeNames.put("h5", 1); namesOfBreaklineNodeNames.put("h6", 1); namesOfBreaklineNodeNames.put("br", 1); namesOfBreaklineNodeNames.put("div", 1); } return namesOfBreaklineNodeNames.containsKey(name); } void checkForSimplSourceLocation(Node node) { node.getAttributes(); if (node.getAttributes() != null && setContainerLocation(node) != null) { return; } NodeList children = node.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { checkForSimplSourceLocation(children.item(i)); } } private ParsedURL setContainerLocation(Node elementNode) { if (containerPurl == null && elementNode != null) { String containerLocation = DomTools.getAttribute(elementNode, SIMPL_SOURCE_LOCATION); if (containerLocation == null || containerLocation.length() == 0) containerLocation = DomTools.getAttribute(elementNode, SIMPL); if (containerLocation == null || containerLocation.length() == 0) containerLocation = DomTools.getAttribute(elementNode, CONTAINER); if (containerLocation != null && containerLocation.length() > 0) { containerLocation = XMLTools.unescapeXML(containerLocation); containerPurl = ParsedURL.getAbsolute(containerLocation); containerDocument = getSemanticsScope().getOrConstructDocument(containerPurl); } } return containerPurl; } void checkForMetadata(Node node) { node.getAttributes(); if (node.getAttributes() != null && parseInjectedMetadata(node) != null) { return; } NodeList children = node.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { checkForMetadata(children.item(i)); } } private ParsedURL parseInjectedMetadata(Node elementNode) { if (containerPurl == null && elementNode != null) { String containerMetadata = DomTools.getAttribute(elementNode, SIMPL_METADATA); DomTools.prettyPrint(elementNode); if (containerMetadata != null && containerMetadata.length() > 0) { System.out.println("\n\nsimpl:metadata:\n"+containerMetadata+"\n\n"); Document metadataFromBrowser = Document.constructAndMapFromJson(containerMetadata, getSemanticsScope()); if (metadataFromBrowser != null) { // workflows need to be modified to accomodate metadata coming from drag System.out.println("\nSetting container document to injected metadata\n"); containerDocument = metadataFromBrowser; containerPurl = metadataFromBrowser.getLocation(); } } } return containerPurl; } @Override public InputStream inputStream() { return fragmentStream; } @Override public Reader reader() { return reader; } public String getBodyText() { return bodyTextBuffy.toString(); } public Document getTextOutlink() { return textOutlink; } public ArrayList<ImageClipping> getImageClippings() { return imageClippings; } public void setContent() { } public void setIndexPage() { } public Document getContainerDocument() { return containerDocument; } public ParsedURL getContainerPurl() { return containerPurl; } @Override public void recycle() { fragmentStream = null; reader = null; imageClippings.clear(); imageClippings = null; containerPurl = null; containerDocument = null; textOutlink = null; bodyTextBuffy = null; super.recycle(); } }