package ecologylab.bigsemantics.html;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.TreeMap;

import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import ecologylab.bigsemantics.html.documentstructure.ImageFeatures;
import ecologylab.bigsemantics.html.documentstructure.RecognizedDocumentStructure;
import ecologylab.bigsemantics.html.utils.HTMLNames;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.XMLTools;

/**
 * Walks through the DOM and tags the information necessary for image+text surrogate extraction.
 *
 * Keeps the state needed during the walk (originally derived from JTidy's PPrint).
 *
 * @author eunyee
 */
public class DOMWalkInformationTagger implements HTMLNames
{
  protected static final int MAX_LINKS_PER_PAGE = 200;

  protected static final int PARA_TEXT_LENGTH_LIMIT = 80;

  DOMParserInterface parserInterface;

  ParsedURL purl;

  int encoding;

  int state;

  /**
   * Current DOM node that is being processed.
   */
  protected org.w3c.dom.Node currentNode = null;

  /**
   * Keeps track of the text length in this page to recognize the page type.
   */
  protected int totalTxtLength = 0;

  /**
   * Collection of text elements until a block-level element is reached.
   */
  protected ParagraphText currentParagraphText = new ParagraphText();

  /**
   * Keeps the paragraph texts in the article body, keyed by text length.
   */
  private TreeMap<Integer, ParagraphText> paragraphTextsTMap = new TreeMap<Integer, ParagraphText>();

  /**
   * All images in the page.
   */
  protected ArrayList<ImgElement> allImgNodes = new ArrayList<ImgElement>();

  /**
   * All links in the current page.
   */
  protected ArrayList<AElement> allAnchorNodes = new ArrayList<AElement>();

  private int[] linebuf;

  private int lbufsize;

  private int linelen;

  String partitionID = "";

  FileOutputStream fileOutputStream = null;

  public DOMWalkInformationTagger(ParsedURL purl, DOMParserInterface parserInterface)
  {
    this.purl = purl;
    this.parserInterface = parserInterface;
  }

  public void generateCollections(Node doc)
  {
    Element root = null;
    NodeList list = doc.getChildNodes();
    for (int i = 0; i < list.getLength(); i++)
    {
      if (list.item(i) instanceof Element)
      {
        root = (Element) list.item(i);
        break;
      }
    }
    generateCollectionsFromRoot(root);
  }

  public void generateCollectionsFromRoot(Node root)
  {
    tagTree(root);
  }
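  // Usage sketch (illustrative only, not part of the original class). Assumes a W3C DOM
  // Document produced elsewhere, e.g. by an HTML-to-DOM parser, and a DOMParserInterface
  // implementation referred to here as "handler"; both of those names are placeholders.
  //
  //   ParsedURL purl = ParsedURL.getAbsolute("http://example.com/article.html");
  //   DOMWalkInformationTagger tagger = new DOMWalkInformationTagger(purl, handler);
  //   tagger.generateCollections(document);                        // single walk over the DOM
  //   TreeMap<Integer, ParagraphText> paragraphs = tagger.getParagraphTextsTMap();
  //   ArrayList<ImgElement> images = tagger.getAllImgNodes();
  //   ArrayList<AElement> anchors = tagger.getAllAnchorNodes();
  //   tagger.recycle();                                            // release collected elements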
  public void tagTree(Node node)
  {
    Node content;
    currentNode = node;
    if (node == null)
      return;

    short nodeType = node.getNodeType();
    switch (nodeType)
    {
    case Node.TEXT_NODE:
      if (node.getNodeValue() != null && node.getNodeValue().length() > 0)
        tagText(node.getNodeValue().getBytes(), 0, node.getNodeValue().length(), node);
      break;
    case Node.ELEMENT_NODE:
      if (!node.getNodeName().toLowerCase().equals("script")
          && !node.getNodeName().toLowerCase().equals("style"))
      {
        printTag(node);
        NodeList children = node.getChildNodes();
        for (int i = 0; i < children.getLength(); i++)
        {
          Node child = children.item(i);
          tagTree(child);
        }
        printEndTag(node);
      }
      break;
    case Node.DOCUMENT_NODE:
      NodeList children = node.getChildNodes();
      for (int i = 0; i < children.getLength(); i++)
      {
        content = children.item(i);
        tagTree(content);
      }
      break;
    default:
      printTag(node);
      NodeList children2 = node.getChildNodes();
      for (int i = 0; i < children2.getLength(); i++)
      {
        Node child = children2.item(i);
        tagTree(child);
      }
      printEndTag(node);
    }
  }

  public void printTag(Node node)
  {
    String tagName = node.getNodeName().toLowerCase();
    if (tagName.equals("img"))
    {
      ImgElement imgElement = new ImgElement(node, purl);
      // TODO confirm that we are happy only collecting images that seem informative
      if (imgElement.isInformativeImage())
        allImgNodes.add(imgElement);
    }
    else if (tagName.equals("base"))
    {
      Node baseHrefAttr = node.getAttributes().getNamedItem("href");
      String baseHref = (baseHrefAttr == null) ? null : baseHrefAttr.getNodeValue();
      if (baseHref != null)
        purl = (purl == null) ? ParsedURL.getAbsolute(baseHref) : purl.getRelative(baseHref);
    }
    else if (parserInterface != null)
    {
      if (tagName.equals("title"))
      {
        parserInterface.setTitle(node);
      }
      else if (tagName.equals("a"))
      {
        if (allAnchorNodes.size() < MAX_LINKS_PER_PAGE)
        {
          AElement attrNode = new AElement(node, purl);
          allAnchorNodes.add(attrNode);
        }
        // This call is performed during the second parse, while generating containers and
        // extracting metadata:
        // htmlType.newAHref(attributesMap);
      }
      else if (tagName.equals("i"))
      {
        parserInterface.setItalic(true);
      }
      else if (tagName.equals("b"))
      {
        parserInterface.setBold(true);
      }
    }
    // We need to delete a link to the file write part at the end -- EUNYEE
  }

  protected void printEndTag(Node node)
  {
    String tag = node.getNodeName().toLowerCase();
    if (parserInterface != null)
    {
      if (tag.equals("i"))
        parserInterface.setItalic(false);
      else if (tag.equals("b"))
        parserInterface.setBold(false);

      // Create a new paragraph text based on these tags.
      // TODO add more tags that we should define as starting a new paragraph -- eunyee
      if (tag.equals("p") || tag.equals("br") || tag.equals("td") || tag.equals("div")
          || tag.equals("li") || tag.equals("a") || tag.equals("option")
          || (tag.length() == 2 && tag.startsWith("h")))
      {
        closeBlock(node);
      }
    }
  }

  private void closeBlock(Node blockNode)
  {
    addCompletedPara(blockNode);
    currentParagraphText = new ParagraphText();
    totalTxtLength = 0;
  }

  protected void tagText(byte[] textarray, int start, int end, Node node)
  {
    if (textarray != null && textarray.length > 0)
    {
      // Ignore text whose parent element carries an inline style attribute.
      if (currentNode.getParentNode().getAttributes().getNamedItem("style") == null)
      {
        while (Character.isWhitespace((char) textarray[start]) && (start < end - 1))
        {
          start++;
        }
        while (Character.isWhitespace((char) textarray[end - 1]) && (start < end - 1))
        {
          end--;
        }
        int length = end - start;
        if (length > 0
            && !(length == 4 && textarray[start] == 'n' && textarray[start + 1] == 'u'
                && textarray[start + 2] == 'l' && textarray[start + 3] == 'l'))
        {
          currentParagraphText.append(textarray, start, end);
          totalTxtLength += length;
          currentParagraphText.setNode(node);
        }
      }
    }
  }
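  // Worked example for tagText() (descriptive note, not from the original source): for a text
  // node whose value is "   Story headline   ", the two trim loops step start and end past the
  // surrounding whitespace, the remaining 14 characters are appended to currentParagraphText,
  // and totalTxtLength grows by 14. Text under a parent with an inline "style" attribute is
  // skipped, and a trimmed value that is exactly the literal "null" is discarded.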
  /**
   * Associate an actual paragraph text with the current node if one wasn't already. If appropriate,
   * add the currentParagraphText to the paragraphTextsTMap (for example, if there aren't too many
   * already, or if this one looks longer than those collected). Otherwise, recycle the
   * currentParagraphText.
   *
   * @param blockNode
   */
  protected void addCompletedPara(Node blockNode)
  {
    Node node = currentNode;
    if (!currentParagraphText.hasText())
    {
      StringBuilder longestTxtInSubTree =
          RecognizedDocumentStructure.getLongestTxtinSubTree(blockNode, null);
      if (longestTxtInSubTree != null)
      {
        if (longestTxtInSubTree.length() > PARA_TEXT_LENGTH_LIMIT)
        {
          currentParagraphText.setNode(blockNode);
          currentParagraphText.setBuffy(longestTxtInSubTree);
          node = blockNode;
        }
        else
          StringBuilderUtils.release(longestTxtInSubTree);
      }
    }

    if (currentParagraphText.hasText())
    {
      int length = currentParagraphText.length();
      /*
       * Only keep 10 paragraph texts. Thus, if a new paragraph text comes in and the 10 slots have
       * already been filled, we replace the shortest existing one, based on the length of the text.
       */
      if (paragraphTextsTMap.size() > 10)
      {
        Integer tkey = paragraphTextsTMap.firstKey();
        if (tkey.intValue() < totalTxtLength)
        {
          ParagraphText removed = paragraphTextsTMap.remove(tkey);
          removed.recycle();
          paragraphTextsTMap.put(totalTxtLength, currentParagraphText);
        }
        else
          currentParagraphText.recycle();
      }
      // We don't put the text into the paragraphTexts structure unless the text is over a certain
      // length and not surrounded by <a>.
      else if ((length > PARA_TEXT_LENGTH_LIMIT)
          && !underAHref(node)
          && node.getNodeType() != Node.COMMENT_NODE
          && !node.getNodeName().toLowerCase().equals("script")
          && !node.getNodeName().toLowerCase().equals("style"))
      {
        // FIXME -- look out for duplicates introduced by getLongestTxtinSubTree() above
        paragraphTextsTMap.put(length, currentParagraphText);
      }
      else
        currentParagraphText.recycle();
    }
    else
      currentParagraphText.recycle();
  }
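  // Retention policy, illustrated (descriptive note, not taken from the original source): the
  // TreeMap is keyed by text length, so firstKey() always names the shortest paragraph kept so
  // far. Once more than ten entries have accumulated, an incoming paragraph evicts that shortest
  // entry only when the accumulated totalTxtLength exceeds its key; otherwise, and whenever the
  // text is short, wrapped in an <a>, or under a script/style/comment node, it is recycled.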
  public boolean underAHref(Node node)
  {
    if (node.getParentNode().getParentNode().getNodeName().equals("a")
        || node.getParentNode().getNodeName().equals("a"))
    {
      return true;
    }
    return false;
  }

  int startID(String idValue)
  {
    String startID = idValue.substring(0, idValue.indexOf('_'));
    int sID = Integer.parseInt(startID);
    return sID;
  }

  int endID(String idValue)
  {
    String endID = idValue.substring(idValue.indexOf('_') + 1);
    int eID = Integer.parseInt(endID);
    return eID;
  }

  void checkInPartitionID(Node node, int wordSize, int aWordSize)
  {
    NamedNodeMap attributes = node.getParentNode().getAttributes();
    String nodeID = attributes.getNamedItem("tag_id").getNodeValue();
    String data = "";
    if ((startID(nodeID) >= startID(partitionID)) && (endID(nodeID) <= endID(partitionID)))
      data = nodeID + ", " + wordSize + ", " + aWordSize + ", " + "inform" + "\n";
    else
      data = nodeID + ", " + wordSize + ", " + aWordSize + ", " + "non_inform" + "\n";

    try
    {
      fileOutputStream.write(data.getBytes());
    }
    catch (IOException e)
    {
      e.printStackTrace();
    }
  }

  public synchronized void recycle()
  {
    if (paragraphTextsTMap != null)
    {
      for (ParagraphText pt : paragraphTextsTMap.values())
      {
        pt.recycle();
      }
      paragraphTextsTMap.clear();
      paragraphTextsTMap = null;
    }
    recycle(allImgNodes);
    allImgNodes = null;
    recycle(allAnchorNodes);
    allAnchorNodes = null;
    currentNode = null;
  }

  private static void recycle(Collection<? extends HTMLElementDOM> nodeCollection)
  {
    if (nodeCollection != null)
    {
      for (HTMLElementDOM thatNode : nodeCollection)
        thatNode.recycle();
    }
  }

  public static StringBuilder getTextInSubTree(Node node, boolean recurse)
  {
    return getTextInSubTree(node, recurse, null, false, false);
  }

  /**
   * Gets the text for the <code>node</code>, recursing into child elements when
   * <code>recurse</code> is true. Collects the text even if the node contains other nodes in
   * between, specifically the <code>anchor</code>. It does not, however, include the text from
   * the anchor node.
   *
   * @param node
   * @param recurse
   * @param result
   * @param appendNewline TODO
   * @param ignoreAltText
   * @return
   */
  // FIXME -- why is text in anchor node not included?
  public static StringBuilder getTextInSubTree(Node node, boolean recurse, StringBuilder result,
      boolean appendNewline, boolean ignoreAltText)
  {
    NodeList children = node.getChildNodes();
    for (int i = 0; i < children.getLength(); i++)
    {
      Node childNode = children.item(i);
      if ((recurse && childNode.hasChildNodes())
          && !childNode.getNodeName().toLowerCase().equals("script")
          && !childNode.getNodeName().toLowerCase().equals("style"))
      {
        result = getTextInSubTree(childNode, true, result, appendNewline, ignoreAltText);
      }
      else if (childNode.getNodeType() == Node.TEXT_NODE)
      {
        int length = 0;
        if (result != null)
        {
          result.append(' ');
          length = result.length();
        }
        result = StringBuilderUtils.trimAndDecodeUTF8(result, childNode, 0, true);
        if (result != null)
        {
          if (length == result.length())
            result.setLength(length - 1);
          else if (appendNewline)
            result.append('\n');
        }
      }
      // Images: append the alt text to the caption being collected for the image.
      else if (!ignoreAltText && childNode.getNodeName().toLowerCase().equals("img"))
      {
        NamedNodeMap attributes = childNode.getAttributes();
        Node altAtt = attributes.getNamedItem(ALT);
        String alt = (altAtt != null) ? altAtt.getNodeValue() : null;
        if (!ImageFeatures.altIsBogus(alt))
        {
          if (result == null)
            result = StringBuilderUtils.acquire();
          else
            result.append(' ');
          result.append(alt);
        }
      }
    }
    if (result != null)
      XMLTools.unescapeXML(result);
    return result;
  }

  public static StringBuilder getStringBuilder(byte[] bytes, int offset, int length)
  {
    try
    {
      return new StringBuilder(new String(bytes, offset, length, "UTF8"));
    }
    catch (UnsupportedEncodingException e)
    {
      throw new Error("UTF-8 to string conversion failed: " + e.getMessage());
    }
  }

  public int getTotalTxtLength()
  {
    return totalTxtLength;
  }

  public ArrayList<ImgElement> getAllImgNodes()
  {
    return allImgNodes;
  }

  public ArrayList<AElement> getAllAnchorNodes()
  {
    return allAnchorNodes;
  }

  public TreeMap<Integer, ParagraphText> getParagraphTextsTMap()
  {
    return paragraphTextsTMap;
  }

  public void setPartitionID(String partitionID)
  {
    this.partitionID = partitionID;
  }

  public void setFileOutputStream(FileOutputStream fos)
  {
    this.fileOutputStream = fos;
  }
}