package com.dappit.Dapper.parser; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.Set; import org.commoncrawl.util.Tuples; import org.commoncrawl.util.Tuples.Pair; import org.w3c.dom.Document; import com.google.common.collect.Sets; public class LinkExtractionDocumentBuilder implements DocumentBuilder { static abstract class Node { public NodeWithChildren parentNode; public int nodePosition = -1; public Node(NodeWithChildren parentNode) { this.parentNode = parentNode; } public abstract boolean isTextNode(); public boolean isLeafNode() { return true; } public abstract String nodeName(); } static class TextNode extends Node { public TextNode(NodeWithChildren parentNode,String text) { super(parentNode); this.text = text; } public String text; @Override public boolean isTextNode() { return true; } public String nodeName() { return "text"; } @Override public String toString() { return "textNode:" + text; }; } static class NodeWithAttributes extends Node { public NodeWithAttributes(NodeWithChildren parentNode,String nodeName) { super(parentNode); this.nodeName = nodeName; } public String nodeName() { return nodeName; } public String nodeName; public LinkedList<Pair<String,String>> attributes = null; public void addAttribute(String attributeName,String attributeValue) { if (attributes == null) attributes = new LinkedList<Tuples.Pair<String,String>>(); attributes.add(new Pair(attributeName,attributeValue)); } public String getAttribute(String name) { if (attributes != null) { for (Pair<String,String> attribute : attributes) { if (attribute.e0.equals(name)) { return attribute.e1; } } } return ""; } @Override public boolean isTextNode() { return false; } @Override public String toString() { return "LeafNode:" + nodeName; } } static class NodeWithChildren extends NodeWithAttributes { public NodeWithChildren(NodeWithChildren parentNode,String nodeName) { super(parentNode,nodeName); } public ArrayList<Node> childNodes = null; public int anchorCount = 0; public int imgCount =0; public int brCount =0; public void addNode(Node node) { if (childNodes == null) childNodes = new ArrayList<LinkExtractionDocumentBuilder.Node>(); childNodes.add(node); node.nodePosition = childNodes.size() - 1; String nodeName = node.nodeName(); if (nodeName.equals("br")) { this.brCount++; } else if (nodeName.equals("a")) { this.anchorCount++; } else if (nodeName.equals("img")) { this.imgCount ++; } } @Override public boolean isLeafNode() { return false; } @Override public String toString() { return "BlockNode:" + nodeName; } public int nestingLevel = 0; public int leafsWithURLCount = 0; } public static class LinkExtractionContext { NodeWithChildren activeExtractionNode = null; public NodeWithAttributes activeLeafNode = null; } static Set<String> trackedNonLeafNodes = Sets.newHashSet( "p","div","tr","td","th","table","a","li","dt","dd", "h1","h2","h3","h4","h5","h6" ); static Set<String> trackedLeafNodes = Sets.newHashSet("br","link","img"); @Override public Document buildDocument(InstructionsPool instructionsPool,FileOutputStream optionalOutputStream) throws IOException { System.out.println("iterating parse instructions"); LinkExtractionContext context = new LinkExtractionContext(); List<Integer> operations = instructionsPool.operations; List<String> arguments = instructionsPool.arguments; for (int i=0; i<operations.size(); i++) { int domOperation = operations.get(i); String domArgument = arguments.get(i); //System.out.println("Operation :" + ParserInstruction.getOperationString(domOperation)+" Arg:~" + domArgument+"~"); switch (domOperation) { // Open node : case ParserInstruction.OpenNode: { String tagName = domArgument.toLowerCase(); if (trackedNonLeafNodes.contains(tagName)) { System.out.println("Found TrackedNode:" + tagName); NodeWithChildren node = new NodeWithChildren(context.activeExtractionNode,tagName); if (context.activeExtractionNode != null) context.activeExtractionNode.addNode(node); context.activeExtractionNode = node; } else { if (context.activeExtractionNode != null) context.activeExtractionNode.nestingLevel++; } } break; // Close node : case ParserInstruction.CloseNode:{ NodeWithChildren activeNode = context.activeExtractionNode; if (activeNode != null) { if (activeNode.nestingLevel != 0) { activeNode.nestingLevel--; } else { //if (activeNode.leafsWithURLCount != 0) { extractLinksFromNode(activeNode); //} // pop stack context.activeExtractionNode = activeNode.parentNode; } } } break; case ParserInstruction.AddText: if (context.activeExtractionNode != null){ String text = domArgument; text = text.replaceAll("\\s{2,}"," "); text = text.trim(); if (text.length() != 0) { context.activeExtractionNode.addNode(new TextNode(context.activeExtractionNode, text)); } } break; case ParserInstruction.AddContent: //System.out.println("AddContent:"+domArgument); break; case ParserInstruction.AddLeaf: { if (context.activeExtractionNode != null) { String tagName = domArgument.toLowerCase(); if (trackedLeafNodes.contains(tagName)) { context.activeLeafNode = new NodeWithAttributes(context.activeExtractionNode, tagName); } } }break; case ParserInstruction.WriteAttributeKey: { String key = domArgument.toLowerCase(); ++i; operations.get(i); String value = arguments.get(i); if (context.activeLeafNode != null) { context.activeLeafNode.addAttribute(key, value); } else if (context.activeExtractionNode != null) { context.activeExtractionNode.addAttribute(key, value); } } break; case ParserInstruction.CloseLeaf: { if (context.activeLeafNode != null) { context.activeExtractionNode.addNode(context.activeLeafNode); context.activeLeafNode = null; } } break; case ParserInstruction.AddEntity: System.out.println("AddEntity:" + domArgument); break; case ParserInstruction.AddComment: //System.out.println("AddComment:" + domArgument); break; case ParserInstruction.SetTitle: { } break; } } return null; } void findTextAroundNode(Node anchorNode,NodeWithChildren parentNode,StringBuffer buffer) { buffer.append("Parent Node:" + parentNode.nodeName + "\nContext Text:"); if (parentNode.childNodes != null) { for (Node child : parentNode.childNodes) { if (child == anchorNode) break; if (child.isTextNode()) { buffer.append(((TextNode)child).text); } } } } void extractLinksFromNode(NodeWithChildren node) { if (node.parentNode == null && node.nodeName.equals("a")){ extractTextFromAnchorNode(node); } else { if (node.childNodes != null) { for (Node childNode : node.childNodes) { if (childNode instanceof NodeWithChildren) { if (((NodeWithChildren)childNode).nodeName.equalsIgnoreCase("a")) { extractTextFromAnchorNode((NodeWithChildren)childNode); } } } } } } void extractTextFromNode(NodeWithChildren node,StringBuffer textBufferOut) { if (node.childNodes != null) { for (Node childNode : node.childNodes) { if (childNode.isTextNode()) { textBufferOut.append(((TextNode)childNode).text + " "); } else if (!childNode.isLeafNode()) { extractTextFromNode((NodeWithChildren)childNode,textBufferOut); } } } } Node findPreviousAnchorOrBreakInParent(NodeWithChildren anchorNode) { for (int i=anchorNode.nodePosition-1;i>=0;--i) { Node prevNode = anchorNode.parentNode.childNodes.get(i); if (prevNode.nodeName().equals("a") || prevNode.nodeName().equals("br") || trackedNonLeafNodes.contains(prevNode.nodeName())) { return prevNode; } } return null; } void extractTextAfterGivenNodeAndBeforeNextAnchorOrBreak( NodeWithChildren parent, Node previousAnchorOrBreak, NodeWithChildren currentAnchor,StringBuffer textBuffer) { int startIndex = (previousAnchorOrBreak != null) ? previousAnchorOrBreak.nodePosition : -1; for (int i=startIndex + 1;i<parent.childNodes.size();++i) { Node nodeAtIndex = parent.childNodes.get(i); if (nodeAtIndex.isTextNode()) { textBuffer.append(((TextNode)nodeAtIndex).text +" "); } else if (nodeAtIndex.isLeafNode()) { if (nodeAtIndex.nodeName().equals("br")) { break; } } else if (!nodeAtIndex.isLeafNode()) { if (nodeAtIndex.nodeName().equals("a") && nodeAtIndex != currentAnchor) { break; } else { extractTextFromNode((NodeWithChildren)nodeAtIndex,textBuffer); } } } } void extractTextFromAnchorNode(NodeWithChildren node){ StringBuffer anchorText = new StringBuffer(); // ok get parent .. first NodeWithChildren parent = node.parentNode; // if parent present .. if (parent != null) { if (parent.brCount == 0 && parent.anchorCount == 1) { extractTextFromNode(parent,anchorText); } else { System.out.println("Looking For Previous Break In Parent"); Node previousAnchorOrBreak = findPreviousAnchorOrBreakInParent(node); System.out.println("Found:" + previousAnchorOrBreak); System.out.println("Extracting Text Given Previous Node"); extractTextAfterGivenNodeAndBeforeNextAnchorOrBreak(parent,previousAnchorOrBreak,node,anchorText); } } else { extractTextFromNode(parent,anchorText); } System.out.println("Anchor Text:" + anchorText.toString() + " href:" + node.getAttribute("href")); } }