package ecologylab.bigsemantics.html;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.TreeMap;

import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import ecologylab.bigsemantics.html.documentstructure.ImageFeatures;
import ecologylab.bigsemantics.html.documentstructure.RecognizedDocumentStructure;
import ecologylab.bigsemantics.html.utils.HTMLNames;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.XMLTools;
/**
 * Walks through the DOM and tags the information necessary for image+text surrogate extraction.
 *
 * Originally extended JTidy's PPrint object; it now implements the walk over the w3c DOM
 * directly, keeping the state necessary for image+text surrogate extraction.
 *
 * @author eunyee
 *
 */
public class DOMWalkInformationTagger implements HTMLNames
{
/** Maximum number of &lt;a&gt; elements collected per page; further links are ignored (see printTag()). */
protected static final int MAX_LINKS_PER_PAGE = 200;
/** Minimum character count for a paragraph to be considered article body text (see addCompletedPara()). */
protected static final int PARA_TEXT_LENGTH_LIMIT = 80;
// Callback notified of titles and bold/italic state changes during the walk; may be null.
DOMParserInterface parserInterface;
// Base URL of the page being walked; re-based when a <base href> element is encountered.
ParsedURL purl;
// NOTE(review): encoding and state are never referenced in this class — presumably leftovers
// from the JTidy PPrint heritage mentioned in the class comment; confirm no package-level
// use before removing.
int encoding;
int state;
/**
 * Current DOM node that is being processed
 */
protected org.w3c.dom.Node currentNode = null;
/**
 * Keep track of the text length in this page to recognize the page type.
 */
protected int totalTxtLength = 0;
/**
 * Collection of text elements until a block level element is reached
 */
protected ParagraphText currentParagraphText = new ParagraphText();
/**
 * Keep the array of the paragraph texts in the article body.
 *
 * Keyed by text length, so the TreeMap keeps entries sorted and the shortest is found first.
 */
private TreeMap<Integer, ParagraphText> paragraphTextsTMap = new TreeMap<Integer, ParagraphText>();
/**
 * All images in the page
 */
protected ArrayList<ImgElement> allImgNodes = new ArrayList<ImgElement>();
/**
 * All links in current page
 */
protected ArrayList<AElement> allAnchorNodes = new ArrayList<AElement>();
// NOTE(review): linebuf / lbufsize / linelen are private and never used anywhere in this
// class — dead state inherited from the JTidy pretty-printer; candidates for deletion.
private int[] linebuf;
private int lbufsize;
private int linelen;
// Partition id of the form "start_end" (parsed by startID()/endID()); used by checkInPartitionID().
String partitionID = "";
// Destination for checkInPartitionID()'s CSV-style debug output; set via setFileOutputStream().
FileOutputStream fileOutputStream = null;
/**
 * Constructs a tagger for one page.
 *
 * @param purl            base URL of the page being walked (may be re-based by a &lt;base&gt; tag)
 * @param parserInterface callback for titles, bold/italic state, etc.; may be null
 */
public DOMWalkInformationTagger(ParsedURL purl, DOMParserInterface parserInterface)
{
  this.parserInterface = parserInterface;
  this.purl = purl;
}
/**
 * Locates the first Element child of the given document-level node (typically the
 * &lt;html&gt; root) and walks its subtree to populate the paragraph, image, and
 * link collections.
 *
 * @param doc document (or document-like) node whose first element child is the root
 */
public void generateCollections(Node doc)
{
  Element root = null;
  // Walk the sibling chain instead of indexing the NodeList; same document order.
  for (Node child = doc.getFirstChild(); child != null; child = child.getNextSibling())
  {
    if (child instanceof Element)
    {
      root = (Element) child;
      break;
    }
  }
  generateCollectionsFromRoot(root);
}
/**
 * Entry point for collection generation when the caller already holds the root node:
 * walks the subtree, tagging text, images, and links as it goes.
 *
 * @param root root of the (sub)tree to process; null is tolerated by tagTree()
 */
public void generateCollectionsFromRoot(Node root)
{
  tagTree(root);
}
/**
 * Recursively walks the DOM subtree rooted at node, dispatching on node type:
 * text nodes are accumulated into the current paragraph via tagText(), element
 * nodes are reported through printTag()/printEndTag() (script/style subtrees are
 * skipped entirely), and document nodes simply recurse into their children.
 *
 * @param node subtree root; null is tolerated and ignored
 */
public void tagTree(Node node)
{
  if (node == null)
    return;
  currentNode = node;
  switch (node.getNodeType())
  {
  case Node.TEXT_NODE:
    String value = node.getNodeValue();
    if (value != null && value.length() > 0)
    {
      // FIX: encode explicitly as UTF-8 and pass the BYTE count as the end index.
      // The original passed value.length() (the CHAR count) together with
      // platform-charset bytes, truncating/corrupting any text with multi-byte
      // characters; downstream decoding is UTF-8 (trimAndDecodeUTF8(),
      // getStringBuilder()).
      byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
      tagText(bytes, 0, bytes.length, node);
    }
    break;
  case Node.ELEMENT_NODE:
    // equalsIgnoreCase avoids the default-locale pitfalls of toLowerCase().equals().
    String name = node.getNodeName();
    if (!"script".equalsIgnoreCase(name) && !"style".equalsIgnoreCase(name))
    {
      printTag(node);
      tagChildren(node);
      printEndTag(node);
    }
    break;
  case Node.DOCUMENT_NODE:
    tagChildren(node);
    break;
  default:
    // Other node types (e.g. comments) are treated like generic elements,
    // preserving the original behavior.
    printTag(node);
    tagChildren(node);
    printEndTag(node);
  }
}

/** Recurses into each child of node, in document order. */
private void tagChildren(Node node)
{
  NodeList children = node.getChildNodes();
  for (int i = 0; i < children.getLength(); i++)
    tagTree(children.item(i));
}
/**
 * Processes an opening tag during the walk: collects informative images, re-bases
 * the page URL on &lt;base href&gt;, and — when a parser interface is attached —
 * reports titles, collects links (up to MAX_LINKS_PER_PAGE), and tracks
 * bold/italic state.
 *
 * @param node the node whose start tag is being visited
 */
public void printTag(Node node)
{
  String tagName = node.getNodeName().toLowerCase();
  if (tagName.equals("img"))
  {
    ImgElement imgElement = new ImgElement(node, purl);
    // TODO confirm that we are happy only collecting images that seem informative
    if (imgElement.isInformativeImage())
      allImgNodes.add(imgElement);
  }
  else if (tagName.equals("base"))
  {
    // <base href> changes the URL against which subsequent relative links resolve.
    Node baseHrefAttr = node.getAttributes().getNamedItem("href");
    String baseHref = (baseHrefAttr == null) ? null : baseHrefAttr.getNodeValue();
    if (baseHref != null)
      purl = (purl == null) ? ParsedURL.getAbsolute(baseHref) : purl.getRelative(baseHref);
  }
  else if (parserInterface != null)
  {
    if (tagName.equals("title"))
    {
      parserInterface.setTitle(node);
    }
    else if (tagName.equals("a"))
    {
      if (allAnchorNodes.size() < MAX_LINKS_PER_PAGE)
      {
        AElement attrNode = new AElement(node, purl);
        allAnchorNodes.add(attrNode);
      }
      // This call is performed during the second parse while generating containers
      // and extracting metadata:
      // htmlType.newAHref(attributesMap);
    }
    // FIX: the <i>/<b> branches used to hang off the OUTER else-if chain, after the
    // (parserInterface != null) test — unreachable whenever an interface was attached,
    // and a guaranteed NullPointerException whenever it was not. They belong inside
    // the null-checked branch.
    else if (tagName.equals("i"))
    {
      parserInterface.setItalic(true);
    }
    else if (tagName.equals("b"))
    {
      parserInterface.setBold(true);
    }
  }
}
/**
 * Processes a closing tag: clears bold/italic state for &lt;/b&gt; / &lt;/i&gt;, and
 * closes the current paragraph when a block-level (or paragraph-delimiting) tag ends.
 * Does nothing when no parser interface is attached.
 *
 * @param node the node whose end tag is being visited
 */
protected void printEndTag(Node node)
{
  if (parserInterface == null)
    return;
  String tag = node.getNodeName().toLowerCase();
  if (tag.equals("i"))
    parserInterface.setItalic(false);
  else if (tag.equals("b"))
    parserInterface.setBold(false);
  // Tags that delimit a paragraph of text; the length-2 "h?" test matches headings.
  // TODO add more tags that we should define as starting of a new paragraph -- eunyee
  boolean delimitsParagraph =
      tag.equals("p") || tag.equals("br") || tag.equals("td") || tag.equals("div")
          || tag.equals("li") || tag.equals("a") || tag.equals("option")
          || (tag.length() == 2 && tag.charAt(0) == 'h');
  if (delimitsParagraph)
    closeBlock(node);
}
/**
 * Finishes the paragraph being accumulated: hands it to addCompletedPara() for
 * possible retention, then resets the per-block accumulation state.
 *
 * @param blockNode the block-level node whose end tag triggered the close
 */
private void closeBlock(Node blockNode)
{
  addCompletedPara(blockNode);
  totalTxtLength = 0;
  currentParagraphText = new ParagraphText();
}
/**
 * Accumulates one run of text into currentParagraphText, unless the parent element
 * carries an inline style attribute (such text is skipped). Leading and trailing
 * whitespace is trimmed, and the literal string "null" (emitted by broken templates)
 * is discarded.
 *
 * @param textarray raw text bytes (UTF-8, per the caller in tagTree())
 * @param start     inclusive start index into textarray
 * @param end       exclusive end index into textarray
 * @param node      the DOM text node the bytes came from
 */
protected void tagText(byte[] textarray, int start, int end, Node node)
{
  if (textarray == null || textarray.length == 0)
    return;
  // NOTE(review): assumes currentNode is a text node whose parent is an element;
  // getParentNode()/getAttributes() would NPE otherwise — confirm callers.
  if (currentNode.getParentNode().getAttributes().getNamedItem("style") != null)
    return;
  // Trim leading / trailing whitespace, always keeping at least one byte.
  while (Character.isWhitespace((char) textarray[start]) && (start < end - 1))
    start++;
  while (Character.isWhitespace((char) textarray[end - 1]) && (start < end - 1))
    end--;
  int length = end - start;
  // FIX: test the TRIMMED span for the literal "null"; the original indexed the
  // array at 0..3 instead of start..start+3, so text like "  null" (trimmed to
  // "null" with start > 0) slipped through the filter.
  boolean isLiteralNull = length == 4
      && textarray[start] == 'n' && textarray[start + 1] == 'u'
      && textarray[start + 2] == 'l' && textarray[start + 3] == 'l';
  if (length > 0 && !isLiteralNull)
  {
    currentParagraphText.append(textarray, start, end);
    totalTxtLength += length;
    currentParagraphText.setNode(node);
  }
}
/**
 * Associate an actual paragraph text with the current node if one wasn't already. If appropriate,
 * add the currentParagraphText to the paragraphTextsTMap. (For example, if there aren't too many
 * already or if this one looks longer than those collected.) Otherwise, recycle the
 * currentParagraphText.
 *
 * @param blockNode the block-level node whose end tag closed this paragraph
 */
protected void addCompletedPara(Node blockNode)
{
  Node node = currentNode;
  if (!currentParagraphText.hasText())
  {
    // Nothing accumulated directly; fall back to the longest text found in the subtree.
    StringBuilder longestTxtInSubTree = RecognizedDocumentStructure.getLongestTxtinSubTree(
        blockNode, null);
    if (longestTxtInSubTree != null)
    {
      if (longestTxtInSubTree.length() > PARA_TEXT_LENGTH_LIMIT)
      {
        currentParagraphText.setNode(blockNode);
        currentParagraphText.setBuffy(longestTxtInSubTree);
        node = blockNode;
      }
      else
        StringBuilderUtils.release(longestTxtInSubTree);
    }
  }
  if (!currentParagraphText.hasText())
  {
    currentParagraphText.recycle();
    return;
  }
  int length = currentParagraphText.length();
  /*
   * Only keeps 10 paragraph texts. Thus, if there is a new paragraph text coming in and the 10
   * slots have been already filled, we replace with the existing one based on the length of the
   * text.
   */
  // FIX: was `size() > 10`, which actually let the map settle at 11 entries.
  if (paragraphTextsTMap.size() >= 10)
  {
    Integer shortestKey = paragraphTextsTMap.firstKey();
    // FIX: compare and key by this paragraph's length. The original compared firstKey
    // against totalTxtLength and keyed the insert by totalTxtLength, while the normal
    // insert path below keys by length — and totalTxtLength is 0 when the text came
    // from getLongestTxtinSubTree(), so long fallback paragraphs could never displace
    // shorter ones.
    // TODO(review): this eviction path does not apply the underAHref/script/style
    // filters used below — preserved from the original; confirm intent.
    if (shortestKey.intValue() < length)
    {
      ParagraphText removed = paragraphTextsTMap.remove(shortestKey);
      removed.recycle();
      ParagraphText displaced = paragraphTextsTMap.put(length, currentParagraphText);
      // FIX: a same-length entry used to be silently overwritten and leaked un-recycled.
      if (displaced != null)
        displaced.recycle();
    }
    else
      currentParagraphText.recycle();
  }
  // We don't put the text into the paragraphTexts structure unless the text is over certain
  // length and not surrounded by <a>
  else if ((length > PARA_TEXT_LENGTH_LIMIT) && !underAHref(node)
      && node.getNodeType() != Node.COMMENT_NODE
      // FIX: original wrote (!script || style), which is TRUE for "style" nodes —
      // the intent is to exclude both script and style.
      && !node.getNodeName().toLowerCase().equals("script")
      && !node.getNodeName().toLowerCase().equals("style"))
  {
    // FIXME -- look out for duplicates introduced by getLongestTxtinSubTree() above
    ParagraphText displaced = paragraphTextsTMap.put(length, currentParagraphText);
    if (displaced != null)
      displaced.recycle();
  }
  else
    currentParagraphText.recycle();
}
/**
 * Reports whether the given node is (directly or one level removed) inside an
 * anchor element — such text is navigation, not article body.
 *
 * @param node the node to test; its parent chain is inspected
 * @return true if the parent or grandparent is an &lt;a&gt; element
 */
public boolean underAHref(Node node)
{
  Node parent = node.getParentNode();
  // FIX: null-guard the parent chain (the original NPE'd on nodes near the document
  // root) and compare case-insensitively, consistent with the lowercase tag-name
  // comparisons used throughout this class.
  if (parent == null)
    return false;
  if ("a".equalsIgnoreCase(parent.getNodeName()))
    return true;
  Node grandParent = parent.getParentNode();
  return grandParent != null && "a".equalsIgnoreCase(grandParent.getNodeName());
}
/**
 * Parses the leading integer out of an id of the form "start_end".
 * NOTE: throws (StringIndexOutOfBounds / NumberFormatException) if idValue
 * contains no '_' or a non-numeric prefix — callers must pass well-formed ids.
 *
 * @param idValue id string of the form "start_end"
 * @return the integer before the underscore
 */
int startID(String idValue)
{
  int separator = idValue.indexOf('_');
  return Integer.parseInt(idValue.substring(0, separator));
}
/**
 * Parses the trailing integer out of an id of the form "start_end".
 * NOTE: throws NumberFormatException if the suffix after the first '_' is not numeric.
 *
 * @param idValue id string of the form "start_end"
 * @return the integer after the underscore
 */
int endID(String idValue)
{
  int separator = idValue.indexOf('_');
  return Integer.parseInt(idValue.substring(separator + 1));
}
/**
 * Debug/training instrumentation: classifies the parent node's tag_id as inside
 * ("inform") or outside ("non_inform") the configured partitionID range, and appends
 * a CSV-style line "tag_id, wordSize, aWordSize, label" to fileOutputStream.
 *
 * NOTE(review): assumes the parent element has a "tag_id" attribute and that
 * setPartitionID()/setFileOutputStream() were called first — otherwise this NPEs;
 * confirm callers before hardening. Output bytes use the platform default charset.
 *
 * @param node      node whose parent's tag_id is classified
 * @param wordSize  word count reported in the output line
 * @param aWordSize anchor-word count reported in the output line
 */
void checkInPartitionID(Node node, int wordSize, int aWordSize)
{
NamedNodeMap attributes = node.getParentNode().getAttributes();
String nodeID = attributes.getNamedItem("tag_id").getNodeValue();
String data = "";
// Inside the partition iff the node's [start, end] id range lies within partitionID's.
if ((startID(nodeID) >= startID(partitionID)) && (endID(nodeID) <= endID(partitionID)))
data = nodeID + ", " + wordSize + ", " + aWordSize + ", " + "inform" + "\n";
else
data = nodeID + ", " + wordSize + ", " + aWordSize + ", " + "non_inform" + "\n";
try
{
fileOutputStream.write(data.getBytes());
}
catch (IOException e)
{
// Best-effort debug output: a write failure is logged, not propagated.
e.printStackTrace();
}
}
/**
 * Releases everything this tagger holds: recycles every retained paragraph, image,
 * and anchor element, then nulls the references so the structures can be garbage
 * collected. Synchronized so concurrent recyclers don't double-free.
 */
public synchronized void recycle()
{
  TreeMap<Integer, ParagraphText> paragraphs = paragraphTextsTMap;
  if (paragraphs != null)
  {
    for (ParagraphText paragraph : paragraphs.values())
      paragraph.recycle();
    paragraphs.clear();
    paragraphTextsTMap = null;
  }
  recycle(allImgNodes);
  recycle(allAnchorNodes);
  allImgNodes = null;
  allAnchorNodes = null;
  currentNode = null;
}
/**
 * Recycles every element of the given collection; a null collection is a no-op.
 *
 * @param nodeCollection elements to recycle, or null
 */
private static void recycle(Collection<? extends HTMLElementDOM> nodeCollection)
{
  if (nodeCollection == null)
    return;
  for (HTMLElementDOM element : nodeCollection)
    element.recycle();
}
/**
 * Convenience overload: collects text from node's subtree with no pre-existing
 * buffer, no newline separators, and image alt text included.
 *
 * @param node    subtree root
 * @param recurse whether to descend into child elements
 * @return the collected text, or null if none was found
 */
public static StringBuilder getTextInSubTree(Node node, boolean recurse)
{
  return getTextInSubTree(node, recurse, null, false, false);
}
/**
 * Collects the text under <code>node</code> into <code>result</code>, descending into
 * child elements when <code>recurse</code> is set but always skipping script/style
 * subtrees. Image alt text is appended too (unless <code>ignoreAltText</code> or the
 * alt looks bogus). The returned builder may differ from the one passed in: it is
 * lazily acquired on first text found, and stays null if nothing was collected.
 *
 * @param node           subtree root
 * @param recurse        whether to descend into child elements
 * @param result         builder to append into; may be null (lazily acquired)
 * @param appendNewline  append '\n' after each text node's contribution
 * @param ignoreAltText  skip img alt attributes
 * @return the (possibly newly acquired) builder, or null if no text was found
 */
// FIXME -- why is text in anchor nodes not included?
public static StringBuilder getTextInSubTree(Node node, boolean recurse, StringBuilder result, boolean appendNewline, boolean ignoreAltText)
{
NodeList children = node.getChildNodes();
for (int i = 0; i < children.getLength(); i++)
{
Node childNode = children.item(i);
// Recurse into element children, except script/style subtrees.
if ((recurse && childNode.hasChildNodes())
&& (!childNode.getNodeName().toLowerCase().equals("script")) && (!childNode.getNodeName().toLowerCase().equals("style")))
{
result = getTextInSubTree(childNode, true, result, appendNewline, ignoreAltText);
}
else if (childNode.getNodeType() == Node.TEXT_NODE)
{
// Speculatively append a separator space and remember the length; if the decode
// below contributes nothing (length unchanged), the space is rolled back.
int length = 0;
if (result != null)
{
result.append(' ');
length = result.length();
}
result = StringBuilderUtils.trimAndDecodeUTF8(result, childNode, 0, true);
if (result != null)
{
if (length == result.length())
result.setLength(length - 1);
else if (appendNewline)
result.append('\n');
}
// NOTE(review): if result was null coming in and trimAndDecodeUTF8 returned a
// non-null but EMPTY builder, length would be 0 and setLength(-1) would throw —
// presumably the helper returns null in that case; confirm in StringBuilderUtils.
}
// Images contribute their alt text to the collected caption text.
else if (!ignoreAltText && childNode.getNodeName().toLowerCase().equals("img"))
{
NamedNodeMap attributes = childNode.getAttributes();
Node altAtt = attributes.getNamedItem(ALT);
String alt = (altAtt != null) ? altAtt.getNodeValue() : null;
if (!ImageFeatures.altIsBogus(alt))
{
// Lazily acquire the builder on first contribution; otherwise separate with a space.
if (result == null)
result = StringBuilderUtils.acquire();
else
result.append(' ');
result.append(alt);
}
}
}
// Decode entity references (&amp; etc.) in place before returning.
if (result != null)
XMLTools.unescapeXML(result);
return result;
}
/**
 * Decodes a UTF-8 byte range into a fresh StringBuilder.
 *
 * @param bytes  source bytes (UTF-8 encoded)
 * @param offset start index into bytes
 * @param length number of bytes to decode
 * @return a new StringBuilder holding the decoded text
 */
public static StringBuilder getStringBuilder(byte[] bytes, int offset, int length)
{
  // StandardCharsets.UTF_8 is guaranteed present on every JVM, so the historical
  // "UTF8" string-name lookup — and its impossible UnsupportedEncodingException /
  // Error path — is no longer needed.
  return new StringBuilder(new String(bytes, offset, length, StandardCharsets.UTF_8));
}
/** @return the accumulated character count of text collected for the current block. */
public int getTotalTxtLength()
{
return totalTxtLength;
}
/** @return all informative images collected so far (live internal list). */
public ArrayList<ImgElement> getAllImgNodes()
{
return allImgNodes;
}
/** @return all anchors collected so far, capped at MAX_LINKS_PER_PAGE (live internal list). */
public ArrayList<AElement> getAllAnchorNodes()
{
return allAnchorNodes;
}
/** @return retained paragraphs keyed by text length (live internal map; null after recycle()). */
public TreeMap<Integer, ParagraphText> getParagraphTextsTMap()
{
return paragraphTextsTMap;
}
/** Sets the "start_end" partition range used by checkInPartitionID(). */
public void setPartitionID(String partitionID)
{
this.partitionID = partitionID;
}
/** Sets the destination stream for checkInPartitionID()'s debug output. */
public void setFileOutputStream(FileOutputStream fos)
{
this.fileOutputStream = fos;
}
}