/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.util;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.Vector;
import java.util.zip.CRC32;
import java.util.zip.CheckedInputStream;
import java.util.zip.CheckedOutputStream;
import org.apache.hadoop.io.WritableUtils;
import org.commoncrawl.util.BitUtils.BitStream;
import org.commoncrawl.util.BitUtils.BitStreamReader;
import org.commoncrawl.util.RiceCoding.RiceCodeReader;
import org.commoncrawl.util.Tuples.Pair;
import org.w3c.dom.Attr;
import org.w3c.dom.CDATASection;
import org.w3c.dom.Comment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import com.dappit.Dapper.parser.InstructionsPool;
import com.dappit.Dapper.parser.ParserInstruction;
import com.google.common.collect.ImmutableList;
/**
*
* @author rana
*
*/
public class HTMLDomUtils {
static class TextTokenData {
public TextTokenData(String tokenValue) {
_token = tokenValue;
}
public String _token = null; // the actual token value ...
public int _tokenId = -1; // token id assoicated with this token
public int _tokenInstanceCount = 0; // number of times token appears in document ...
}
static class HTMLTagSupport {
ImmutableList<TextTokenData> _htmlTagList;
ImmutableList<TextTokenData> _htmlAttributeList;
Map<String,TextTokenData> _htmlTagMap;
Map<String,TextTokenData> _htmlAttributeTagMap;
void initializeHTMLTagMaps() {
ImmutableList.Builder<TextTokenData> builder = new ImmutableList.Builder<TextTokenData>();
builder.add(new TextTokenData("href"));
builder.add(new TextTokenData("class"));
builder.add(new TextTokenData("rel"));
builder.add(new TextTokenData("src"));
builder.add(new TextTokenData("id"));
builder.add(new TextTokenData("type"));
builder.add(new TextTokenData("title"));
builder.add(new TextTokenData("style"));
builder.add(new TextTokenData("width"));
builder.add(new TextTokenData("height"));
builder.add(new TextTokenData("alt"));
builder.add(new TextTokenData("abbr"));
builder.add(new TextTokenData("accept"));
builder.add(new TextTokenData("accesskey"));
builder.add(new TextTokenData("action"));
builder.add(new TextTokenData("add"));
builder.add(new TextTokenData("additional"));
builder.add(new TextTokenData("align"));
builder.add(new TextTokenData("alink"));
builder.add(new TextTokenData("archive"));
builder.add(new TextTokenData("autobuffer"));
builder.add(new TextTokenData("autocomplete"));
builder.add(new TextTokenData("autofocus"));
builder.add(new TextTokenData("autoplay"));
builder.add(new TextTokenData("autosave"));
builder.add(new TextTokenData("axis"));
builder.add(new TextTokenData("background"));
builder.add(new TextTokenData("behavior"));
builder.add(new TextTokenData("bgcolor"));
builder.add(new TextTokenData("bgproperties"));
builder.add(new TextTokenData("border"));
builder.add(new TextTokenData("bordercolor"));
builder.add(new TextTokenData("cellborder"));
builder.add(new TextTokenData("cellpadding"));
builder.add(new TextTokenData("cellspacing"));
builder.add(new TextTokenData("challenge"));
builder.add(new TextTokenData("char"));
builder.add(new TextTokenData("charoff"));
builder.add(new TextTokenData("charset"));
builder.add(new TextTokenData("checked"));
builder.add(new TextTokenData("cite"));
builder.add(new TextTokenData("classid"));
builder.add(new TextTokenData("clear"));
builder.add(new TextTokenData("code"));
builder.add(new TextTokenData("codebase"));
builder.add(new TextTokenData("codetype"));
builder.add(new TextTokenData("color"));
builder.add(new TextTokenData("cols"));
builder.add(new TextTokenData("colspan"));
builder.add(new TextTokenData("compact"));
builder.add(new TextTokenData("composite"));
builder.add(new TextTokenData("content"));
builder.add(new TextTokenData("contenteditable"));
builder.add(new TextTokenData("controls"));
builder.add(new TextTokenData("coords"));
builder.add(new TextTokenData("create"));
builder.add(new TextTokenData("curr"));
builder.add(new TextTokenData("data"));
builder.add(new TextTokenData("datetime"));
builder.add(new TextTokenData("declare"));
builder.add(new TextTokenData("defer"));
builder.add(new TextTokenData("dir"));
builder.add(new TextTokenData("direction"));
builder.add(new TextTokenData("disabled"));
builder.add(new TextTokenData("enctype"));
builder.add(new TextTokenData("equiv"));
builder.add(new TextTokenData("face"));
builder.add(new TextTokenData("for"));
builder.add(new TextTokenData("frame"));
builder.add(new TextTokenData("frameborder"));
builder.add(new TextTokenData("get"));
builder.add(new TextTokenData("has"));
builder.add(new TextTokenData("headers"));
builder.add(new TextTokenData("hidden"));
builder.add(new TextTokenData("hreflang"));
builder.add(new TextTokenData("hspace"));
builder.add(new TextTokenData("incremental"));
builder.add(new TextTokenData("insert"));
builder.add(new TextTokenData("ismap"));
builder.add(new TextTokenData("keytype"));
builder.add(new TextTokenData("label"));
builder.add(new TextTokenData("lang"));
builder.add(new TextTokenData("language"));
builder.add(new TextTokenData("leftmargin"));
builder.add(new TextTokenData("link"));
builder.add(new TextTokenData("longdesc"));
builder.add(new TextTokenData("loop"));
builder.add(new TextTokenData("lowsrc"));
builder.add(new TextTokenData("manifest"));
builder.add(new TextTokenData("mapped"));
builder.add(new TextTokenData("marginheight"));
builder.add(new TextTokenData("marginwidth"));
builder.add(new TextTokenData("max"));
builder.add(new TextTokenData("maxlength"));
builder.add(new TextTokenData("mayscript"));
builder.add(new TextTokenData("media"));
builder.add(new TextTokenData("method"));
builder.add(new TextTokenData("min"));
builder.add(new TextTokenData("multiple"));
builder.add(new TextTokenData("name"));
builder.add(new TextTokenData("nohref"));
builder.add(new TextTokenData("noresize"));
builder.add(new TextTokenData("noshade"));
builder.add(new TextTokenData("nowrap"));
builder.add(new TextTokenData("object"));
builder.add(new TextTokenData("onabort"));
builder.add(new TextTokenData("onbeforecopy"));
builder.add(new TextTokenData("onbeforecut"));
builder.add(new TextTokenData("onbeforepaste"));
builder.add(new TextTokenData("onbeforeunload"));
builder.add(new TextTokenData("onblur"));
builder.add(new TextTokenData("onchange"));
builder.add(new TextTokenData("onclick"));
builder.add(new TextTokenData("oncontextmenu"));
builder.add(new TextTokenData("oncopy"));
builder.add(new TextTokenData("oncut"));
builder.add(new TextTokenData("ondblclick"));
builder.add(new TextTokenData("ondrag"));
builder.add(new TextTokenData("ondragend"));
builder.add(new TextTokenData("ondragenter"));
builder.add(new TextTokenData("ondragleave"));
builder.add(new TextTokenData("ondragover"));
builder.add(new TextTokenData("ondragstart"));
builder.add(new TextTokenData("ondrop"));
builder.add(new TextTokenData("onerror"));
builder.add(new TextTokenData("onfocus"));
builder.add(new TextTokenData("oninput"));
builder.add(new TextTokenData("onkeydown"));
builder.add(new TextTokenData("onkeypress"));
builder.add(new TextTokenData("onkeyup"));
builder.add(new TextTokenData("onload"));
builder.add(new TextTokenData("onmousedown"));
builder.add(new TextTokenData("onmousemove"));
builder.add(new TextTokenData("onmouseout"));
builder.add(new TextTokenData("onmouseover"));
builder.add(new TextTokenData("onmouseup"));
builder.add(new TextTokenData("onmousewheel"));
builder.add(new TextTokenData("onpaste"));
builder.add(new TextTokenData("onreset"));
builder.add(new TextTokenData("onresize"));
builder.add(new TextTokenData("onscroll"));
builder.add(new TextTokenData("onsearch"));
builder.add(new TextTokenData("onselect"));
builder.add(new TextTokenData("onselectstart"));
builder.add(new TextTokenData("onstorage"));
builder.add(new TextTokenData("onsubmit"));
builder.add(new TextTokenData("onunload"));
builder.add(new TextTokenData("poster"));
builder.add(new TextTokenData("precision"));
builder.add(new TextTokenData("process"));
builder.add(new TextTokenData("profile"));
builder.add(new TextTokenData("prompt"));
builder.add(new TextTokenData("readonly"));
builder.add(new TextTokenData("remove"));
builder.add(new TextTokenData("results"));
builder.add(new TextTokenData("rev"));
builder.add(new TextTokenData("rows"));
builder.add(new TextTokenData("rowspan"));
builder.add(new TextTokenData("rules"));
builder.add(new TextTokenData("scheme"));
builder.add(new TextTokenData("scope"));
builder.add(new TextTokenData("scrollamount"));
builder.add(new TextTokenData("scrolldelay"));
builder.add(new TextTokenData("scrolling"));
builder.add(new TextTokenData("selected"));
builder.add(new TextTokenData("set"));
builder.add(new TextTokenData("shape"));
builder.add(new TextTokenData("size"));
builder.add(new TextTokenData("source"));
builder.add(new TextTokenData("span"));
builder.add(new TextTokenData("standby"));
builder.add(new TextTokenData("start"));
builder.add(new TextTokenData("summary"));
builder.add(new TextTokenData("tabindex"));
builder.add(new TextTokenData("tableborder"));
builder.add(new TextTokenData("target"));
builder.add(new TextTokenData("text"));
builder.add(new TextTokenData("topmargin"));
builder.add(new TextTokenData("truespeed"));
builder.add(new TextTokenData("usemap"));
builder.add(new TextTokenData("valign"));
builder.add(new TextTokenData("value"));
builder.add(new TextTokenData("valuetype"));
builder.add(new TextTokenData("version"));
builder.add(new TextTokenData("viewsource"));
builder.add(new TextTokenData("vlink"));
builder.add(new TextTokenData("vspace"));
builder.add(new TextTokenData("wrap"));
_htmlAttributeList = builder.build();
_htmlAttributeTagMap = new HashMap<String, TextTokenData>();
for (TextTokenData token : _htmlAttributeList) {
_htmlAttributeTagMap.put(token._token, token);
}
builder = new ImmutableList.Builder<TextTokenData>();
builder.add(new TextTokenData("div"));
builder.add(new TextTokenData("a"));
builder.add(new TextTokenData("li"));
builder.add(new TextTokenData("script"));
builder.add(new TextTokenData("p"));
builder.add(new TextTokenData("img"));
builder.add(new TextTokenData("ul"));
builder.add(new TextTokenData("br"));
builder.add(new TextTokenData("address"));
builder.add(new TextTokenData("abbr"));
builder.add(new TextTokenData("area"));
builder.add(new TextTokenData("article"));
builder.add(new TextTokenData("aside"));
builder.add(new TextTokenData("audio"));
builder.add(new TextTokenData("b"));
builder.add(new TextTokenData("base"));
builder.add(new TextTokenData("bb"));
builder.add(new TextTokenData("bdo"));
builder.add(new TextTokenData("blockquote"));
builder.add(new TextTokenData("body"));
builder.add(new TextTokenData("button"));
builder.add(new TextTokenData("canvas"));
builder.add(new TextTokenData("caption"));
builder.add(new TextTokenData("cite"));
builder.add(new TextTokenData("code"));
builder.add(new TextTokenData("col"));
builder.add(new TextTokenData("colgroup"));
builder.add(new TextTokenData("command"));
builder.add(new TextTokenData("datagrid"));
builder.add(new TextTokenData("datalist"));
builder.add(new TextTokenData("dd"));
builder.add(new TextTokenData("del"));
builder.add(new TextTokenData("details"));
builder.add(new TextTokenData("dialog"));
builder.add(new TextTokenData("dfn"));
builder.add(new TextTokenData("dl"));
builder.add(new TextTokenData("dt"));
builder.add(new TextTokenData("em"));
builder.add(new TextTokenData("embed"));
builder.add(new TextTokenData("eventsource"));
builder.add(new TextTokenData("fieldset"));
builder.add(new TextTokenData("figure"));
builder.add(new TextTokenData("footer"));
builder.add(new TextTokenData("form"));
builder.add(new TextTokenData("h1"));
builder.add(new TextTokenData("h2"));
builder.add(new TextTokenData("h3"));
builder.add(new TextTokenData("h4"));
builder.add(new TextTokenData("h5"));
builder.add(new TextTokenData("h6"));
builder.add(new TextTokenData("head"));
builder.add(new TextTokenData("header"));
builder.add(new TextTokenData("hr"));
builder.add(new TextTokenData("html"));
builder.add(new TextTokenData("i"));
builder.add(new TextTokenData("iframe"));
builder.add(new TextTokenData("input"));
builder.add(new TextTokenData("ins"));
builder.add(new TextTokenData("kbd"));
builder.add(new TextTokenData("label"));
builder.add(new TextTokenData("legend"));
builder.add(new TextTokenData("link"));
builder.add(new TextTokenData("mark"));
builder.add(new TextTokenData("map"));
builder.add(new TextTokenData("menu"));
builder.add(new TextTokenData("meta"));
builder.add(new TextTokenData("meter"));
builder.add(new TextTokenData("nav"));
builder.add(new TextTokenData("noscript"));
builder.add(new TextTokenData("object"));
builder.add(new TextTokenData("ol"));
builder.add(new TextTokenData("optgroup"));
builder.add(new TextTokenData("option"));
builder.add(new TextTokenData("output"));
builder.add(new TextTokenData("param"));
builder.add(new TextTokenData("pre"));
builder.add(new TextTokenData("progress"));
builder.add(new TextTokenData("q"));
builder.add(new TextTokenData("ruby"));
builder.add(new TextTokenData("rp"));
builder.add(new TextTokenData("rt"));
builder.add(new TextTokenData("samp"));
builder.add(new TextTokenData("section"));
builder.add(new TextTokenData("select"));
builder.add(new TextTokenData("small"));
builder.add(new TextTokenData("source"));
builder.add(new TextTokenData("span"));
builder.add(new TextTokenData("strong"));
builder.add(new TextTokenData("style"));
builder.add(new TextTokenData("sub"));
builder.add(new TextTokenData("sup"));
builder.add(new TextTokenData("table"));
builder.add(new TextTokenData("tbody"));
builder.add(new TextTokenData("td"));
builder.add(new TextTokenData("textarea"));
builder.add(new TextTokenData("tfoot"));
builder.add(new TextTokenData("th"));
builder.add(new TextTokenData("thead"));
builder.add(new TextTokenData("time"));
builder.add(new TextTokenData("title"));
builder.add(new TextTokenData("tr"));
builder.add(new TextTokenData("var"));
builder.add(new TextTokenData("video"));
_htmlTagList = builder.build();
_htmlTagMap = new HashMap<String, TextTokenData>();
for (TextTokenData token : _htmlTagList) {
_htmlTagMap.put(token._token, token);
}
}
}
public static class HTMLEncodedDocumentReader extends HTMLTagSupport {
public static int MIN_SUPPORTED_VERSION = 1;
public static int MAX_SUPPORTED_VERSION = 1;
private ContentHandler _contentHandler;
private RiceCodeReader _textTokenLengthsCoder;
private RiceCodeReader _textTokenStreamCoder;
private RiceCodeReader _hrefTokenLengthsCoder;
private RiceCodeReader _hrefTokenStreamCoder;
private RiceCodeReader _attributesTokenStreamCoder;
private RiceCodeReader _attributesTokenLengthsCoder;
//private BitStreamReader _elementTokenStream;
private ByteArrayInputStream _elementInputStream;
private DataInputStream _elementDataStream;
private BitStreamReader _htmlTagMaskStream;
private BitStreamReader _htmlAttributeMaskStream;
private Map<Integer,String> _htmlTokenToTagMap;
private Map<Integer,String> _htmlAttributeTokenToTagMap;
private Map<Integer,String> _textTokenMap;
private Map<Integer,String> _hrefTokenMap;
private Map<Integer,String> _attributesTokenMap;
/**
* initialize a compressed document reader
*
* @param contentHandler - a SAX1 Content Handler Interface
*/
public HTMLEncodedDocumentReader(ContentHandler contentHandler) {
_contentHandler = contentHandler;
}
/**
* Read an encoded html document. Generate SAX events via content handler as appropriate
* @param inputStream
* @throws IOException
*/
public void readDocument(InputStream inputStream) throws IOException {
// clear state ...
resetState();
long timeStart = System.currentTimeMillis();
//
DataInputStream fileDataInputStream = new DataInputStream(inputStream);
// read version
int version = inputStream.read();
if (version < MIN_SUPPORTED_VERSION || version > MAX_SUPPORTED_VERSION) {
throw new IOException("Unsupported File Version!");
}
// read crc
long crcValue = WritableUtils.readVLong(fileDataInputStream);
// and buffer length ...
int bufferLength = (int) WritableUtils.readVLong(fileDataInputStream);
System.out.println("CRCValue is:" + crcValue + " DataBufferLen:" + bufferLength);
// allocate buffer
byte[] dataBuffer = new byte[bufferLength];
// read it in
CRC32 crcCalculator = new CRC32();
CheckedInputStream crcInputStream = new CheckedInputStream(inputStream,crcCalculator);
if (crcInputStream.read(dataBuffer) != dataBuffer.length) {
throw new IOException("Underflow Error!");
}
// validate crc
if (crcCalculator.getValue() != crcValue) {
System.out.println("StreamCRC:" + crcValue + " CalculatedCRC:" + crcCalculator.getValue());
throw new IOException("CRC Mismatch Detected!");
}
// ok we move further
ByteArrayInputStream byteStream = new ByteArrayInputStream(dataBuffer);
DataInputStream byteDataStream = new DataInputStream(byteStream);
// read mask stream sizes
int htmlTagsMaskBits = WritableUtils.readVInt(byteDataStream);
int attributeTagMaskBits = WritableUtils.readVInt(byteDataStream);
// allocate buffers
byte[] htmlTagMaskBuffer = new byte[(htmlTagsMaskBits + 7)/8];
byte[] attributesMaskBuffer = new byte[(attributeTagMaskBits + 7)/8];
// read buffers
byteStream.read(htmlTagMaskBuffer);
byteStream.read(attributesMaskBuffer);
System.out.println("Initilize Tag Maps");
// initialize mask streams
_htmlTagMaskStream = new BitUtils.BitStreamReader(new BitStream(htmlTagMaskBuffer,htmlTagsMaskBits));
_htmlAttributeMaskStream = new BitUtils.BitStreamReader(new BitStream(attributesMaskBuffer,attributeTagMaskBits));
System.out.println("Assigning Token Ids");
// assign html tag ids
assignHTMLTokenIds();
// ok read in text buffer lengths
int textDataBufferLen = WritableUtils.readVInt(byteDataStream);
int hrefDataBufferLen = WritableUtils.readVInt(byteDataStream);
int attributeDataBufferLen = WritableUtils.readVInt(byteDataStream);
int compressedBufferLen = WritableUtils.readVInt(byteDataStream);
// allocate buffer
byte [] compressedTextBuffer = new byte[compressedBufferLen];
System.out.println("Reading Compressed Text Buffer");
// read
byteStream.read(compressedTextBuffer);
// uncompress
byte[] uncompressedBuffer = GZIPUtils.unzip(compressedTextBuffer);
// ok, we have to read the token length streams to figure out how to assign token ids to them
_textTokenLengthsCoder = initializeRiceCodeReader(byteDataStream);
_hrefTokenLengthsCoder = initializeRiceCodeReader(byteDataStream);
_attributesTokenLengthsCoder = initializeRiceCodeReader(byteDataStream);
// build token maps
System.out.println("Reading Text Tokens");
_textTokenMap = buildTokenIdToTextMap(uncompressedBuffer,0,textDataBufferLen,_textTokenLengthsCoder);
System.out.println("Reading HREF Text Tokens");
_hrefTokenMap = buildTokenIdToTextMap(uncompressedBuffer,textDataBufferLen,hrefDataBufferLen,_hrefTokenLengthsCoder);
System.out.println("Reading Attribute Text Tokens");
_attributesTokenMap = buildTokenIdToTextMap(uncompressedBuffer,textDataBufferLen+hrefDataBufferLen,attributeDataBufferLen,_attributesTokenLengthsCoder);
// and initialize token stream readers
System.out.println("Initializing Token Stream Readers");
_textTokenStreamCoder = initializeRiceCodeReader(byteDataStream);
_hrefTokenStreamCoder = initializeRiceCodeReader(byteDataStream);
_attributesTokenStreamCoder = initializeRiceCodeReader(byteDataStream);
// read element stream data
int elementStreamComprssedSize = WritableUtils.readVInt(byteDataStream);
// int elementStreamNBits = WritableUtils.readVInt(byteDataStream);
byte [] compressedElementStream = new byte[elementStreamComprssedSize];
System.out.println("Reading Compressed Element Stream - Size:" + elementStreamComprssedSize);
byteStream.read(compressedElementStream);
// decompress ...
byte[] decompElementStream = GZIPUtils.unzip(compressedElementStream);
System.out.println("Initializing Element Stream Reader-Bytes In:" + decompElementStream.length);
// allocate a reader
// _elementTokenStream = new BitUtils.BitStreamReader(new BitStream(decompElementStream,elementStreamNBits));
_elementInputStream = new ByteArrayInputStream(decompElementStream);
_elementDataStream = new DataInputStream(_elementInputStream);
long timeEnd = System.currentTimeMillis();
// ok read to roll ...
System.out.println("Ready to Decode Document - Setup Took:" + (timeEnd-timeStart));
timeStart = System.currentTimeMillis();
runDecodeLoop();
timeEnd = System.currentTimeMillis();
System.out.println("Decode Loop Took:" + (timeEnd-timeStart));
}
public String getHTMLAttributeToken(String tokenName) {
for (String token : _htmlAttributeTokenToTagMap.values()) {
if (token.equals(tokenName)) {
return token;
}
}
return null;
}
private void runDecodeLoop() throws IOException{
List<String> elementStack = new ArrayList<String>();
try {
_contentHandler.startDocument();
int availableBytes = _elementInputStream.available();
System.out.println("Available Bytes at ElementInputStream:" + availableBytes);
while (_elementInputStream.available() != 0) {
int elementToken = nextElementStreamInt();
switch (elementToken) {
case HTMLDocumentEncoder.ELEMENT_START_WO_ATTRIBUTES:
case HTMLDocumentEncoder.CUSTOM_ELEMENT_START_WO_ATTRIBUTES:
case HTMLDocumentEncoder.ELEMENT_START_W_ATTRIBUTES:
case HTMLDocumentEncoder.CUSTOM_ELEMENT_START_W_ATTRIBUTES: {
AttributesImpl attributes = new AttributesImpl();
String qName = null;
if (elementToken == HTMLDocumentEncoder.CUSTOM_ELEMENT_START_WO_ATTRIBUTES || elementToken == HTMLDocumentEncoder.CUSTOM_ELEMENT_START_W_ATTRIBUTES) {
qName = _textTokenMap.get((int)_textTokenStreamCoder.nextValue());
}
else {
qName = _htmlTokenToTagMap.get((int)nextElementStreamInt());
}
int attributeCount = 0;
if (elementToken == HTMLDocumentEncoder.ELEMENT_START_W_ATTRIBUTES || elementToken == HTMLDocumentEncoder.CUSTOM_ELEMENT_START_W_ATTRIBUTES) {
attributeCount = nextElementStreamInt();
}
for (int i=0;i<attributeCount;++i) {
String attrQName = null;
String attrValue = "";
int attributeToken = nextElementStreamInt();
if (attributeToken == HTMLDocumentEncoder.CUSTOM_ATTRIBUTE_START_WO_VALUES || attributeToken == HTMLDocumentEncoder.CUSTOM_ATTRIBUTE_START_W_VALUES) {
attrQName = _textTokenMap.get((int)_textTokenStreamCoder.nextValue());
// System.out.println("*** CUSTOM ATTRIBUTE:" + attrQName);
}
else {
int tokenId = nextElementStreamInt();
attrQName = _htmlAttributeTokenToTagMap.get((int)tokenId);
// System.out.println("*** TokenId:" + tokenId + " Resolved to:" + attrQName);
}
// ok now process values ...
if (attributeToken == HTMLDocumentEncoder.ATTRIBUTE_START_W_VALUES || attributeToken == HTMLDocumentEncoder.CUSTOM_ATTRIBUTE_START_W_VALUES) {
if (attrQName.equals("href") || attrQName.equals("src") || attrQName.equals("url")) {
attrValue = _hrefTokenMap.get((int)_hrefTokenStreamCoder.nextValue());
}
else {
attrValue = _attributesTokenMap.get((int)_attributesTokenStreamCoder.nextValue());
}
}
attributes.addAttribute("", "", attrQName,"" , attrValue);
}
// System.out.println("Calling contentHandler startElement with qName:" + qName);
_contentHandler.startElement("", "", qName, attributes);
elementStack.add(qName);
}
break;
case HTMLDocumentEncoder.ELEMENT_END: {
_contentHandler.endElement(null, null, elementStack.remove(elementStack.size()-1));
}
break;
case HTMLDocumentEncoder.TEXT_NODE: {
String text = _textTokenMap.get((int)_textTokenStreamCoder.nextValue());
_contentHandler.characters(text.toCharArray(), 0, text.length());
}
break;
case HTMLDocumentEncoder.COMMENT_NODE: {
throw new IOException("Unexpected Comment Node!");
}
case HTMLDocumentEncoder.CDATA_NODE: {
throw new IOException("Unexpected CData Node!");
}
}
}
_contentHandler.endDocument();
}
catch (SAXException e) {
e.printStackTrace();
throw new IOException(e);
}
}
private Map<Integer,String> buildTokenIdToTextMap(byte[] dataBuffer,int offset,int length,RiceCodeReader tokenLengthsReader) {
Map<Integer,String> mapOut = new HashMap<Integer,String>();
CharBuffer buffer = Charset.forName("UTF8").decode(ByteBuffer.wrap(dataBuffer,offset,length));
int currentLength = 0;
int currentOffset = 0;
int nextTokenId = 1;
while (tokenLengthsReader.hasNext()) {
// read the next length (delta)
currentLength += tokenLengthsReader.nextValue();
// subtract 1 (rice coding limitation)
currentLength -= 1;
// create a subsequence ...
mapOut.put(nextTokenId, buffer.subSequence(currentOffset,currentOffset+currentLength).toString());
// increment offset
currentOffset += currentLength;
// increment token id
nextTokenId++;
}
return mapOut;
}
private RiceCodeReader initializeRiceCodeReader(DataInputStream stream) throws IOException {
int mValue = stream.read();
int numbits = WritableUtils.readVInt(stream);
byte[] dataArray = new byte[(numbits + 7) / 8];
stream.read(dataArray);
return new RiceCodeReader(mValue,numbits,dataArray,0);
}
private void resetState() {
_textTokenLengthsCoder = null;
_textTokenStreamCoder = null;
_hrefTokenLengthsCoder = null;
_hrefTokenStreamCoder = null;
_attributesTokenStreamCoder = null;
_attributesTokenLengthsCoder = null;
// _elementTokenStream = null;
_elementInputStream = null;
_elementDataStream = null;
_htmlTagMaskStream = null;
_htmlAttributeMaskStream = null;
_textTokenMap = null;
_hrefTokenMap = null;
_attributesTokenMap = null;
_htmlTokenToTagMap = new HashMap<Integer,String>();
_htmlAttributeTokenToTagMap = new HashMap<Integer,String>();
initializeHTMLTagMaps();
}
private void assignHTMLTokenIds() {
assignTokenIdsToSet(_htmlTagList,_htmlTagMaskStream,_htmlTokenToTagMap);
assignTokenIdsToSet(_htmlAttributeList,_htmlAttributeMaskStream,_htmlAttributeTokenToTagMap);
}
private void assignTokenIdsToSet(ImmutableList<TextTokenData> tokenSet,BitStreamReader maskStream,Map<Integer,String> tokenToTagMap) {
int nextTokenId=0;
// iterate token set in proper order
for (TextTokenData token : tokenSet) {
// get next mask value in bit strem
int bitValue = maskStream.getbit();
// if 'on', then assign a token id to this tag
if (bitValue == 1) {
token._tokenId = nextTokenId++;
tokenToTagMap.put(token._tokenId,token._token);
//System.out.println("Assigned TokenId:" + token._tokenId + " to Token:" + token._token);
}
}
}
private int nextElementStreamInt()throws IOException {
/*
int nibbleNumber = 0;
int bitNumber = 0;
int result = 0;
boolean done = false;
while (!done) {
int bitValue = _elementTokenStream.getbit();
// if last bit in the nibble
if (bitNumber == 3) {
if (bitValue == 0) {
done = true;
}
else {
bitNumber = 0;
nibbleNumber++;
}
}
else {
if (bitValue != 0) {
result |= 1 << (bitNumber + (nibbleNumber * 3));
}
bitNumber++;
}
}
return result;
*/
return WritableUtils.readVInt(_elementDataStream);
}
}
public static class HTMLDocumentEncoder extends HTMLTagSupport {
public static final int ENCODER_VERSION = 1;
public static final int ELEMENT_START_WO_ATTRIBUTES = 0;
public static final int ELEMENT_START_W_ATTRIBUTES = 1;
public static final int ATTRIBUTE_START_WO_VALUES = 2;
public static final int ATTRIBUTE_START_W_VALUES = 3;
public static final int ELEMENT_END = 4;
public static final int TEXT_NODE = 5;
public static final int COMMENT_NODE = 6;
public static final int CDATA_NODE = 7;
public static final int CUSTOM_ATTRIBUTE_START_WO_VALUES = 8;
public static final int CUSTOM_ATTRIBUTE_START_W_VALUES = 9;
public static final int CUSTOM_ELEMENT_START_WO_ATTRIBUTES = 10;
public static final int CUSTOM_ELEMENT_START_W_ATTRIBUTES = 11;
TreeMap<String,TextTokenData> _textTokenMap = new TreeMap<String,TextTokenData>();
Vector<TextTokenData> _sortedTextTokenArray = new Vector<TextTokenData>();
TextTokenIdData _textTokenIdData;
TreeMap<String,TextTokenData> _hrefTokenMap = new TreeMap<String,TextTokenData>();
Vector<TextTokenData> _sortedHRefTokenArray = new Vector<TextTokenData>();
TextTokenIdData _hrefTokenIdData;
TreeMap<String,TextTokenData> _attributeTokenMap = new TreeMap<String,TextTokenData>();
Vector<TextTokenData> _sortedAttributeTokenArray = new Vector<TextTokenData>();
TextTokenIdData _attributeTokenIdData;
double _averageElementTagId;
int _totalElementTags;
int _elementStreamMValue;
public RiceCoding _textTokenLengthsCoder;
public RiceCoding _textTokenStreamCoder;
public RiceCoding _hrefTokenLengthsCoder;
public RiceCoding _hrefTokenStreamCoder;
public RiceCoding _attributesTokenStreamCoder;
public RiceCoding _attributesTokenLengthsCoder;
public ByteArrayOutputStream _elementTokenStream;
public DataOutputStream _elementDataStream;
public BitStream _htmlTagMaskStream;
public BitStream _htmlAttributeMaskStream;
public int _encodedTextTokensCount = 0;
public int _encodedHREFTokensCount = 0;
public int _encodedAttributeTokensCount = 0;
public int _encodedElementTokensCount = 0;
public int _encodedIntegersCount = 0;
public HTMLDocumentEncoder() {
}
private void initializeEncoder() {
_textTokenMap = new TreeMap<String,TextTokenData>();
_sortedTextTokenArray = new Vector<TextTokenData>();
_textTokenIdData = null;
_hrefTokenMap = new TreeMap<String,TextTokenData>();
_sortedHRefTokenArray = new Vector<TextTokenData>();
_hrefTokenIdData = null;
_attributeTokenMap = new TreeMap<String,TextTokenData>();
_sortedAttributeTokenArray = new Vector<TextTokenData>();
_attributeTokenIdData = null;
_htmlTagList = null;
_htmlAttributeList = null;
_htmlTagMap = null;
_htmlAttributeTagMap = null;
_textTokenLengthsCoder = null;
_textTokenStreamCoder= null;
_hrefTokenLengthsCoder= null;
_hrefTokenStreamCoder= null;
_attributesTokenStreamCoder= null;
_attributesTokenLengthsCoder= null;
// _elementTokenStream = new BitStream();
_elementTokenStream = new ByteArrayOutputStream();
_elementDataStream = new DataOutputStream(_elementTokenStream);
_htmlTagMaskStream = new BitStream();
_htmlAttributeMaskStream = new BitStream();
_encodedTextTokensCount = 0;
_encodedHREFTokensCount = 0;
_encodedAttributeTokensCount = 0;
_encodedElementTokensCount = 0;
_encodedIntegersCount = 0;
_averageElementTagId = 0.0;
_totalElementTags = 0;
_elementStreamMValue = 0;
}
public void encodeHTMLDocument(InstructionsPool instructions,OutputStream finalOutputStream)throws IOException {
preEncodeDocument();
collectTokens(instructions);
postEncodeDocument(instructions, finalOutputStream);
}
private void collectTokens(InstructionsPool instructionsPool)throws IOException {
List<Integer> operations = instructionsPool.operations;
List<String> arguments = instructionsPool.arguments;
for (int i=0; i<operations.size(); i++)
{
int domOperation = operations.get(i);
String domArgument = arguments.get(i);
//System.out.println("Operation :" + ParserInstruction.getOperationString(domOperation)+" Arg:~" + domArgument+"~");
switch (domOperation)
{
// Open node :
case ParserInstruction.AddLeaf:
case ParserInstruction.OpenNode: {
String tagName = domArgument.toLowerCase();
TextTokenData tokenData = _htmlTagMap.get(tagName);
// if tag is well know tag, increase its instance count ...
if (tokenData != null) {
tokenData._tokenInstanceCount++;
}
// otherwsie add the token as a text token
else {
addTextToken(tagName);
}
}break;
// Close node :
case ParserInstruction.CloseNode:
break;
case ParserInstruction.SetTitle:
case ParserInstruction.AddText: {
if (domOperation == ParserInstruction.SetTitle) {
_htmlTagMap.get("title")._tokenInstanceCount++;
}
// get the token's text data ...
String text = domArgument;
text = text.replaceAll("\\s{2,}"," ");
text = text.trim();
if (text.length() != 0) {
addTextToken(text);
}
}
break;
case ParserInstruction.AddContent:
break;
case ParserInstruction.WriteAttributeKey: {
String attributeName = domArgument.toLowerCase();
// if a pre-defined attribute name ... increment the associated token's instance count
TextTokenData tokenData = _htmlAttributeTagMap.get(attributeName);
if (tokenData != null) {
tokenData._tokenInstanceCount++;
}
// otherwsie add the token as a text token
else {
addTextToken(attributeName);
}
// advance to value
i++;
operations.get(i);
domArgument = arguments.get(i);
// always add value as a text token for now ...
if (domArgument.length() != 0) {
if (attributeName.equalsIgnoreCase("href") || attributeName.equals("src") || attributeName.equals("url")) {
addHREFTextToken(domArgument);
}
else {
addAttributeTextToken(domArgument);
}
}
}
break;
case ParserInstruction.CloseLeaf:
break;
case ParserInstruction.AddEntity:
break;
case ParserInstruction.AddComment:
break;
}
}
}
private void preEncodeDocument() throws IOException {
// init encoder
initializeEncoder();
// init html tags ...
initializeHTMLTagMaps();
}
private void postEncodeDocument(InstructionsPool instructionsPool,OutputStream finalOutputStream) throws IOException {
// assign ids ...
assignHTMLTokenIds();
_textTokenIdData = assignTextTokenIds(_textTokenMap,_sortedTextTokenArray);
_hrefTokenIdData = assignTextTokenIds(_hrefTokenMap,_sortedHRefTokenArray);
_attributeTokenIdData = assignTextTokenIds(_attributeTokenMap,_sortedAttributeTokenArray);
// create token stream encoder
_textTokenStreamCoder = new RiceCoding(_textTokenIdData._textTokenMValue);
// create text token length encoder
_textTokenLengthsCoder = new RiceCoding(_textTokenIdData._textTokenLenMValue);
// create token stream encoder
_hrefTokenStreamCoder = new RiceCoding(_hrefTokenIdData._textTokenMValue);
// create text token length encoder
_hrefTokenLengthsCoder = new RiceCoding(_hrefTokenIdData._textTokenLenMValue);
// create token stream encoder
_attributesTokenStreamCoder = new RiceCoding(_attributeTokenIdData._textTokenMValue);
// create text token length encoder
_attributesTokenLengthsCoder = new RiceCoding(_attributeTokenIdData._textTokenLenMValue);
// create text buffer
byte[] textData = encodeTextData(_sortedTextTokenArray,_textTokenLengthsCoder);
// create href buffer
byte[] hrefData = encodeTextData(_sortedHRefTokenArray,_hrefTokenLengthsCoder);
// create attributes buffer
byte[] attributesData = encodeTextData(_sortedAttributeTokenArray,_attributesTokenLengthsCoder);
// create combined buffer
byte[] combinedBuffer = new byte[textData.length + hrefData.length + attributesData.length];
System.arraycopy(textData, 0, combinedBuffer, 0, textData.length);
System.arraycopy(hrefData, 0, combinedBuffer, textData.length, hrefData.length);
System.arraycopy(attributesData, 0, combinedBuffer, hrefData.length + textData.length, attributesData.length);
// compress
byte[] compressedTextData = GZIPUtils.zip(textData);
byte[] compressedHRefData = GZIPUtils.zip(hrefData);
byte[] compressedAttributesData = GZIPUtils.zip(attributesData);
byte[] combinedCompressedData = GZIPUtils.zip(combinedBuffer);
// put out stats
System.out.println("Text Buffer Original Size:" + textData.length + " Compressed Size:" + compressedTextData.length);
System.out.println("HREF Buffer Original Size:" + hrefData.length + " Compressed Size:" + compressedHRefData.length);
System.out.println("Attributes Buffer Original Size:" + attributesData.length + " Compressed Size:" + compressedAttributesData.length);
System.out.println("Combined Buffer Original Size:" + combinedBuffer.length + " Compressed Size:" + combinedCompressedData.length);
// stats
System.out.println("Text Length Buffer:" + (_textTokenLengthsCoder.getNumBits() + 7)/8);
System.out.println("HREF Length Buffer:" + (_hrefTokenLengthsCoder.getNumBits() + 7)/8);
System.out.println("Attributes Length Buffer:" + (_attributesTokenLengthsCoder.getNumBits() + 7)/8);
// encode the dom
encodeDOM(instructionsPool);
// some stats ...
System.out.println("Text Token Stream Size:" + (_textTokenStreamCoder.getNumBits() + 7) / 8);
System.out.println("HREF Token Stream Size:" + (_hrefTokenStreamCoder.getNumBits() + 7) / 8);
System.out.println("Element Stream Size:" + _elementTokenStream.size());
System.out.println("Element Token Count:" + _encodedElementTokensCount);
System.out.println("Text Token Instance Count:" + _encodedTextTokensCount);
System.out.println("HREF Token Instance Count:" + _encodedHREFTokensCount);
System.out.println("Integer Token Instance Count:" + _encodedIntegersCount);
byte elementStreamBuffer[] = _elementTokenStream.toByteArray();
byte elementStreamBufferCompressed[] = GZIPUtils.zip(elementStreamBuffer);
System.out.println("Compression of Element Stream Buffer Yielded Buffer of Size:" + elementStreamBufferCompressed.length);
// allocate a 32 bit checksum object
CRC32 checksum = new CRC32();
// allocate a byte stream
ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
// checksum the bytes
CheckedOutputStream checkedOutputStream = new CheckedOutputStream(byteStream,checksum);
// and allocate a data output stream
DataOutputStream intermediateDataOutput = new DataOutputStream(checkedOutputStream);
// write out html,href and attribute tag mask streams
WritableUtils.writeVInt(intermediateDataOutput,_htmlTagMaskStream.nbits);
WritableUtils.writeVInt(intermediateDataOutput,_htmlAttributeMaskStream.nbits);
checkedOutputStream.write(_htmlTagMaskStream.bits,0,(_htmlTagMaskStream.nbits + 7) /8);
checkedOutputStream.write(_htmlAttributeMaskStream.bits,0,(_htmlAttributeMaskStream.nbits + 7) /8);
// write out the indivdual text buffer lengths ...
WritableUtils.writeVInt(intermediateDataOutput,textData.length);
WritableUtils.writeVInt(intermediateDataOutput,hrefData.length);
WritableUtils.writeVInt(intermediateDataOutput,attributesData.length);
// write out compressed buffer length
WritableUtils.writeVInt(intermediateDataOutput,combinedCompressedData.length);
// write out the consolidated, compressed text buffer
checkedOutputStream.write(combinedCompressedData);
// write out the lengths streams bitcount...
intermediateDataOutput.writeByte(_textTokenLengthsCoder.getMValue());
WritableUtils.writeVInt(intermediateDataOutput,_textTokenLengthsCoder.getNumBits());
checkedOutputStream.write(_textTokenLengthsCoder.getBits(),0,(_textTokenLengthsCoder.getNumBits() + 7) / 8);
intermediateDataOutput.writeByte(_hrefTokenLengthsCoder.getMValue());
WritableUtils.writeVInt(intermediateDataOutput,_hrefTokenLengthsCoder.getNumBits());
checkedOutputStream.write(_hrefTokenLengthsCoder.getBits(),0,(_hrefTokenLengthsCoder.getNumBits() + 7) / 8);
intermediateDataOutput.writeByte(_attributesTokenLengthsCoder.getMValue());
WritableUtils.writeVInt(intermediateDataOutput,_attributesTokenLengthsCoder.getNumBits());
checkedOutputStream.write(_attributesTokenLengthsCoder.getBits(),0,(_attributesTokenLengthsCoder.getNumBits() + 7) / 8);
// write out token streams lengths
intermediateDataOutput.writeByte(_textTokenStreamCoder.getMValue());
WritableUtils.writeVInt(intermediateDataOutput,_textTokenStreamCoder.getNumBits());
checkedOutputStream.write(_textTokenStreamCoder.getBits(), 0,(_textTokenStreamCoder.getNumBits() + 7) / 8);
intermediateDataOutput.writeByte(_hrefTokenStreamCoder.getMValue());
WritableUtils.writeVInt(intermediateDataOutput,_hrefTokenStreamCoder.getNumBits());
checkedOutputStream.write(_hrefTokenStreamCoder.getBits(),0,(_hrefTokenStreamCoder.getNumBits() + 7) / 8);
intermediateDataOutput.writeByte(_attributesTokenStreamCoder.getMValue());
WritableUtils.writeVInt(intermediateDataOutput,_attributesTokenStreamCoder.getNumBits());
checkedOutputStream.write(_attributesTokenStreamCoder.getBits(),0,(_attributesTokenStreamCoder.getNumBits() + 7) / 8);
// write out element stream ... compressed length, number of bits, and bits
WritableUtils.writeVInt(intermediateDataOutput,elementStreamBufferCompressed.length);
//WritableUtils.writeVInt(intermediateDataOutput,_elementTokenStream.nbits);
checkedOutputStream.write(elementStreamBufferCompressed,0,elementStreamBufferCompressed.length);
System.out.println("Total Compressed Buffer Size:" + byteStream.size());
// now write out final stream ..
DataOutputStream finalStreamDataOut = new DataOutputStream(finalOutputStream);
// write out version
finalStreamDataOut.writeByte(ENCODER_VERSION);
// write out crc
WritableUtils.writeVLong(finalStreamDataOut,checksum.getValue());
// get byte buffer
byte[] intermediateBuffer = byteStream.toByteArray();
// write size
WritableUtils.writeVLong(finalStreamDataOut,intermediateBuffer.length);
// then bits
finalOutputStream.write(intermediateBuffer);
}
private byte[] encodeTextData(Vector<TextTokenData> tokenArray,RiceCoding lengthsCoder) {
StringBuffer buffer = new StringBuffer();
int lastTokenLength = 0;
for (TextTokenData token : tokenArray) {
buffer.append(token._token);
if (lastTokenLength == 0) {
lengthsCoder.addItem(token._token.length() + 1);
}
else {
lengthsCoder.addItem((token._token.length() - lastTokenLength) + 1);
}
lastTokenLength = token._token.length();
}
return buffer.toString().getBytes(Charset.forName("UTF8"));
}
private static double lg(double value) {
return Math.log(value)/Math.log(2.0);
}
private static int bitsToEncode(int value) {
int bitsToEncode = 0;
while (value != 0) {
bitsToEncode++;
value >>= 1;
}
return bitsToEncode;
}
private void encodeInteger(int value, BitStream bitStream,int bitsPerWord) {
int bitsToEncode = bitsToEncode(value);
int bitsCounter = 0;
while (bitsToEncode != 0) {
bitStream.addbit((value & 0x1) != 0 ? 1 : 0);
value >>= 1;
bitsCounter++;
bitsToEncode--;
if (bitsCounter == bitsPerWord) {
bitStream.addbit((bitsToEncode != 0) ? 1: 0);
bitsCounter = 0;
}
}
if (bitsCounter > 0) {
while (bitsCounter++ <= bitsPerWord) {
bitStream.addbit(0);
}
}
}
private void encodeIntegerIntoTokenStream(int integer)throws IOException {
_encodedIntegersCount++;
// encodeInteger(integer,_elementTokenStream,3);
// System.out.println("encoding Integer into token stream:" + integer);
WritableUtils.writeVInt(_elementDataStream, integer);
}
private void encodeElementToken(int tokenId) throws IOException {
_encodedElementTokensCount++;
encodeIntegerIntoTokenStream(tokenId);
}
private void encodeTextTokenId(int tokenId) {
_encodedTextTokensCount++;
_textTokenStreamCoder.addItem(tokenId);
}
private void encodeHRefTokenId(int tokenId) {
_encodedHREFTokensCount++;
_hrefTokenStreamCoder.addItem(tokenId);
}
private void encodeAttributeTokenId(int tokenId) {
_encodedAttributeTokensCount++;
_attributesTokenStreamCoder.addItem(tokenId);
}
private void assignHTMLTokenIds() {
assignTokenIdsToSet(_htmlTagList,_htmlTagMaskStream);
assignTokenIdsToSet(_htmlAttributeList,_htmlAttributeMaskStream);
// recompute average
_averageElementTagId /= (double)_totalElementTags;
_elementStreamMValue = (int) Math.max(0,Math.floor(lg(_averageElementTagId)));
System.out.println("Average Element Tag Id:" + _averageElementTagId + " MValue:" + _elementStreamMValue);
}
private void assignTokenIdsToSet(ImmutableList<TextTokenData> tokenSet,BitStream maskStream) {
int nextTokenId=0;
for (TextTokenData token : tokenSet) {
if (token._tokenInstanceCount != 0) {
token._tokenId = nextTokenId++;
_averageElementTagId += token._tokenId * token._tokenInstanceCount;
_totalElementTags += token._tokenInstanceCount;
maskStream.addbit(1);
// System.out.println("Token:" + token._token + " Id:" + token._tokenId);
}
else {
maskStream.addbit(0);
}
}
// sort set by instance count ...
Vector<TextTokenData> instanceCountSet = new Vector<TextTokenData>();
instanceCountSet.addAll(tokenSet);
Collections.sort(instanceCountSet, new Comparator<TextTokenData>() {
@Override
public int compare(TextTokenData o1, TextTokenData o2) {
return o1._tokenInstanceCount - o2._tokenInstanceCount;
}
});
/*
System.out.println("** tokens by freq **");
for (TextTokenData token : Iterables.reverse(instanceCountSet)) {
if (token._tokenInstanceCount != 0)
System.out.println("Token:" + token._token + " Hits:" + token._tokenInstanceCount);
}
*/
}
private static class TextTokenIdData {
public double _averageTokenId;
public double _averageTokenLengthDelta;
public int _textTokenMValue;
public int _textTokenLenMValue;
}
private TextTokenIdData assignTextTokenIds(TreeMap<String,TextTokenData> tokenMap,Vector<TextTokenData> sortedTokenArray) {
double tokenIdValueTotal = 0.0;
double tokenLengthDeltaTotal = 0.0;
int totalTokenInstanceCount = 0;
// add tokens to list
sortedTokenArray.addAll(tokenMap.values());
// sort them by token lengths...
Collections.sort(sortedTokenArray,new Comparator<TextTokenData>() {
@Override
public int compare(TextTokenData o1, TextTokenData o2) {
return o1._token.length() - o2._token.length();
}
});
// assign token ids
int nextTokenId = 1; // start with a token id of one ...
int lastTokenLength = 0;
int deltaCount = 0;
for (TextTokenData token : sortedTokenArray) {
token._tokenId = nextTokenId++;
tokenIdValueTotal += token._tokenId * token._tokenInstanceCount;
if (lastTokenLength != 0) {
int delta = (token._token.length() - lastTokenLength);
tokenLengthDeltaTotal += delta;
deltaCount++;
}
lastTokenLength = token._token.length();
totalTokenInstanceCount += token._tokenInstanceCount;
//System.out.println("Assigned TokenId:" + token._tokenId + " to Token:" + token._token);
}
TextTokenIdData tokenIdData = new TextTokenIdData();
tokenIdData._averageTokenId = tokenIdValueTotal / (double)totalTokenInstanceCount;
tokenIdData._averageTokenLengthDelta = tokenLengthDeltaTotal / (double)deltaCount;
tokenIdData._textTokenMValue = (int) Math.max(0,Math.floor(lg(tokenIdData._averageTokenId)));
tokenIdData._textTokenLenMValue = (int) Math.max(0,Math.floor(lg(tokenIdData._averageTokenLengthDelta)));
System.out.println("Average Token Id is:" + tokenIdData._averageTokenId + "m for RiceCoding Purposes is:" + tokenIdData._textTokenMValue);
System.out.println("Average Token Length Delta is:" + tokenIdData._averageTokenLengthDelta+ "m for RiceCoding Purposes is:" + tokenIdData._textTokenLenMValue);
return tokenIdData;
}
static class NodeInConstruction {
String nodeName;
ArrayList<Pair<String,String>> attributes = null;
NodeInConstruction(String nodeName) {
this.nodeName = nodeName;
}
void addAttribute(String attributeName,String attributeValue) {
if (attributes == null)
attributes = new ArrayList<Tuples.Pair<String,String>>();
attributes.add(new Pair<String, String>(attributeName,attributeValue));
}
}
private void encodeAttributeInNodeInConstruction(Pair<String,String> attribute) throws IOException {
// get the attribute name ...
String attributeName = attribute.e0;
// check to see if this is a known attribute
TextTokenData tokenData = _htmlAttributeTagMap.get(attributeName);
// if custom attribute name
if (tokenData == null) {
encodeElementToken((attribute.e1 == null || attribute.e1.length() ==0) ? CUSTOM_ATTRIBUTE_START_WO_VALUES : CUSTOM_ATTRIBUTE_START_W_VALUES);
encodeTextTokenId(_textTokenMap.get(attributeName)._tokenId);
}
// normal case .. known attribute name ...
else {
if (tokenData._tokenId == -1) {
throw new RuntimeException("UnNumbered Token:" + tokenData._token + " Encountered!");
}
encodeElementToken((attribute.e1 == null || attribute.e1.length() ==0) ? ATTRIBUTE_START_WO_VALUES : ATTRIBUTE_START_W_VALUES);
encodeIntegerIntoTokenStream(tokenData._tokenId);
}
if (attribute.e1 != null && attribute.e1.length() !=0) {
if (attributeName.equals("href") || attributeName.equals("src") || attributeName.equals("url")) {
encodeHRefTokenId(_hrefTokenMap.get(attribute.e1)._tokenId);
}
else {
encodeAttributeTokenId(_attributeTokenMap.get(attribute.e1)._tokenId);
}
}
}
private void processNodeInConstruction(NodeInConstruction nodeInConstruction) throws IOException {
// get the element name
String elementName = nodeInConstruction.nodeName;
// lookit up in the html tag map
TextTokenData tokenData = _htmlTagMap.get(elementName);
// if not present, this is a custom element
if (tokenData == null) {
encodeElementToken((nodeInConstruction.attributes != null && nodeInConstruction.attributes.size() != 0) ? CUSTOM_ELEMENT_START_W_ATTRIBUTES : CUSTOM_ELEMENT_START_WO_ATTRIBUTES);
encodeTextTokenId(_textTokenMap.get(elementName)._tokenId);
}
// normal case ... known element
else {
if (tokenData._tokenId == -1) {
throw new RuntimeException("UnNumbered Token:" + tokenData._token + " Encountered!");
}
encodeElementToken((nodeInConstruction.attributes != null && nodeInConstruction.attributes.size() != 0) ? ELEMENT_START_W_ATTRIBUTES : ELEMENT_START_WO_ATTRIBUTES);
encodeIntegerIntoTokenStream(tokenData._tokenId);
}
// encode attributes ...
if (nodeInConstruction.attributes != null && nodeInConstruction.attributes.size() != 0) {
encodeIntegerIntoTokenStream(nodeInConstruction.attributes.size());
for (int i = 0; i < nodeInConstruction.attributes.size(); i++) {
encodeAttributeInNodeInConstruction(nodeInConstruction.attributes.get(i));
}
}
}
private void encodeDOM(InstructionsPool instructionsPool) throws IOException {
List<Integer> operations = instructionsPool.operations;
List<String> arguments = instructionsPool.arguments;
NodeInConstruction nodeInConstruction = null;
for (int i=0; i<operations.size(); i++)
{
int domOperation = operations.get(i);
String domArgument = arguments.get(i);
//System.out.println("Operation :" + ParserInstruction.getOperationString(domOperation)+" Arg:~" + domArgument+"~");
switch (domOperation)
{
// Open node :
case ParserInstruction.OpenNode:
case ParserInstruction.AddLeaf: {
if (nodeInConstruction != null) {
processNodeInConstruction(nodeInConstruction);
nodeInConstruction = null;
}
nodeInConstruction = new NodeInConstruction(domArgument.toLowerCase());
}break;
// Close node :
case ParserInstruction.CloseNode:
case ParserInstruction.CloseLeaf: {
if (nodeInConstruction != null) {
processNodeInConstruction(nodeInConstruction);
nodeInConstruction = null;
}
encodeElementToken(ELEMENT_END);
}
break;
case ParserInstruction.AddText: {
if (nodeInConstruction != null) {
processNodeInConstruction(nodeInConstruction);
nodeInConstruction = null;
}
String text = domArgument;
text = text.replaceAll("\\s{2,}"," ");
text = text.trim();
if (text.length() != 0) {
encodeElementToken(TEXT_NODE);
encodeTextTokenId(_textTokenMap.get(text)._tokenId);
}
System.out.println("AddText:"+domArgument);
}
break;
case ParserInstruction.AddContent:
if (nodeInConstruction != null) {
processNodeInConstruction(nodeInConstruction);
nodeInConstruction = null;
}
/*
String text = domArgument;
text = text.replaceAll("\\s{2,}"," ");
text = text.trim();
if (text.length() != 0) {
encodeElementToken(CDATA_NODE);
encodeTextTokenId(_textTokenMap.get(text)._tokenId);
}
*/
break;
case ParserInstruction.WriteAttributeKey: {
// get the attribute name ...
String attributeName = domArgument.toLowerCase();
// get value
++i;
//
operations.get(i);
String attributeValue = arguments.get(i);
if (nodeInConstruction != null) {
nodeInConstruction.addAttribute(attributeName, attributeValue);
}
}
break;
case ParserInstruction.AddEntity:
if (nodeInConstruction != null) {
processNodeInConstruction(nodeInConstruction);
nodeInConstruction = null;
}
System.out.println("AddEntity:" + domArgument);
break;
case ParserInstruction.AddComment:
if (nodeInConstruction != null) {
processNodeInConstruction(nodeInConstruction);
nodeInConstruction = null;
}
System.out.println("AddComment:" + domArgument);
break;
case ParserInstruction.SetTitle:
if (nodeInConstruction != null) {
processNodeInConstruction(nodeInConstruction);
nodeInConstruction = null;
}
NodeInConstruction node = new NodeInConstruction("title");
processNodeInConstruction(node);
String title = domArgument;
title = title.replaceAll("\\s{2,}"," ");
title = title.trim();
if (title.length() != 0) {
encodeElementToken(TEXT_NODE);
encodeTextTokenId(_textTokenMap.get(title)._tokenId);
}
encodeElementToken(ELEMENT_END);
System.out.println("SetTitle:" + domArgument);
break;
}
}
}
private void encodeNode(Node node) throws IOException {
switch (node.getNodeType()) {
case Node.ATTRIBUTE_NODE: {
// get the attribute name ...
String attributeName = ((Attr)node).getName().toLowerCase();
// check to see if this is a known attribute
TextTokenData tokenData = _htmlAttributeTagMap.get(attributeName);
// if custom attribute name
if (tokenData == null) {
encodeElementToken((node.getNodeValue().length() ==0) ? CUSTOM_ATTRIBUTE_START_WO_VALUES : CUSTOM_ATTRIBUTE_START_W_VALUES);
encodeTextTokenId(_textTokenMap.get(((Attr)node).getName())._tokenId);
}
// normal case .. known attribute name ...
else {
if (tokenData._tokenId == -1) {
throw new RuntimeException("UnNumbered Token:" + tokenData._token + " Encountered!");
}
encodeElementToken((node.getNodeValue().length() ==0) ? ATTRIBUTE_START_WO_VALUES : ATTRIBUTE_START_W_VALUES);
encodeIntegerIntoTokenStream(tokenData._tokenId);
}
if (node.getNodeValue().length() != 0) {
if (attributeName.equals("href") || attributeName.equals("src") || attributeName.equals("url")) {
encodeHRefTokenId(_hrefTokenMap.get(node.getNodeValue())._tokenId);
}
else {
encodeAttributeTokenId(_attributeTokenMap.get(node.getNodeValue())._tokenId);
}
}
/*
if (node.getNodeName().equals("href") || node.getNodeName().equals("src")) {
if (node.getNodeValue().length() != 0) {
tokenSet.add(node.getNodeValue());
}
}
else {
if (node.getNodeValue() != null && node.getNodeValue().length() != 0) {
String tokens[] = node.getNodeValue().split("[ \n\t\r]");
for (String token : tokens) {
if (token.length() != 0) {
tokenSet.add(token);
}
}
}
}
*/
}
break;
case Node.DOCUMENT_NODE:
case Node.DOCUMENT_FRAGMENT_NODE:
case Node.ELEMENT_NODE: {
if (node.getNodeType() == Node.ELEMENT_NODE) {
NamedNodeMap nm = node.getAttributes();
// get the element name
String elementName = ((Element)node).getTagName().toLowerCase();
// lookit up in the html tag map
TextTokenData tokenData = _htmlTagMap.get(elementName);
// if not present, this is a custom element
if (tokenData == null) {
encodeElementToken((nm.getLength() != 0) ? CUSTOM_ELEMENT_START_W_ATTRIBUTES : CUSTOM_ELEMENT_START_WO_ATTRIBUTES);
encodeTextTokenId(_textTokenMap.get((((Element)node).getTagName()))._tokenId);
}
// normal case ... known element
else {
if (tokenData._tokenId == -1) {
throw new RuntimeException("UnNumbered Token:" + tokenData._token + " Encountered!");
}
encodeElementToken((nm.getLength() != 0) ? ELEMENT_START_W_ATTRIBUTES : ELEMENT_START_WO_ATTRIBUTES);
encodeIntegerIntoTokenStream(tokenData._tokenId);
}
if (nm.getLength() != 0) {
encodeIntegerIntoTokenStream(nm.getLength());
for (int i = 0; i < nm.getLength(); i++) {
encodeNode(nm.item(i));
}
}
}
// walk children ...
NodeList list = node.getChildNodes();
for (int i = 0; i < list.getLength(); i++) {
encodeNode(list.item(i));
}
if (node.getNodeType() == Node.ELEMENT_NODE) {
// close element
encodeElementToken(ELEMENT_END);
}
}
break;
/*
case Node.CDATA_SECTION_NODE: {
String text = ((CDATASection)node).getData();
text = text.replaceAll("\\s{2,}"," ");
text = text.trim();
if (text.length() != 0) {
encodeElementToken(CDATA_NODE);
encodeTextTokenId(_textTokenMap.get(text)._tokenId);
}
}
break;
case Node.COMMENT_NODE: {
String text = ((Comment)node).getData();
text = text.replaceAll("\\s{2,}"," ");
text = text.trim();
if (text.length() != 0) {
encodeElementToken(COMMENT_NODE);
encodeTextTokenId(_textTokenMap.get(text)._tokenId);
}
}
break;
*/
case Node.TEXT_NODE: {
String text = ((Text)node).getData();
text = text.replaceAll("\\s{2,}"," ");
text = text.trim();
if (text.length() != 0) {
encodeElementToken(TEXT_NODE);
encodeTextTokenId(_textTokenMap.get(text)._tokenId);
}
/*
String parts[] = text.split("[ \n\t\r\\.-/\\(\\);\"\']");
int nonEmptyPartCount = 0;
for (String part : parts) {
if (part.length() != 0)
nonEmptyPartCount++;
}
encodeElementToken(TEXT_NODE);
encodeIntegerIntoTokenStream(nonEmptyPartCount);
for (String part : parts) {
if (part.length() != 0) {
encodeTextTokenId(_tokenMap.get(part)._tokenId);
}
}
*/
}
break;
case Node.ENTITY_REFERENCE_NODE: {
System.out.println("*********Hit Entity Node!!!");
}
break;
default:
// System.out.println(indent + "Unknown node");
break;
}
}
private void addTextToken(String token) {
TextTokenData tokenData = _textTokenMap.get(token);
if (tokenData == null) {
tokenData = new TextTokenData(token);
_textTokenMap.put(token,tokenData);
}
tokenData._tokenInstanceCount++;
}
private void addHREFTextToken(String token) {
TextTokenData tokenData = _hrefTokenMap.get(token);
if (tokenData == null) {
tokenData = new TextTokenData(token);
_hrefTokenMap.put(token,tokenData);
}
tokenData._tokenInstanceCount++;
}
private void addAttributeTextToken(String token) {
TextTokenData tokenData = _attributeTokenMap.get(token);
if (tokenData == null) {
tokenData = new TextTokenData(token);
_attributeTokenMap.put(token,tokenData);
}
tokenData._tokenInstanceCount++;
}
public int collectTokensFromNode(Node node) {
int nodeCount = 1;
switch (node.getNodeType()) {
case Node.ATTRIBUTE_NODE: {
String attributeName = ((Attr)node).getName().toLowerCase();
// if a pre-defined attribute name ... increment the associated token's instance count
TextTokenData tokenData = _htmlAttributeTagMap.get(attributeName);
if (tokenData != null) {
tokenData._tokenInstanceCount++;
}
// otherwsie add the token as a text token
else {
addTextToken(((Attr)node).getName());
}
// always add value as a text token for now ...
if (node.getNodeValue().length() != 0) {
if (attributeName.equals("href") || attributeName.equals("src") || attributeName.equals("url")) {
addHREFTextToken(node.getNodeValue());
}
else {
addAttributeTextToken(node.getNodeValue());
}
}
/*
if (node.getNodeName().equals("href") || node.getNodeName().equals("src")) {
if (node.getNodeValue().length() != 0) {
tokenSet.add(node.getNodeValue());
}
}
else {
if (node.getNodeValue() != null && node.getNodeValue().length() != 0) {
String tokens[] = node.getNodeValue().split("[ \n\t\r]");
for (String token : tokens) {
if (token.length() != 0) {
tokenSet.add(token);
}
}
}
}
*/
}
break;
case Node.DOCUMENT_NODE:
case Node.DOCUMENT_FRAGMENT_NODE:
case Node.ELEMENT_NODE:
if (node.getNodeType() == Node.ELEMENT_NODE) {
String tagName = ((Element)node).getTagName().toLowerCase();
TextTokenData tokenData = _htmlTagMap.get(tagName);
// if tag is well know tag, increase its instance count ...
if (tokenData != null) {
tokenData._tokenInstanceCount++;
}
// otherwsie add the token as a text token
else {
addTextToken(((Element)node).getTagName());
}
}
NamedNodeMap nm = node.getAttributes();
for (int i = 0; i < nm.getLength(); i++)
nodeCount += collectTokensFromNode(nm.item(i));
NodeList list = node.getChildNodes();
for (int i = 0; i < list.getLength(); i++)
nodeCount += collectTokensFromNode(list.item(i));
break;
case Node.CDATA_SECTION_NODE: {
CDATASection section = (CDATASection)node;
// get the token's text data ...
String text = section.getData();
text = text.replaceAll("\\s{2,}"," ");
text = text.trim();
if (text.length() != 0) {
//System.out.println("***Skipping CDATA:" + text);
//addTextToken(text);
}
//nodeCount += 1;
}
break;
case Node.COMMENT_NODE: {
// get the token's text data ...
String text = ((Comment)node).getData();
text = text.replaceAll("\\s{2,}"," ");
text = text.trim();
if (text.length() != 0) {
//System.out.println("***Skipping COMMENT:" + text);
//addTextToken(text);
}
//nodeCount += 1;
}
break;
case Node.TEXT_NODE: {
// get the token's text data ...
String text = ((Text)node).getData();
text = text.replaceAll("\\s{2,}"," ");
text = text.trim();
if (text.length() != 0) {
addTextToken(text);
}
/*
if (text.length() != 0) {
String parts[] = text.split("[ \n\t\r\\.-/\\(\\);\"\']");
for (String part : parts) {
if (part.length() != 0) {
addTextToken(part);
}
}
}
*/
nodeCount += 1;
}
break;
default:
// System.out.println(indent + "Unknown node");
break;
}
return nodeCount;
}
}
}