/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.service.parser.server; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.io.OutputStreamWriter; import java.net.URL; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.util.LinkedList; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.DataOutputBuffer; import org.commoncrawl.io.NIOHttpHeaders; import org.commoncrawl.protocol.shared.HTMLMeta; import org.commoncrawl.protocol.shared.HTMLMetaAttribute; import org.commoncrawl.service.parser.Link; import org.commoncrawl.service.parser.ParseResult; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.CharsetUtils; import org.commoncrawl.util.FlexBuffer; import org.commoncrawl.util.HttpHeaderUtils; import org.commoncrawl.util.MimeTypeFilter; import org.commoncrawl.util.HttpHeaderUtils.ContentTypeAndCharset; import org.commoncrawl.util.MimeTypeFilter.MimeTypeDisposition; import org.commoncrawl.util.Tuples.Pair; import org.w3c.dom.Document; import com.dappit.Dapper.parser.DocumentBuilder; import com.dappit.Dapper.parser.InstructionsPool; import com.dappit.Dapper.parser.MozillaParser; import com.dappit.Dapper.parser.ParserInitializationException; import com.dappit.Dapper.parser.ParserInstruction; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.io.ByteProcessor; import com.google.common.io.ByteStreams; import com.google.common.io.InputSupplier; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; import com.google.gson.internal.Streams; import com.google.gson.stream.JsonWriter; /** * * @author rana * */ public class ParseWorker implements DocumentBuilder { private static final Log LOG = LogFactory.getLog(ParserSlaveServer.class); URL baseURL = null; ImmutableMap<String,String> linkTypeToSrcMap = new ImmutableMap.Builder<String,String>() .put("a", "href") .put("area", "href") .put("frame", "src") .put("iframe", "src") .put("script", "src") .put("link", "href") .put("img", "src").build(); ImmutableSet<String> ignoreTextTagSet = new ImmutableSet.Builder<String>() .add("noscript") .build(); class LinkUnderConstruction { public String linkURL = null; public String type = null; public JsonObject jsonObject = new JsonObject(); public String linkText = ""; public LinkUnderConstruction(String linkType,BlockObjectInContext blockInContext) { type = linkType; jsonObject.addProperty("type", linkType); /* if (blockInContext != null) { JsonObject blockJSONObject = new JsonObject(); blockJSONObject.addProperty("type", blockInContext.type); blockJSONObject.addProperty("oid", blockInContext.id); if (blockInContext.htmlId != null) blockJSONObject.addProperty("id", blockInContext.htmlId); if (blockInContext.classId != null) blockJSONObject.addProperty("class", blockInContext.classId); if (blockInContext.type.equals("table")) { blockJSONObject.addProperty("t_row", Math.max(0,blockInContext.rowNumber)); blockJSONObject.addProperty("t_cell", Math.max(0,blockInContext.cellNumber)); } if (blockInContext.parent != null) { blockJSONObject.addProperty("p_oid", blockInContext.parent.id); // if (blockInContext.parent.htmlId != null) // blockJSONObject.addProperty("p_html_id", blockInContext.parent.htmlId); // if (blockInContext.parent.classId != null) // blockJSONObject.addProperty("p_class", blockInContext.parent.classId); } jsonObject.add("context", blockJSONObject); } */ } public Link buildLink() { if (linkURL != null && linkURL.length() != 0 && !linkURL.startsWith("#")) { try { URL url = new URL(baseURL,linkURL); Link link = new Link(); link.setUrl(url.toString()); jsonObject.addProperty("text", linkText); link.setAttributes(jsonObject.toString()); return link; } catch (MalformedURLException e) { //LOG.error(CCStringUtils.stringifyException(e)); } } return null; } } private ParseResult activeParseResult; public void parsePartialHTMLDocument(ParseResult parseResultOut,URL baseURL,String content)throws IOException { parseResultOut.setParseSuccessful(false); this.baseURL = baseURL; try { String mozillaLibPath = System.getenv().get("MOZILLA_LIB_PATH"); if (mozillaLibPath == null || !new File(mozillaLibPath).isDirectory()) { mozillaLibPath = "/usr/local/lib"; } System.out.println("Mozilla Location:" + mozillaLibPath); // init parser ... MozillaParser.init(null,mozillaLibPath); MozillaParser parser; try { parser = new MozillaParser(this); activeParseResult = parseResultOut; //LOG.info("Parsing Document"); parser.parse(content.getBytes(Charset.forName("UTF-8")),"utf-8",null); activeParseResult = null; // set content type ... parseResultOut.setContentType("text/html"); String finalText = textAccumulator.toString().replaceAll("[ \\t\\x0B\\f]+", " "); while (finalText.indexOf("\n \n") != -1) finalText = finalText.replaceAll("(\\n \\n)+", "\n"); finalText = finalText.replaceAll("[\\n]+", "\n"); parseResultOut.setText(finalText); parseResultOut.setParseSuccessful(true); } catch (ParserInitializationException e) { LOG.error(CCStringUtils.stringifyException(e)); parseResultOut.setParseFailureReason("Parser Initialization Failed!"); } catch (Exception e) { parseResultOut.setParseFailureReason(CCStringUtils.stringifyException(e)); LOG.error(parseResultOut); } } catch (ParserInitializationException e) { parseResultOut.setParseFailureReason("Parser Initialization Failed!"); LOG.error(CCStringUtils.stringifyException(e)); throw new IOException(e); } } public void parseDocument(ParseResult parseResultOut,long domainId,long documentId,URL baseURL,String rawHeaders, FlexBuffer data)throws IOException { parseResultOut.setParseSuccessful(false); this.baseURL = baseURL; if (data.getCount() != 0) { try { String mozillaLibPath = System.getenv().get("MOZILLA_LIB_PATH"); if (mozillaLibPath == null || !new File(mozillaLibPath).isDirectory()) { mozillaLibPath = "/usr/local/lib"; } System.out.println("Mozilla Location:" + mozillaLibPath); // init parser ... MozillaParser.init(null, mozillaLibPath); // load headers ... NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(rawHeaders); // detect content type ... ContentTypeAndCharset contentTypeInfo = new ContentTypeAndCharset(); HttpHeaderUtils.parseContentType(headers, contentTypeInfo); //LOG.info("ContentType:" + contentTypeInfo._contentType + " Charset:" + contentTypeInfo._charset); // ok now extract charset if possible ... Pair<Integer,Charset> charsetTuple = CharsetUtils.bestEffortDetectCharset(rawHeaders,data.get(),data.getOffset(),data.getCount()); if (charsetTuple == null) { charsetTuple = new Pair<Integer,Charset>(CharsetUtils.CHARSET_SRC_NO_MATCH,Charset.forName("ISO-8859-1")); } // decode bytes ... and convert to utf-8 ByteBuffer utf8Bytes = null; try { if (charsetTuple.e1.toString().equalsIgnoreCase("utf-8")) { //LOG.info("Input Charset is utf-8, transposing source bytes to dest bytes"); if (data.getOffset() == 0) { utf8Bytes = ByteBuffer.wrap(data.get(), 0, data.getCount()); } else { byte[] buffer = new byte[data.getCount()]; System.arraycopy(data.get(), data.getOffset(),buffer, 0, data.getCount()); utf8Bytes = ByteBuffer.wrap(buffer); } } else { CharBuffer ucs2Chars = charsetTuple.e1.decode(ByteBuffer.wrap(data.get(),data.getOffset(),data.getCount())); utf8Bytes = Charset.forName("UTF-8").encode(ucs2Chars); } } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); parseResultOut.setParseFailureReason(CCStringUtils.stringifyException(e)); // this should not have happened... we consider this unrecoverable throw new IOException(e); } if (utf8Bytes == null || utf8Bytes.remaining() == 0) { parseResultOut.setParseFailureReason("Invalid UTF-8 bytes detected for doc:" + baseURL + " detector:" + charsetTuple.e0 + " Charset:" + charsetTuple.e1); throw new IOException(parseResultOut.getParseFailureReason()); } //LOG.info("UTF-8 Data Length:" + utf8Bytes.remaining()); MimeTypeDisposition disposition = MimeTypeFilter.checkMimeTypeDisposition(contentTypeInfo._contentType); //LOG.info("MimeType Disposition:"+ disposition); if (disposition == MimeTypeDisposition.ACCEPT_HTML) { // ok ready to send to mozilla ... MozillaParser parser; try { parser = new MozillaParser(this); activeParseResult = parseResultOut; //LOG.info("Parsing Document"); parser.parse(utf8Bytes.array(),"utf-8",null); activeParseResult = null; // set content type ... parseResultOut.setContentType(contentTypeInfo._contentType); String finalText = textAccumulator.toString().replaceAll("[ \\t\\x0B\\f]+", " "); while (finalText.indexOf("\n \n") != -1) finalText = finalText.replaceAll("(\\n \\n)+", "\n"); finalText = finalText.replaceAll("[\\n]+", "\n"); parseResultOut.setText(finalText); parseResultOut.setParseSuccessful(true); } catch (ParserInitializationException e) { LOG.error(CCStringUtils.stringifyException(e)); parseResultOut.setParseFailureReason("Parser Initialization Failed!"); } catch (Exception e) { parseResultOut.setParseFailureReason(CCStringUtils.stringifyException(e)); LOG.error(parseResultOut); } } else if (disposition == MimeTypeDisposition.ACCEPT_OTHER) { } else { parseResultOut.setParseFailureReason("Unsupported ContentType:" + contentTypeInfo._contentType); } } catch (ParserInitializationException e) { parseResultOut.setParseFailureReason("Parser Initialization Failed!"); LOG.error(CCStringUtils.stringifyException(e)); throw new IOException(e); } } } public static void main(String[] args) throws IOException { String baseURL = "http://unknown.com/"; NIOHttpHeaders headers = null; if (args.length != 0) { for (int i=0;i<args.length;++i) { if (args[i].equalsIgnoreCase("--noHeaders")) { headers = new NIOHttpHeaders(); headers.add("content-type", "text/html"); } else if (args[i].equalsIgnoreCase("--baseURL")) { baseURL = args[++i]; } } } URL baseURLObj; try { baseURLObj = new URL(baseURL); } catch (MalformedURLException e2) { LOG.error(CCStringUtils.stringifyException(e2)); throw new IOException("Invalid Base Link"); } final DataOutputBuffer headerBuffer = new DataOutputBuffer(); final DataOutputBuffer contentBuffer = new DataOutputBuffer(); final boolean processHeaders = (headers == null); try { ByteStreams.readBytes( new InputSupplier<InputStream>() { @Override public InputStream getInput() throws IOException { return System.in; } } ,new ByteProcessor<Long>() { @Override public Long getResult() { return 0L; } int currLineCharCount = 0; boolean processingHeaders = processHeaders; @Override public boolean processBytes(byte[] buf, int start, int length) throws IOException { if (processingHeaders) { int current = start; int end = current + length; while (processingHeaders && current != end) { if (buf[current] != '\r' && buf[current] != '\n') { currLineCharCount++; } else if (buf[current] == '\n') { if (currLineCharCount == 0){ headerBuffer.write(buf,start,current - start + 1); processingHeaders = false; } currLineCharCount = 0; } current++; } if (processingHeaders) { headerBuffer.write(buf,start,length); } else { length -= current-start; start = current; } } if (!processingHeaders) { contentBuffer.write(buf,start,length); } return true; } }); LOG.info("CONTENT LEN:" + contentBuffer.getLength()); //System.out.println(new String(contentBuffer.getData(),0,contentBuffer.getLength(),Charset.forName("UTF-8"))); // decode header bytes ... String header = ""; if (headerBuffer.getLength() != 0) { try { header = new String(headerBuffer.getData(),0,headerBuffer.getLength(),Charset.forName("UTF-8")); } catch (Exception e) { LOG.warn(CCStringUtils.stringifyException(e)); header = new String(headerBuffer.getData(),0,headerBuffer.getLength(),Charset.forName("ASCII")); } } else { if (headers != null) { header = headers.toString(); } } LOG.info("HEADER LEN:" + header.length()); System.out.println(header); //LOG.info("Parsing Document"); ParseWorker worker = new ParseWorker(); ParseResult result = new ParseResult(); worker.parseDocument(result,0L,0L,baseURLObj,header,new FlexBuffer(contentBuffer.getData(),0,contentBuffer.getLength())); LOG.info("Parse Result:" + result.getParseSuccessful()); //LOG.info("Parse Data:" + result.toString()); OutputStreamWriter outputWriter = new OutputStreamWriter(System.out, "UTF-8"); JsonElement resultObj = parseResultToJSON(result); JsonWriter writer = new JsonWriter(outputWriter); writer.setIndent(" "); writer.setHtmlSafe(true); writer.setLenient(true); Streams.write(resultObj, writer); writer.flush(); outputWriter.write("******** TEXT OUTPUT **********\n"); outputWriter.write(result.getText()); outputWriter.flush(); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } public static final JsonObject parseResultToJSON(ParseResult result) { JsonObject objectOut = new JsonObject(); if (result.isFieldDirty(ParseResult.Field_DOMAINID)) { objectOut.addProperty("domainId",result.getDomainId()); } if (result.isFieldDirty(ParseResult.Field_DOCID)) { objectOut.addProperty("docId",result.getDocId()); } if (result.isFieldDirty(ParseResult.Field_CONTENTTYPE)) { objectOut.addProperty("contentType",result.getContentType().toString()); } if (result.isFieldDirty(ParseResult.Field_CONTEXT)) { objectOut.addProperty("context",result.getContext().toString()); } if (result.isFieldDirty(ParseResult.Field_PARSESUCCESSFUL)) { objectOut.addProperty("parseSuccessful",result.getParseSuccessful()); } if (result.isFieldDirty(ParseResult.Field_PARSEFAILUREREASON)) { objectOut.addProperty("parseFailureReason",result.getParseFailureReason()); } if (result.isFieldDirty(ParseResult.Field_TITLE)) { objectOut.addProperty("title",result.getTitle()); } if (result.getMetaTags().size() != 0) { JsonArray metaTagArray = new JsonArray(); for(int vidx0 = 0; vidx0<result.getMetaTags().size();vidx0++) { JsonObject metaTagJSON = new JsonObject(); HTMLMeta htmlMeta = result.getMetaTags().get(vidx0); for (HTMLMetaAttribute attribute : htmlMeta.getAttributes()) { metaTagJSON.addProperty(attribute.getName(),attribute.getValue()); } metaTagArray.add(metaTagJSON); } objectOut.add("meta_tags", metaTagArray); } if (result.getExtractedLinks().size() != 0) { JsonArray extractedLinksArray = new JsonArray(); for(int vidx0 = 0; vidx0<result.getExtractedLinks().size();vidx0++) { JsonObject extractedLinkJSON = new JsonObject(); Link link = result.getExtractedLinks().get(vidx0); extractedLinkJSON.addProperty("url",link.getUrl()); extractedLinkJSON.addProperty("attributes",link.getAttributes()); extractedLinksArray.add(extractedLinkJSON); } objectOut.add("extracted_links", extractedLinksArray); } if (result.isFieldDirty(ParseResult.Field_TEXT)) { objectOut.addProperty("text",result.getText().toString()); } return objectOut; } int inHeadTag = 0; int inBase = 0; int blockId = 0; int inTable = 0; LinkUnderConstruction activeLink = null; BlockObjectInContext blockInConstruction = null; LinkedList<LinkUnderConstruction> linksUnderConstruction = new LinkedList<LinkUnderConstruction>(); StringBuffer textAccumulator = new StringBuffer(); static class BlockObjectInContext { public BlockObjectInContext parent; public String type = ""; public int id; public int rowNumber=-1; public int cellNumber=-1; public String classId=null; public String htmlId = null; public BlockObjectInContext(BlockObjectInContext parentObject,String type,int id) { this.parent= parentObject; this.type = type; this.id = id; } } static ImmutableSet<String> blockLevelHTMLTags = new ImmutableSet.Builder<String>() .add("address") .add("blockquote") .add("div") .add("dl") .add("fieldset") .add("form") .add("h1") .add("h2") .add("h3") .add("h4") .add("h5") .add("h6") .add("hr") .add("noscript") .add("ol") .add("p") .add("pre") .add("table") .add("ul") .add("dd") .add("dt") .add("li") .add("tbody") .add("td") .add("tfoot") .add("th") .add("thead") .add("tr") .add("button") .add("del") .add("ins") .add("map") .add("object") .add("script") .build(); @Override public Document buildDocument(InstructionsPool instructionsPool,FileOutputStream optionalOutputStream) throws IOException { //LOG.info("Build Document Called"); List<Integer> operations = instructionsPool.operations; List<String> arguments = instructionsPool.arguments; LinkedList<Integer> nodeStack = new LinkedList<Integer>(); LinkedList<BlockObjectInContext> blockStack = new LinkedList<BlockObjectInContext>(); HTMLMeta meta = null; for (int i=0; i<operations.size(); i++) { int domOperation = operations.get(i); String domArgument = arguments.get(i); //System.out.println("Operation :" + ParserInstruction.getOperationString(domOperation)+" Arg:~" + domArgument+"~"); switch (domOperation) { // Open node : case ParserInstruction.OpenNode: case ParserInstruction.AddLeaf: { activeLink = null; blockInConstruction = null; String nodeName = domArgument.toLowerCase(); // append new-line of start of a block level tag ... if (domOperation == ParserInstruction.OpenNode && blockLevelHTMLTags.contains(nodeName)) { if (textAccumulator.length() != 0 && textAccumulator.charAt(textAccumulator.length() -1) != '\n') textAccumulator.append("\n"); } if (nodeName.equals("meta")) { meta = new HTMLMeta(); } else if (linkTypeToSrcMap.containsKey(nodeName)) { //LOG.info("Node:" + nodeName + " is of type Link. Adding to LinksUnderConst"); activeLink = new LinkUnderConstruction(nodeName,blockStack.peek()); linksUnderConstruction.push(activeLink); } else if (nodeName.equals("head")) { inHeadTag++; } else if (nodeName.equals("base")) { if (inHeadTag != 0) { inBase++; } } else if (nodeName.equals("table") || nodeName.equals("div")) { blockInConstruction = new BlockObjectInContext(blockStack.peek(),nodeName,++blockId); blockStack.push(blockInConstruction); } else if (nodeName.equals("tr") || nodeName.equals("th")) { BlockObjectInContext table = blockStack.peek(); if (table != null) { table.rowNumber++; table.cellNumber = -1; } } else if (nodeName.equals("td")) { BlockObjectInContext table = blockStack.peek(); if (table != null) { table.cellNumber++; } } nodeStack.push(i); } break; // Close node : case ParserInstruction.CloseNode: case ParserInstruction.CloseLeaf: { int arguementPos = nodeStack.pop(); String nodeName = arguments.get(arguementPos).toLowerCase(); // append new-line of start of a block level tag ... if (domOperation == ParserInstruction.CloseNode && blockLevelHTMLTags.contains(nodeName)) { if (textAccumulator.length() != 0 && textAccumulator.charAt(textAccumulator.length() -1) != '\n') textAccumulator.append("\n"); } //LOG.info("Close Node Called on Node:" + nodeName); if (nodeName.equals("head")) { inHeadTag--; } else if (nodeName.equals("base")) { if (inHeadTag != 0) { inBase--; } } else if (linkTypeToSrcMap.containsKey(nodeName)){ //LOG.info("Node:" + nodeName + " is a Link Type"); LinkUnderConstruction linkPartial = linksUnderConstruction.pop(); if (linkPartial != null) { //LOG.info("POPed a partial LinkObject of type:" + linkPartial.type); Link link = linkPartial.buildLink(); if (link != null) { activeParseResult.getExtractedLinks().add(link); } } } else if (nodeName.equals("table") || nodeName.equals("div")) { blockStack.pop(); } else if (nodeName.equals("meta")) { if (meta != null) { activeParseResult.getMetaTags().add(meta); meta = null; } } if (textAccumulator.length() != 0 && !Character.isWhitespace(textAccumulator.charAt(textAccumulator.length() - 1))) { textAccumulator.append(" "); } } break; case ParserInstruction.AddText: { Integer arguementPos = nodeStack.peek(); String nodeName = (arguementPos != null) ? arguments.get(arguementPos).toLowerCase() :null; LinkUnderConstruction link = linksUnderConstruction.peek(); if (link != null) { if (link.linkText.length() != 0) link.linkText += " "; link.linkText += domArgument.trim(); } if (nodeName == null || !ignoreTextTagSet.contains(nodeName.toLowerCase())) { textAccumulator.append(domArgument); } }break; // case ParserInstruction.AddContent: // System.out.println("AddContent:"+domArgument); // break; case ParserInstruction.WriteAttributeKey: { // grab key name .. String key = domArgument.toLowerCase(); // and lookahead one to grab attribute value ... i++; if (i < operations.size() && operations.get(i) == ParserInstruction.WriteAttributeValue) { // grab value ... String value = arguments.get(i); // if metatag capture key/value ... if (meta != null) { // create a new attribute object HTMLMetaAttribute attribute = new HTMLMetaAttribute(); attribute.setName(key); attribute.setValue(value); // append to meta tag meta.getAttributes().add(attribute); } else { if(key.equals("href") && inBase != 0) { if (value.length() != 0) { try { baseURL = new URL(value); } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); throw new IOException(e); } } } else if (activeLink != null) { if (linkTypeToSrcMap.get(activeLink.type).equalsIgnoreCase(key)) { activeLink.linkURL = value; } else { activeLink.jsonObject.addProperty(key, value); } } else if (blockInConstruction != null){ if (key.equals("class")) { blockInConstruction.classId = value; } else if (key.equals("id")) { blockInConstruction.htmlId = value; } } } } else { // rewind and let outer control block deal with it --i; } } break; case ParserInstruction.SetTitle: { activeParseResult.setTitle(domArgument); } break; // case ParserInstruction.AddEntity: // System.out.println("AddEntity:" + domArgument); // break; // case ParserInstruction.AddComment: // System.out.println("AddComment:" + domArgument); // break; case ParserInstruction.SetTitle: // System.out.println("SetTitle:" + domArgument); // break; // } } } return null; } }