/**
* Copyright 2012 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.mapred.ec2.parser;
import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Date;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.Counters.Counter;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.crawl.common.shared.Constants;
import org.commoncrawl.service.parser.ParseResult;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.protocol.ParseOutput;
import org.commoncrawl.protocol.shared.CrawlMetadata;
import org.commoncrawl.protocol.shared.FeedAuthor;
import org.commoncrawl.protocol.shared.FeedContent;
import org.commoncrawl.protocol.shared.FeedItem;
import org.commoncrawl.protocol.shared.FeedLink;
import org.commoncrawl.protocol.shared.HTMLContent;
import org.commoncrawl.protocol.shared.HTMLLink;
import org.commoncrawl.protocol.shared.HTMLMeta;
import org.commoncrawl.protocol.shared.HTMLMetaAttribute;
import org.commoncrawl.service.parser.server.ParseWorker;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.CharsetUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.GZIPUtils;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.HttpHeaderInfoExtractor;
import org.commoncrawl.util.IPAddressUtils;
import org.commoncrawl.util.JSONUtils;
import org.commoncrawl.util.MimeTypeFilter;
import org.commoncrawl.util.SimHash;
import org.commoncrawl.util.TaskDataUtils;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.GZIPUtils.UnzipResult;
import org.commoncrawl.util.MimeTypeFilter.MimeTypeDisposition;
import org.commoncrawl.util.TaskDataUtils.TaskDataClient;
import org.commoncrawl.util.Tuples.Pair;
import org.xml.sax.InputSource;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.JsonPrimitive;
import com.google.gson.stream.JsonReader;
import com.sun.syndication.feed.WireFeed;
import com.sun.syndication.feed.atom.Content;
import com.sun.syndication.feed.atom.Entry;
import com.sun.syndication.feed.atom.Feed;
import com.sun.syndication.feed.rss.Category;
import com.sun.syndication.feed.rss.Channel;
import com.sun.syndication.feed.rss.Description;
import com.sun.syndication.feed.rss.Item;
import com.sun.syndication.io.WireFeedInput;
/**
* Initial version of a Mapper that takes a URL and CrawlURL
* (data structure produced by crawlers) and emits metadata and raw content
* via a custom OutputFormat to S3.
*
* This version only handles HTML, RSS, and Atom content, mainly because we
* were rushed for time to get this job running on EC2, and also because we
* wanted a very resilient, tightly controlled codebase to ensure smooth and
* reliable EC2 performance. It needs to be refactored at some point.
*
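* An illustrative (abridged, hypothetical) metadata record for a successfully
* fetched and parsed HTML page looks roughly like:
* <pre>
* {
*   "attempt_time": 1331175632012,
*   "disposition": "SUCCESS",
*   "server_ip": "192.0.2.1",
*   "http_result": 200,
*   "http_headers": { "response": "HTTP/1.1 200 OK", "content-type": "text/html" },
*   "content_len": 25024,
*   "mime_type": "text/html",
*   "download_size": 25024,
*   "md5": "...",
*   "charset_detected": "UTF-8",
*   "text_simhash": 1234567890,
*   "parsed_as": "html",
*   "content": { "type": "html-doc", "title": "...", "meta_tags": [ ... ], "links": [ ... ] }
* }
* </pre>
*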
* @author rana
*
*/
public class ParserMapper implements Mapper<Text,CrawlURL,Text,ParseOutput> {
public static final Log LOG = LogFactory.getLog(ParserMapper.class);
public static final String JSON_DISPOSITION_PROPERTY = "disposition";
public static final String ORIGINAL_RESPONSE_CODE_HTTP_HEADER = "response";
enum Counters {
BAD_REDIRECT_URL, FAILED_TO_PARSE_HTML, PARSED_HTML_DOC,
FAILED_TO_PARSE_FEED_URL, PARSED_FEED_URL, GUNZIP_FAILED,
GUNZIP_DATA_TRUNCATED, WROTE_METADATA_RECORD, WROTE_TEXT_CONTENT,
WROTE_RAW_CONTENT, GOT_UNHANDLED_IO_EXCEPTION,
GOT_UNHANDLED_RUNTIME_EXCEPTION, MALFORMED_FINAL_URL, GOT_RSS_FEED,
GOT_ATOM_FEED, TRYING_RSS_FEED_PARSER, EXCEPTION_DURING_FEED_PARSE,
FAILED_TO_ID_FEED, FAILED_TO_PARSE_XML_AS_FEED, EXCEPTION_PARSING_LINK_JSON, SKIPPING_ROBOTS_TXT, ERROR_CANONICALIZING_LINK_URL,
PARTIALLY_PROCESSED_SPLIT,
FULLY_PROCESSED_SPLIT, GOT_OUT_OF_MEMORY_ERROR
}
public static final String MAX_MAPPER_RUNTIME_PROPERTY = "cc.max.mapper.runtime";
// 50 minutes per mapper MAX
public static final long DEFAULT_MAX_MAPPER_RUNTIME = 50 * 60 * 1000;
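// task data keys used (via the tdc) to record whether this task's split was fully or only partially processed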
public static final String BAD_TASK_TASKDATA_KEY = "bad";
public static final String GOOD_TASK_TASKDATA_KEY = "good";
private static final ImmutableSet<String> dontKeepHeaders = ImmutableSet.of(
"proxy-connection",
"connection",
"keep-alive",
"transfer-encoding",
"te",
"trailer",
"proxy-authorization",
"proxy-authenticate",
"upgrade",
"set-cookie",
"content-encoding"
);
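/**
* Convert raw http headers into a JSON object. The first header with a null
* key is treated as the HTTP status line and emitted under the "response"
* property; headers in the dontKeepHeaders set are dropped, and all other
* keys are lowercased and passed through.
*/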
public static JsonObject httpHeadersToJsonObject(NIOHttpHeaders headers)throws IOException {
JsonObject httpHeaderObject = new JsonObject();
// iterate entries in the header object (entry 0 with a null key is the http status line)
for (int i=0;i<headers.getKeyCount();++i) {
String key = headers.getKey(i);
String value = headers.getValue(i);
if (key == null && i==0) {
httpHeaderObject.addProperty(ORIGINAL_RESPONSE_CODE_HTTP_HEADER, value);
}
else if (key != null && value != null) {
if (!dontKeepHeaders.contains(key.toLowerCase())) {
// and send other ones through
httpHeaderObject.addProperty(key.toLowerCase(),value);
}
}
}
return httpHeaderObject;
}
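/**
* Build the "redirect_from" JSON object for a redirected fetch, mirroring the
* same fields into the CrawlMetadata redirect data. Returns the canonicalized
* final URL paired with the JSON object, or null (after incrementing
* BAD_REDIRECT_URL) if the redirect target fails to canonicalize or parse.
*/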
private Pair<URL,JsonObject> buildRedirectObject(URL originalURL,CrawlURL value,CrawlMetadata metadata,Reporter reporter)throws IOException {
JsonObject redirectObject = new JsonObject();
redirectObject.addProperty("source_url",originalURL.toString());
metadata.getRedirectData().setSourceURL(originalURL.toString());
String canonicalRedirectURL = canonicalizeURL(value.getRedirectURL());
if (canonicalRedirectURL == null) {
reporter.incrCounter(Counters.BAD_REDIRECT_URL, 1);
return null;
}
URL finalURLObj = null;
try {
finalURLObj = new URL(canonicalRedirectURL);
}
catch (MalformedURLException e) {
LOG.error("Malformed URL:" + CCStringUtils.stringifyException(e));
reporter.incrCounter(Counters.BAD_REDIRECT_URL, 1);
return null;
}
redirectObject.addProperty("http_result",(int)value.getOriginalResultCode());
metadata.getRedirectData().setHttpResult(value.getOriginalResultCode());
redirectObject.addProperty("server_ip",IPAddressUtils.IntegerToIPAddressString(value.getOriginalServerIP()));
metadata.getRedirectData().setServerIP(value.getOriginalServerIP());
redirectObject.add("http_headers",httpHeadersToJsonObject(NIOHttpHeaders.parseHttpHeaders(value.getOriginalHeaders())));
metadata.getRedirectData().setHttpHeaders(value.getOriginalHeaders());
return new Pair<URL,JsonObject>(finalURLObj,redirectObject);
}
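/**
* Convert a ParseResult produced by the HTML parser into its "html-doc" JSON
* representation (title, meta_tags, links), mirroring the same data into the
* given HTMLContent metadata object. Per-link attributes arrive as JSON
* strings and are re-parsed here; links that fail canonicalization or
* attribute parsing are counted and skipped.
*/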
private JsonObject parseResultToJsonObject(URL baseURL,ParseResult result,HTMLContent htmlMeta,Reporter reporter)throws IOException {
JsonParser parser = new JsonParser();
JsonObject objectOut = new JsonObject();
objectOut.addProperty("type","html-doc");
safeSetString(objectOut,"title",result.getTitle());
if (result.isFieldDirty(ParseResult.Field_TITLE))
htmlMeta.setTitle(result.getTitle());
if (result.getMetaTags().size() != 0) {
JsonArray metaArray = new JsonArray();
for (HTMLMeta htmlMetaObject : result.getMetaTags()) {
JsonObject jsonMetaObject = new JsonObject();
// populate meta tag based on attributes
for (HTMLMetaAttribute attribute : htmlMetaObject.getAttributes()) {
jsonMetaObject.addProperty(attribute.getName(),attribute.getValue());
}
metaArray.add(jsonMetaObject);
htmlMeta.getMetaTags().add(htmlMetaObject);
}
objectOut.add("meta_tags", metaArray);
}
if (result.getExtractedLinks().size() != 0) {
JsonArray linkArray = new JsonArray();
for (org.commoncrawl.service.parser.Link link : result.getExtractedLinks()) {
try {
String canonicalLinkURL = canonicalizeURL(link.getUrl());
if (canonicalLinkURL == null) {
reporter.incrCounter(Counters.ERROR_CANONICALIZING_LINK_URL, 1);
}
else {
JsonObject linkObj = parser.parse(new JsonReader(new StringReader(link.getAttributes()))).getAsJsonObject();
linkObj.addProperty("href", canonicalLinkURL);
linkArray.add(linkObj);
HTMLLink linkMeta = new HTMLLink();
linkMeta.setAttributes(link.getAttributes());
linkMeta.setHref(canonicalLinkURL);
htmlMeta.getLinks().add(linkMeta);
}
}
catch (Exception e) {
LOG.error("Error Parsing JSON Link Attributes for Link: " + link.getUrl() + " in Doc:" + baseURL + " Exception:\n" + CCStringUtils.stringifyException(e));
reporter.incrCounter(Counters.EXCEPTION_PARSING_LINK_JSON, 1);
}
}
objectOut.add("links", linkArray);
}
return objectOut;
}
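/**
* Best-effort plain-text extraction from a syndication feed value, which may
* be a Description, a Content, or a raw String: strips any embedded markup
* tags and trims whitespace, returning the empty string when no value is
* available.
*/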
private static String cleanupDescription(Object d) {
String value = null;
if (d instanceof Description)
value = ((Description)d).getValue();
else if (d instanceof String)
value = (String)d;
else if (d instanceof Content)
value = ((Content)d).getValue();
if (value == null)
return "";
// strip any embedded markup tags and trim surrounding whitespace
return value.replaceAll("<[^>]*>", "").trim();
}
private static void safeSetDate(JsonObject jsonObj,String propertyName,Date date) {
if (date != null) {
jsonObj.addProperty(propertyName,date.getTime());
}
}
private static void setRSSCategories(JsonObject jsonObj,List<TextBytes> metaCategories,StringBuffer contentOut,List categories) {
if (categories.size() != 0) {
JsonArray jsonArray = new JsonArray();
for (Object category : categories) {
String categoryValue = ((Category)category).getValue();
if (categoryValue != null && categoryValue.length() != 0) {
safeAppendContentFromString(contentOut,categoryValue);
jsonArray.add(new JsonPrimitive(categoryValue));
metaCategories.add(new TextBytes(categoryValue));
}
}
jsonObj.add("categories", jsonArray);
}
}
private static void setAtomCategories(JsonObject jsonObj,List<TextBytes> metaCategoryList,StringBuffer contentOut,List categories) {
if (categories.size() != 0) {
JsonArray jsonArray = new JsonArray();
for (Object category : categories) {
com.sun.syndication.feed.atom.Category categoryObj = (com.sun.syndication.feed.atom.Category) category;
if (categoryObj.getLabel() != null && categoryObj.getLabel().length() != 0) {
safeAppendContentFromString(contentOut,categoryObj.getLabel());
jsonArray.add(new JsonPrimitive(categoryObj.getLabel()));
metaCategoryList.add(new TextBytes(categoryObj.getLabel()));
}
}
jsonObj.add("categories", jsonArray);
}
}
private static void safeSetString(JsonObject jsonObj,String propertyName,String propertyValue) {
if (propertyValue != null && propertyValue.length() != 0) {
jsonObj.addProperty(propertyName,propertyValue);
}
}
private static void safeSetInteger(JsonObject jsonObj,String propertyName,int propertyValue) {
if (propertyValue != -1) {
jsonObj.addProperty(propertyName,propertyValue);
}
}
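/**
* Run the full HTML parser against the given document, returning a
* (json representation, extracted text) pair, or null if the parse failed.
*/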
private Pair<JsonObject,String> parseHTMLDocument(URL baseURL,String rawHeaders, FlexBuffer data,HTMLContent contentMetaOut,Reporter reporter) throws IOException {
ParseResult resultOut = new ParseResult();
ParseWorker parseWorker = new ParseWorker();
parseWorker.parseDocument(resultOut, 0, 0, baseURL, rawHeaders, data);
if (resultOut.getParseSuccessful()) {
return new Pair<JsonObject,String>(parseResultToJsonObject(baseURL, resultOut,contentMetaOut,reporter),resultOut.getText());
}
return null;
}
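/**
* Like parseHTMLDocument, but for partial HTML snippets (e.g. feed entry
* content). Returns null if the parse failed.
*/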
private Pair<JsonObject,String> parseHTMLSnippet(URL baseURL,String htmlSnippet,HTMLContent contentMetaOut, Reporter reporter) throws IOException {
ParseResult resultOut = new ParseResult();
ParseWorker parseWorker = new ParseWorker();
parseWorker.parsePartialHTMLDocument(resultOut, baseURL, htmlSnippet);
if (resultOut.getParseSuccessful()) {
return new Pair<JsonObject,String>(parseResultToJsonObject(baseURL, resultOut,contentMetaOut,reporter),resultOut.getText());
}
return null;
}
private static String safeAppendContentFromString(StringBuffer buffer,String content) {
if (content != null) {
String contentTrimmed = content.trim();
if (contentTrimmed.length() != 0) {
if (buffer.length() != 0)
buffer.append(" ");
buffer.append(contentTrimmed);
}
return contentTrimmed;
}
return null;
}
private static String safeAppendContentFromContentObj(StringBuffer buffer,Content content) {
if (content != null && content.getValue() != null) {
String contentTrimmed = content.getValue().trim();
if (contentTrimmed.length() != 0) {
if (buffer.length() != 0)
buffer.append(" ");
buffer.append(contentTrimmed);
}
return contentTrimmed;
}
return null;
}
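/**
* Append atom links whose rel value appears in validLinkTypes to the given
* feed/item JSON object (collapsing repeated link types into an array) and to
* the feed metadata link list. Links that fail canonicalization are logged
* and skipped.
*/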
private static void safeAppendLinksFromFeed(JsonObject feedOrItemObj,ImmutableMap<String,String> validLinkTypes,List<FeedLink> feedMetaLinks,List links)throws IOException {
for (Object link : links) {
com.sun.syndication.feed.atom.Link linkObj = (com.sun.syndication.feed.atom.Link) link;
if (linkObj.getHref() != null && linkObj.getRel() != null) {
String canonicalHref = canonicalizeURL(linkObj.getHref());
if (canonicalHref == null) {
LOG.error("Failed to Canoniclize Link URL:" + linkObj.getHref());
}
else {
if (validLinkTypes.containsKey(linkObj.getRel())) {
JsonObject jsonLink = new JsonObject();
FeedLink metaLink = new FeedLink();
safeSetString(jsonLink, "type", linkObj.getType());
if (linkObj.getType() != null) metaLink.setType(linkObj.getType());
safeSetString(jsonLink, "href",canonicalHref);
if (linkObj.getHref() != null) metaLink.setHref(canonicalHref);
safeSetString(jsonLink, "rel",linkObj.getRel());
if (linkObj.getRel() != null) metaLink.setRel(linkObj.getRel());
safeSetString(jsonLink, "title", linkObj.getTitle());
if (linkObj.getTitle() != null) metaLink.setTitle(linkObj.getTitle());
feedMetaLinks.add(metaLink);
String linkName = validLinkTypes.get(linkObj.getRel());
JsonElement existing = feedOrItemObj.get(linkName);
if (existing != null) {
JsonArray array = null;
if (!existing.isJsonArray()) {
array = new JsonArray();
array.add(existing);
feedOrItemObj.remove(linkName);
feedOrItemObj.add(linkName, array);
}
else {
array = existing.getAsJsonArray();
}
array.add(jsonLink);
}
else {
feedOrItemObj.add(linkName,jsonLink);
}
}
}
}
}
}
private static void safeAppendAuthorsFromFeed(JsonObject feedOrItemObj,List<FeedAuthor> metaAuthorList,List authors)throws IOException {
if (authors.size() != 0) {
JsonArray authorArray = new JsonArray();
for (Object author : authors) {
com.sun.syndication.feed.atom.Person authorObj = (com.sun.syndication.feed.atom.Person) author;
if (authorObj.getName() != null) {
JsonObject jsonAuthor = new JsonObject();
FeedAuthor metaAuthor = new FeedAuthor();
String canonicalURL = canonicalizeURL(authorObj.getUrl());
safeSetString(jsonAuthor, "name", authorObj.getName());
if (canonicalURL != null) {
safeSetString(jsonAuthor, "url", canonicalURL);
}
if (authorObj.getName() != null) metaAuthor.setName(authorObj.getName());
if (canonicalURL != null) metaAuthor.setUrl(canonicalURL);
authorArray.add(jsonAuthor);
metaAuthorList.add(metaAuthor);
}
}
feedOrItemObj.add("authors", authorArray);
}
}
private static void safeAppendLinkFromString(JsonObject jsonObj,List<FeedLink> metaLinks,String propertyName,String linkValue)throws IOException {
if (linkValue != null && linkValue.length() != 0) {
String canonicalURL = canonicalizeURL(linkValue);
if (canonicalURL != null) {
JsonObject jsonLink = new JsonObject();
FeedLink metaLink = new FeedLink();
jsonLink.addProperty("href",canonicalURL);
metaLink.setHref(canonicalURL);
jsonObj.add(propertyName, jsonLink);
metaLinks.add(metaLink);
}
//TODO REPORT FAILURE
}
}
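/**
* Convert a parsed RSS Channel into its "rss-feed" JSON representation plus a
* flattened text blob (used as the document's text content), mirroring the
* same fields into the FeedContent metadata.
*/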
private Pair<JsonObject,String> rssFeedToJson(URL url,Channel channelObject,FeedContent feedMeta, Reporter reporter )throws IOException {
JsonObject rssObject = new JsonObject();
StringBuffer contentOut = new StringBuffer();
rssObject.addProperty("type","rss-feed");
feedMeta.setType(FeedContent.Type.RSS);
String feedTitle = cleanupDescription(channelObject.getTitle());
rssObject.addProperty("title", safeAppendContentFromString(contentOut,feedTitle));
if (feedTitle != null) feedMeta.setTitle(feedTitle);
safeAppendLinkFromString(rssObject,feedMeta.getLinks(),"link", channelObject.getLink());
String feedDesc = cleanupDescription(channelObject.getDescription());
rssObject.addProperty("description", safeAppendContentFromString(contentOut,feedDesc));
if (feedDesc != null) feedMeta.setDescription(feedDesc);
if (channelObject.getLastBuildDate() != null) {
safeSetDate(rssObject,"updated",channelObject.getLastBuildDate());
feedMeta.setUpdated(channelObject.getLastBuildDate().getTime());
}
else if (channelObject.getPubDate() != null ) {
safeSetDate(rssObject,"updated",channelObject.getPubDate());
feedMeta.setUpdated(channelObject.getPubDate().getTime());
}
setRSSCategories(rssObject,feedMeta.getCategories(),contentOut, channelObject.getCategories());
safeSetString(rssObject, "generator", channelObject.getGenerator());
if (channelObject.getGenerator() != null) feedMeta.setGenerator(channelObject.getGenerator());
safeSetInteger(rssObject, "ttl", channelObject.getTtl());
if (channelObject.getTtl() != -1) feedMeta.setTtl(channelObject.getTtl());
JsonArray itemArray = new JsonArray();
for (Object itemObj : channelObject.getItems()) {
Item item = (Item)itemObj;
JsonObject itemObject = new JsonObject();
FeedItem metaItem = new FeedItem();
String itemTitle = cleanupDescription(item.getTitle());
itemObject.addProperty("title", safeAppendContentFromString(contentOut,itemTitle));
if (itemTitle != null) metaItem.setTitle(itemTitle);
String itemDesc = cleanupDescription(item.getDescription());
itemObject.addProperty("description",safeAppendContentFromString(contentOut,itemDesc));
if (itemDesc != null) metaItem.setDescription(itemDesc);
safeAppendLinkFromString(itemObject,metaItem.getLinks(), "link", item.getLink());
safeSetString(itemObject, "author", item.getAuthor());
if (item.getAuthor() != null) {
FeedAuthor metaAuthor = new FeedAuthor();
metaAuthor.setName(item.getAuthor());
metaItem.getAuthors().add(metaAuthor);
}
setRSSCategories(itemObject,metaItem.getCategories(),contentOut,item.getCategories());
safeSetString(itemObject, "comments", item.getComments());
safeSetDate(itemObject,"published",item.getPubDate());
if (item.getPubDate() != null) metaItem.setPublished(item.getPubDate().getTime());
if (item.getGuid() != null) {
safeSetString(itemObject,"guid",item.getGuid().getValue());
if (item.getGuid().getValue() != null) metaItem.setGuid(item.getGuid().getValue());
}
if (item.getContent() != null && item.getContent().getValue() != null) {
if (item.getContent().getType() == null || item.getContent().getType().contains("html")) {
HTMLContent metaContent = new HTMLContent();
Pair<JsonObject,String> contentTuple = parseHTMLSnippet(url, item.getContent().getValue(),metaContent,reporter);
metaItem.getEmbeddedLinks().addAll(metaContent.getLinks());
// parseHTMLSnippet can return null if the snippet failed to parse ...
if (contentTuple != null) {
if (contentTuple.e0 != null) {
itemObject.add("content", contentTuple.e0);
}
if (contentTuple.e1 != null && contentTuple.e1.length() != 0) {
safeAppendContentFromString(contentOut, contentTuple.e1);
}
}
}
}
itemArray.add(itemObject);
}
rssObject.add("items",itemArray);
return new Pair<JsonObject,String> (rssObject,contentOut.toString());
}
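// maps from atom link rel values to the JSON property names they are emitted under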
static ImmutableMap<String, String> validFeedLinks
= new ImmutableMap.Builder<String,String>()
.put("alternate", "link")
.build();
static ImmutableMap<String, String> feedEntryLinks
= new ImmutableMap.Builder<String,String>()
.put("alternate", "link")
.put("self", "self")
.put("replies", "replies")
.build();
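/**
* Convert a parsed Atom Feed into its "atom-feed" JSON representation plus a
* flattened text blob, mirroring the same fields into the FeedContent
* metadata.
*/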
private Pair<JsonObject,String> atomFeedToJson(URL url,Feed feedObject,FeedContent feedMeta,Reporter reporter)throws IOException {
JsonObject jsonFeed= new JsonObject();
StringBuffer contentOut = new StringBuffer();
jsonFeed.addProperty("type","atom-feed");
feedMeta.setType(FeedContent.Type.ATOM);
String title = cleanupDescription(feedObject.getTitle());
jsonFeed.addProperty("title", safeAppendContentFromString(contentOut,title));
if (title != null) feedMeta.setTitle(title);
safeAppendLinksFromFeed(jsonFeed, validFeedLinks, feedMeta.getLinks(),feedObject.getAlternateLinks());
safeAppendAuthorsFromFeed(jsonFeed,feedMeta.getAuthors(),feedObject.getAuthors());
if (feedObject.getGenerator() != null) {
safeSetString(jsonFeed, "generator", feedObject.getGenerator().getValue());
if (feedObject.getGenerator().getValue() != null) {
feedMeta.setGenerator(feedObject.getGenerator().getValue());
}
}
safeSetDate(jsonFeed, "updated", feedObject.getUpdated());
if (feedObject.getUpdated() != null) {
feedMeta.setUpdated(feedObject.getUpdated().getTime());
}
setAtomCategories(jsonFeed,feedMeta.getCategories(),contentOut, feedObject.getCategories());
JsonArray itemArray = new JsonArray();
for (Object entry : feedObject.getEntries()) {
Entry entryObj = (Entry)entry;
JsonObject jsonEntry = new JsonObject();
FeedItem metaItem = new FeedItem();
String itemTitle = cleanupDescription(entryObj.getTitle());
jsonEntry.addProperty("title", safeAppendContentFromString(contentOut,itemTitle));
if (itemTitle != null) metaItem.setTitle(itemTitle);
String itemDesc = cleanupDescription(entryObj.getSummary());
jsonEntry.addProperty("description",safeAppendContentFromString(contentOut,itemDesc));
if (itemDesc != null) metaItem.setDescription(itemDesc);
safeSetDate(jsonFeed, "published", entryObj.getPublished());
if (entryObj.getPublished() != null) metaItem.setPublished(entryObj.getPublished().getTime());
safeSetDate(jsonFeed, "updated", entryObj.getUpdated());
if (entryObj.getUpdated() != null) metaItem.setUpdated(entryObj.getUpdated().getTime());
safeAppendLinksFromFeed(jsonEntry, feedEntryLinks,metaItem.getLinks(), entryObj.getAlternateLinks());
safeAppendLinksFromFeed(jsonEntry, feedEntryLinks,metaItem.getLinks(), entryObj.getOtherLinks());
safeAppendAuthorsFromFeed(jsonEntry,metaItem.getAuthors(),entryObj.getAuthors());
setAtomCategories(jsonEntry,metaItem.getCategories(),contentOut,entryObj.getCategories());
for (Object content : entryObj.getContents()) {
com.sun.syndication.feed.atom.Content contentObj = (com.sun.syndication.feed.atom.Content) content;
if (contentObj.getValue() != null && contentObj.getValue().length() != 0) {
if (contentObj.getType() == null || contentObj.getType().contains("html")) {
HTMLContent metaContent = new HTMLContent();
Pair<JsonObject,String> contentTuple = parseHTMLSnippet(url, contentObj.getValue(),metaContent,reporter);
metaItem.getEmbeddedLinks().addAll(metaContent.getLinks());
// parseHTMLSnippet can return null if the snippet failed to parse ...
if (contentTuple != null && contentTuple.e0 != null) {
if (jsonEntry.has("content")) {
JsonArray array = null;
JsonElement existing = jsonEntry.get("content");
if (!existing.isJsonArray()) {
array = new JsonArray();
array.add(existing);
jsonEntry.remove("content");
jsonEntry.add("content", array);
}
else {
array = existing.getAsJsonArray();
}
array.add(contentTuple.e0);
}
else {
jsonEntry.add("content", contentTuple.e0);
}
if (contentTuple.e1 != null && contentTuple.e1.length() != 0) {
safeAppendContentFromString(contentOut, contentTuple.e1);
}
}
}
}
}
itemArray.add(jsonEntry);
}
jsonFeed.add("items",itemArray);
return new Pair<JsonObject,String>(jsonFeed,contentOut.toString());
}
private static final String feedEntryEnd = "</entry>";
private static final String feedItemEnd = "</item>";
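/**
* Parse raw XML as a feed (RSS or Atom). If the download was truncated,
* first attempt a fixup by clipping the document at the last complete
* entry/item and appending the appropriate closing tags. Returns null if the
* content could not be identified or parsed as a feed.
*/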
private Pair<JsonObject,String> parseFeedDocument(URL baseURL,String rawHeaders, String feedContent,FeedContent feedMeta,boolean truncatedDocument,Reporter reporter) throws IOException {
if (truncatedDocument) {
LOG.warn("Fixing Up Trancated Doc:" + baseURL);
int indexOfEntryEnd = feedContent.lastIndexOf(feedEntryEnd);
if (indexOfEntryEnd != -1) {
feedContent = feedContent.substring(0,indexOfEntryEnd + feedEntryEnd.length());
feedContent += "</feed>";
}
else {
int indexOfItemEnd = feedContent.lastIndexOf(feedItemEnd);
if (indexOfItemEnd != -1) {
feedContent = feedContent.substring(0,indexOfItemEnd + feedItemEnd.length());
feedContent += "</channel></rss>";
}
}
}
InputSource source = new InputSource(new StringReader(feedContent));
WireFeedInput input = new WireFeedInput();
Pair<JsonObject,String> resultTuple = null;
try {
WireFeed feed = input.build(source);
if (feed != null) {
if (feed instanceof Channel) {
reporter.incrCounter(Counters.TRYING_RSS_FEED_PARSER, 1);
resultTuple = rssFeedToJson(baseURL,(Channel)feed,feedMeta,reporter);
reporter.incrCounter(Counters.GOT_RSS_FEED, 1);
}
else if (feed instanceof Feed) {
resultTuple = atomFeedToJson(baseURL,(Feed)feed,feedMeta,reporter);
reporter.incrCounter(Counters.GOT_ATOM_FEED, 1);
}
else {
reporter.incrCounter(Counters.FAILED_TO_ID_FEED, 1);
LOG.error("Failed to ID Feed:" + baseURL);
}
}
}
catch (Exception e) {
reporter.incrCounter(Counters.EXCEPTION_DURING_FEED_PARSE, 1);
LOG.error("Failed to parse Feed:" + baseURL + "\n ContentLen:" + feedContent.length() + "\n with Exception:" + CCStringUtils.stringifyException(e));
}
return resultTuple;
}
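/**
* Content processing pipeline for a successful fetch: extract header-derived
* metadata, gunzip if necessary, md5 the content bytes, charset-decode text
* types, simhash the decoded text, and dispatch to the HTML or feed parser
* based on mime type disposition. Returns (extracted text, (final headers,
* raw content)).
*/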
private Pair<String,Pair<TextBytes,FlexBuffer>> populateContentMetadata(URL finalURL,CrawlURL value,Reporter reporter,JsonObject metadata,CrawlMetadata crawlMeta)throws IOException {
FlexBuffer contentOut = null;
String textOut = null;
NIOHttpHeaders finalHeaders = NIOHttpHeaders
.parseHttpHeaders(value.getHeaders());
CrawlURLMetadata urlMetadata = new CrawlURLMetadata();
// extract information from http headers ...
HttpHeaderInfoExtractor.parseHeaders(finalHeaders, urlMetadata);
// get the mime type ...
String normalizedMimeType = urlMetadata.isFieldDirty(CrawlURLMetadata.Field_CONTENTTYPE) ? urlMetadata.getContentType() : "text/html";
metadata.addProperty("mime_type", normalizedMimeType);
crawlMeta.setMimeType(normalizedMimeType);
// get download size ...
int downloadSize = value.getContentRaw().getCount();
// set original content len ...
metadata.addProperty("download_size", downloadSize);
crawlMeta.setDownloadSize(downloadSize);
// set truncation flag
if ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
metadata.addProperty("download_truncated", true);
crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.Download_Truncated);
}
if (downloadSize > 0) {
// get content type, charset and encoding
String encoding = finalHeaders.findValue("Content-Encoding");
boolean isGZIP = false;
if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
isGZIP = true;
}
byte[] contentBytes = value.getContentRaw().getReadOnlyBytes();
int contentLen = value.getContentRaw().getCount();
// assume we are going to output original data ...
contentOut = new FlexBuffer(contentBytes,0,contentLen);
if (isGZIP) {
metadata.addProperty("content_is_gzip", isGZIP);
crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.ContentWas_GZIP);
UnzipResult unzipResult = null;
try {
// LOG.info("BEFORE GUNZIP");
unzipResult = GZIPUtils.unzipBestEffort(contentBytes,0,contentLen,CrawlEnvironment.GUNZIP_SIZE_LIMIT);
}
catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
}
if (unzipResult != null && unzipResult.data != null) {
if (unzipResult.wasTruncated) {
LOG.warn("Truncated Document During GZIP:" + finalURL);
reporter.incrCounter(Counters.GUNZIP_DATA_TRUNCATED, 1);
}
contentBytes = unzipResult.data.get();
contentLen = unzipResult.data.getCount();
metadata.addProperty("gunzip_content_len",unzipResult.data.getCount());
crawlMeta.setGunzipSize(unzipResult.data.getCount());
// update content out ...
contentOut = new FlexBuffer(contentBytes,0,contentLen);
}
else {
metadata.addProperty("gunzip_failed",true);
crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.GUNZIP_Failed);
reporter.incrCounter(Counters.GUNZIP_FAILED, 1);
contentBytes = null;
contentLen = 0;
contentOut = null;
}
// LOG.info("AFTER GUNZIP");
}
if (contentBytes != null) {
// ok compute an md5 hash
MD5Hash md5Hash = MD5Hash.digest(contentBytes,0,contentLen);
metadata.addProperty("md5", md5Hash.toString());
crawlMeta.setMd5(new FlexBuffer(md5Hash.getDigest(),0,md5Hash.getDigest().length));
// get normalized mime type
if (MimeTypeFilter.isTextType(normalizedMimeType)) {
// ok time to decode the data into ucs2 ...
Pair<Pair<Integer,Charset>,String> decodeResult= CharsetUtils.bestEffortDecodeBytes(value.getHeaders(), contentBytes, 0, contentLen);
// ok write out decode metadata
metadata.addProperty("charset_detected", decodeResult.e0.e1.toString());
crawlMeta.setCharsetDetected(decodeResult.e0.e1.toString());
metadata.addProperty("charset_detector", decodeResult.e0.e0);
crawlMeta.setCharsetDetector(decodeResult.e0.e0);
// add appropriate http header (for detected charset)
finalHeaders.add(Constants.ARCFileHeader_DetectedCharset, decodeResult.e0.e1.toString());
// get the content
String textContent = decodeResult.e1;
// compute simhash
long simhash = SimHash.computeOptimizedSimHashForString(textContent);
metadata.addProperty("text_simhash", simhash);
crawlMeta.setTextSimHash(simhash);
// figure out simplified mime type ...
MimeTypeDisposition mimeTypeDisposition = MimeTypeFilter.checkMimeTypeDisposition(normalizedMimeType);
boolean parseComplete = false;
Pair<JsonObject,String> tupleOut = null;
// write it out
if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_HTML) {
//LOG.info("Parsing:" + finalURL.toString() + " Headers:" + value.getHeaders() + " ContentLen:" + contentLen);
// ok parse as html
tupleOut = parseHTMLDocument(finalURL,value.getHeaders(),new FlexBuffer(contentBytes,0,contentLen),crawlMeta.getHtmlContent(),reporter);
if (tupleOut == null) {
reporter.incrCounter(Counters.FAILED_TO_PARSE_HTML, 1);
LOG.error("Unable to Parse as HTML:" + finalURL.toString());
mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
}
else {
reporter.incrCounter(Counters.PARSED_HTML_DOC, 1);
metadata.addProperty("parsed_as", "html");
crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
parseComplete = true;
}
}
if (!parseComplete && (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED || mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML)) {
// ok try parse this document as a feed ...
tupleOut = parseFeedDocument(finalURL, value.getHeaders(), textContent,crawlMeta.getFeedContent(),((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0),reporter);
if (tupleOut == null) {
if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED) {
reporter.incrCounter(Counters.FAILED_TO_PARSE_FEED_URL, 1);
//TODO:HACK
//LOG.info("Failed to Parse:" + finalURL + " RawContentLen:" + value.getContentRaw().getCount() + " ContentLen:" + contentLen + " Metadata:" + metadata.toString());
mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
}
}
else {
reporter.incrCounter(Counters.PARSED_FEED_URL, 1);
metadata.addProperty("parsed_as", "feed");
// record as feed, matching the "parsed_as" json property above (assumes ParsedAs defines a FEED constant)
crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.FEED);
parseComplete = true;
}
}
if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML) {
reporter.incrCounter(Counters.FAILED_TO_PARSE_XML_AS_FEED, 1);
mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
}
if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_TEXT) {
// LOG.info("Identified URL" + finalURL + " w/ mimetype:" + normalizedMimeType + " as text");
// TODO: FIX THIS BUT PUNT FOR NOW :-(
//tupleOut = new Pair<JsonObject,String>(null,textContent);
}
if (tupleOut != null) {
if (tupleOut.e0 != null) {
metadata.add("content", tupleOut.e0);
}
textOut = tupleOut.e1;
}
}
}
}
return new Pair<String,Pair<TextBytes,FlexBuffer>>(textOut,new Pair<TextBytes,FlexBuffer>(new TextBytes(finalHeaders.toString()),contentOut));
}
static void safeSetJsonPropertyFromJsonProperty(JsonObject destinationObj,String destinationProperty,JsonElement sourceObj,String sourceProperty)throws IOException {
if (sourceObj != null && sourceObj.isJsonObject()) {
JsonElement sourceElement = sourceObj.getAsJsonObject().get(sourceProperty);
if (sourceElement != null) {
destinationObj.add(destinationProperty, sourceElement);
}
}
}
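/**
* Minimally canonicalize the given URL via GoogleURL/URLUtils, returning null
* if it cannot be canonicalized.
*/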
private static String canonicalizeURL(String sourceURL) throws IOException {
if (sourceURL != null) {
GoogleURL urlObject = new GoogleURL(sourceURL);
return URLUtils.canonicalizeURL(urlObject, false);
}
return null;
}
int mapCalls = 0;
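/**
* Process a single (url, CrawlURL) record: enforce the kill time and tdc
* fast-fail checks, canonicalize the url, skip robots.txt, record
* redirect/failure/success metadata, run content processing for 2xx
* responses with content, and emit a ParseOutput keyed by the final
* (post-redirect) url.
*/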
@Override
public void map(Text sourceURL, CrawlURL value, OutputCollector<Text, ParseOutput> output,Reporter reporter) throws IOException {
// check whether we have exceeded the max allowed runtime for this mapper ...
if (System.currentTimeMillis() > _killTime) {
LOG.error("Expended Max Allowed Time for Mapper! Progress was at:" + _lastProgressValue);
// bail from the task ...
_terminatedEarly = true;
}
// if terminated early, just skip processing
if (_terminatedEarly) {
LOG.error("Mapper Already Terminated Early!");
return;
}
// ok we are still good to go ...
else {
// NOTE: speculative execution is currently turned off, so this fast-fail check is mostly moot ...
// every 10 map calls ... check with the tdc to see if we should fast fail this mapper ...
if (++mapCalls % 10 == 0) {
String badTaskDataValue = _taskDataClient.queryTaskData(BAD_TASK_TASKDATA_KEY);
if (badTaskDataValue != null && badTaskDataValue.length() != 0) {
throw new IOException("Fast Failing Blacklisted (by TDC) Mapper");
}
}
}
if (sourceURL.getLength() == 0) {
LOG.error("Hit NULL URL. Original URL:" + value.getRedirectURL());
return;
}
try {
// allocate parse output
ParseOutput parseOutput = new ParseOutput();
// initialize segment id in output upfront ...
parseOutput.setDestSegmentId(_segmentId);
// json object out ...
JsonObject jsonObj = new JsonObject();
// and create a crawl metadata
CrawlMetadata metadata = parseOutput.getCrawlMetadata();
// and content (if available) ...
Pair<String,Pair<TextBytes,FlexBuffer>> contentOut = null;
// canonicalize the url (minimally)
String canonicalURL = canonicalizeURL(sourceURL.toString());
// if canonicalization failed... bail early
if (canonicalURL == null) {
reporter.incrCounter(Counters.MALFORMED_FINAL_URL, 1);
return;
}
URL originalURL = null;
try {
originalURL = new URL(canonicalURL);
}
catch (MalformedURLException e) {
LOG.error("Malformed URL:" + CCStringUtils.stringifyException(e));
reporter.incrCounter(Counters.MALFORMED_FINAL_URL, 1);
return;
}
if (originalURL.getPath().endsWith("/robots.txt")) {
reporter.incrCounter(Counters.SKIPPING_ROBOTS_TXT, 1);
return;
}
URL finalURL = originalURL;
jsonObj.addProperty("attempt_time",value.getLastAttemptTime());
metadata.setAttemptTime(value.getLastAttemptTime());
// first step write status
jsonObj.addProperty("disposition",
(value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS)
? "SUCCESS" : "FAILURE");
metadata.setCrawlDisposition((byte)
((value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0 : 1));
// deal with redirects ...
if ((value.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
Pair<URL,JsonObject> redirect = buildRedirectObject(originalURL,value,metadata,reporter);
if (redirect == null) {
return;
}
jsonObj.add("redirect_from",redirect.e1);
finalURL = redirect.e0;
}
if (value.getLastAttemptResult() == CrawlURL.CrawlResult.FAILURE) {
jsonObj.addProperty("failure_reason",CrawlURL.FailureReason.toString(value.getLastAttemptFailureReason()));
metadata.setFailureReason(value.getLastAttemptFailureReason());
jsonObj.addProperty("failure_detail",value.getLastAttemptFailureDetail());
metadata.setFailureDetail(value.getLastAttemptFailureDetail());
}
else {
jsonObj.addProperty("server_ip",IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
metadata.setServerIP(value.getServerIP());
jsonObj.addProperty("http_result",value.getResultCode());
metadata.setHttpResult(value.getResultCode());
jsonObj.add("http_headers",httpHeadersToJsonObject(NIOHttpHeaders.parseHttpHeaders(value.getHeaders())));
metadata.setHttpHeaders(value.getHeaders());
jsonObj.addProperty("content_len",value.getContentRaw().getCount());
metadata.setContentLength(value.getContentRaw().getCount());
if (value.getResultCode() >= 200 && value.getResultCode() <= 299 && value.getContentRaw().getCount() > 0) {
contentOut = populateContentMetadata(finalURL,value,reporter,jsonObj,metadata);
if (metadata.isFieldDirty(CrawlMetadata.Field_CHARSETDETECTED)) {
parseOutput.setDetectedCharset(metadata.getCharsetDetected());
}
}
}
// ok ... write stuff out ...
reporter.incrCounter(Counters.WROTE_METADATA_RECORD, 1);
//////////////////////////////////////////////////////////////
// echo some stuff to parseOutput ...
parseOutput.setMetadata(jsonObj.toString());
JsonElement mimeType = jsonObj.get("mime_type");
if (mimeType != null) {
parseOutput.setNormalizedMimeType(mimeType.getAsString());
}
JsonElement md5 = jsonObj.get("md5");
if (md5 != null) {
MD5Hash hash = new MD5Hash(md5.getAsString());
byte[] bytes = hash.getDigest();
parseOutput.setMd5Hash(new FlexBuffer(bytes,0,bytes.length));
}
JsonElement simHash = jsonObj.get("text_simhash");
if (simHash != null) {
parseOutput.setSimHash(simHash.getAsLong());
}
parseOutput.setHostIPAddress(IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
parseOutput.setFetchTime(value.getLastAttemptTime());
////////////////////////////////////////////////////////////
if (contentOut != null) {
if (contentOut.e0 != null) {
parseOutput.setTextContent(contentOut.e0);
reporter.incrCounter(Counters.WROTE_TEXT_CONTENT, 1);
}
if (contentOut.e1 != null) {
// directly set the text bytes ...
parseOutput.getHeadersAsTextBytes().set(contentOut.e1.e0);
// mark it dirty !!!
parseOutput.setFieldDirty(ParseOutput.Field_HEADERS);
// if content available ...
if (contentOut.e1.e1 != null) {
parseOutput.setRawContent(contentOut.e1.e1);
}
reporter.incrCounter(Counters.WROTE_RAW_CONTENT, 1);
}
}
//buildCompactMetadata(parseOutput,jsonObj,urlMap);
output.collect(new Text(finalURL.toString()), parseOutput);
}
catch (Exception e) {
LOG.error("Exception Processing URL:" + sourceURL.toString() + "\n" + CCStringUtils.stringifyException(e));
if (e instanceof IOException)
reporter.incrCounter(Counters.GOT_UNHANDLED_IO_EXCEPTION, 1);
else
reporter.incrCounter(Counters.GOT_UNHANDLED_RUNTIME_EXCEPTION, 1);
}
catch (OutOfMemoryError e) {
LOG.fatal("Got Out of Memory Error Processing URL:" + sourceURL.toString() + "\n" + CCStringUtils.stringifyException(e));
reporter.incrCounter(Counters.GOT_OUT_OF_MEMORY_ERROR, 1);
// bail from the remainder of the map task
_terminatedEarly = true;
}
}
/**
* inform the tdc of the task's completion status (fully vs. partially processed split) ...
* @throws IOException
*/
public void commitTask(Reporter reporter) throws IOException {
if (_terminatedEarly) {
OutputCommitter.setTaskDataCommitInfo(BAD_TASK_TASKDATA_KEY, getRemainingSplitInfo());
// _taskDataClient.updateTaskData(BAD_TASK_TASKDATA_KEY, getRemainingSplitInfo());
reporter.incrCounter(Counters.PARTIALLY_PROCESSED_SPLIT, 1);
}
else {
OutputCommitter.setTaskDataCommitInfo(GOOD_TASK_TASKDATA_KEY, getOriginalSplitInfo());
// _taskDataClient.updateTaskData(GOOD_TASK_TASKDATA_KEY, getOriginalSplitInfo());
reporter.incrCounter(Counters.FULLY_PROCESSED_SPLIT, 1);
}
}
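/**
* Record the mapper's last reported progress and position within the split;
* used by getRemainingSplitInfo to describe the unprocessed remainder after
* early termination.
*/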
public void updateProgressAndPosition(double progress,long position) {
_lastProgressValue = progress;
_lastPosition = position;
}
double _lastProgressValue;
long _lastPosition = 0L;
long _segmentId;
long _startTime;
long _killTime;
long _maxRunTime;
boolean _terminatedEarly = false;
TaskDataClient _taskDataClient;
String _splitFile;
long _splitStartPos;
long _splitLength;
/**
*
* @return true if this mapper exited early due to a timeout ...
*/
boolean wasTerminatedEarly() {
return _terminatedEarly;
}
/**
*
* @return the original split information
*/
String getOriginalSplitInfo() {
return _splitFile + ":" +_splitStartPos + "+" + _splitLength;
}
/**
*
* @return the unprocessed portion of the split (after early termination)
*/
String getRemainingSplitInfo() {
// calculate remaining split length ...
long splitRemaining = _splitLength - (_lastPosition - _splitStartPos);
// and return a split for the remaining (unprocessed) portion of the split ...
return _splitFile + ":" + _lastPosition + "+" + splitRemaining + "," + getOriginalSplitInfo();
}
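/**
* Read the segment id, compute the kill time from the configured max runtime,
* initialize the task data client (hard failing the task if that fails), and
* capture the input split coordinates.
*/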
@Override
public void configure(JobConf job) {
LOG.info("LIBRARY PATH:" + System.getenv().get("LD_LIBRARY_PATH"));
_segmentId = job.getLong("cc.segmet.id", -1L);
LOG.info("Job Conf says Segment Id is:" + _segmentId);
_startTime = System.currentTimeMillis();
_maxRunTime = job.getLong(MAX_MAPPER_RUNTIME_PROPERTY, DEFAULT_MAX_MAPPER_RUNTIME);
LOG.info("Job Max Runtime (per config) is:" + _maxRunTime);
_killTime = _startTime + _maxRunTime;
// initialize the Task Data Client ...
try {
_taskDataClient = TaskDataUtils.getTaskDataClientForTask(job);
} catch (IOException e) {
LOG.fatal("Unable to Initialize Task Data Client with Error:" + CCStringUtils.stringifyException(e));
// hard fail
throw new RuntimeException("Unable to Initialize Task Data Client with Error:" + CCStringUtils.stringifyException(e));
}
_splitFile = job.get("map.input.file");
_splitStartPos = job.getLong("map.input.start",-1);
_splitLength = job.getLong("map.input.length",-1);
}
@Override
public void close() throws IOException {
_taskDataClient.shutdown();
}
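/** no-op Reporter implementation used by the offline test driver in main() */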
private static class MockReporter implements Reporter {
@Override
public Counter getCounter(Enum<?> name) {
return null;
}
@Override
public Counter getCounter(String group, String name) {
return null;
}
@Override
public InputSplit getInputSplit() throws UnsupportedOperationException {
return null;
}
@Override
public void incrCounter(Enum<?> key, long amount) {}
@Override
public void incrCounter(String group, String counter, long amount) {}
@Override
public void setStatus(String status) {}
@Override
public void progress() {}
//@Override
public float getProgress() { return 0; }
}
/**
* some test code ...
*
* @param args
* @throws IOException
*/
public static void main(String[] args)throws IOException {
Configuration conf = new Configuration();
Path pathToCrawlLog = new Path(args[0]);
FileSystem fs = FileSystem.get(pathToCrawlLog.toUri(),conf);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, pathToCrawlLog, conf);
Text url = new Text();
CrawlURL urlData = new CrawlURL();
ParserMapper mapper = new ParserMapper();
MockReporter reporter = new MockReporter();
final JsonParser parser = new JsonParser();
while (reader.next(url, urlData)) {
mapper.map(url, urlData,
new OutputCollector<Text, ParseOutput>() {
@Override
public void collect(Text key, ParseOutput value) throws IOException {
long timeStart = System.currentTimeMillis();
JsonObject metadata = parser.parse(new JsonReader(new StringReader(value.getMetadata()))).getAsJsonObject();
long timeEnd = System.currentTimeMillis();
System.out.println("Key:" + key.toString() + " Parse Took:" + (timeEnd-timeStart));
System.out.println("Key:" + key.toString() + " Metadata Size:" + value.getMetadataAsTextBytes().getLength());
System.out.println("Key:" + key.toString() + " Text-Size" + value.getTextContentAsTextBytes().getLength());
System.out.println("Key:" + key.toString() + " RAW-Size" + value.getRawContent().getCount());
System.out.println("Key:" + key.toString() + " Metadata:");
System.out.println(JSONUtils.prettyPrintJSON(metadata));
System.out.println("Key:" + key.toString() + " Text:");
System.out.println(value.getTextContent());
}
}, reporter);
}
reader.close();
}
}