/**
 * Copyright 2012 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 **/
package org.commoncrawl.mapred.ec2.parser;

import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Date;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.Counters.Counter;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.crawl.common.shared.Constants;
import org.commoncrawl.service.parser.ParseResult;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.protocol.ParseOutput;
import org.commoncrawl.protocol.shared.CrawlMetadata;
import org.commoncrawl.protocol.shared.FeedAuthor;
import org.commoncrawl.protocol.shared.FeedContent;
import org.commoncrawl.protocol.shared.FeedItem;
import org.commoncrawl.protocol.shared.FeedLink;
import org.commoncrawl.protocol.shared.HTMLContent;
import org.commoncrawl.protocol.shared.HTMLLink;
import org.commoncrawl.protocol.shared.HTMLMeta;
import org.commoncrawl.protocol.shared.HTMLMetaAttribute;
import org.commoncrawl.service.parser.server.ParseWorker;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.CharsetUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.GZIPUtils;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.HttpHeaderInfoExtractor;
import org.commoncrawl.util.IPAddressUtils;
import org.commoncrawl.util.JSONUtils;
import org.commoncrawl.util.MimeTypeFilter;
import org.commoncrawl.util.SimHash;
import org.commoncrawl.util.TaskDataUtils;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.GZIPUtils.UnzipResult;
import org.commoncrawl.util.MimeTypeFilter.MimeTypeDisposition;
import org.commoncrawl.util.TaskDataUtils.TaskDataClient;
import org.commoncrawl.util.Tuples.Pair;
import org.xml.sax.InputSource;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.JsonPrimitive;
import com.google.gson.stream.JsonReader;
import com.sun.syndication.feed.WireFeed;
import com.sun.syndication.feed.atom.Content;
import com.sun.syndication.feed.atom.Entry;
import com.sun.syndication.feed.atom.Feed;
import com.sun.syndication.feed.rss.Category;
import com.sun.syndication.feed.rss.Channel;
import com.sun.syndication.feed.rss.Description;
import com.sun.syndication.feed.rss.Item;
import com.sun.syndication.io.WireFeedInput;

/**
 * Initial version of a Mapper that takes a URL and a CrawlURL (the data
 * structure produced by the crawlers) and emits metadata and raw content
 * via a custom OutputFormat to S3.
 *
 * This version only handles HTML, RSS, and Atom content, mainly because we
 * were rushed for time to get this job running on EC2, and also because of
 * the desire to have a very resilient, tightly controlled codebase to ensure
 * smooth and reliable EC2 performance. Needs to be refactored at some point.
 *
 * @author rana
 */
public class ParserMapper implements Mapper<Text, CrawlURL, Text, ParseOutput> {

  public static final Log LOG = LogFactory.getLog(ParserMapper.class);

  public static final String JSON_DISPOSITION_PROPERTY = "disposition";
  public static final String ORIGINAL_RESPONSE_CODE_HTTP_HEADER = "response";

  enum Counters {
    BAD_REDIRECT_URL, FAILED_TO_PARSE_HTML, PARSED_HTML_DOC,
    FAILED_TO_PARSE_FEED_URL, PARSED_FEED_URL, GUNZIP_FAILED,
    GUNZIP_DATA_TRUNCATED, WROTE_METADATA_RECORD, WROTE_TEXT_CONTENT,
    WROTE_RAW_CONTENT, GOT_UNHANDLED_IO_EXCEPTION,
    GOT_UNHANDLED_RUNTIME_EXCEPTION, MALFORMED_FINAL_URL, GOT_RSS_FEED,
    GOT_ATOM_FEED, TRYING_RSS_FEED_PARSER, EXCEPTION_DURING_FEED_PARSE,
    FAILED_TO_ID_FEED, FAILED_TO_PARSE_XML_AS_FEED,
    EXCEPTION_PARSING_LINK_JSON, SKIPPING_ROBOTS_TXT,
    ERROR_CANONICALIZING_LINK_URL, PARTIALLY_PROCESSED_SPLIT,
    FULLY_PROCESSED_SPLIT, GOT_OUT_OF_MEMORY_ERROR
  }

  public static final String MAX_MAPPER_RUNTIME_PROPERTY = "cc.max.mapper.runtime";
  // 50 minutes per mapper MAX
  public static final long DEFAULT_MAX_MAPPER_RUNTIME = 50 * 60 * 1000;

  public static final String BAD_TASK_TASKDATA_KEY = "bad";
  public static final String GOOD_TASK_TASKDATA_KEY = "good";

  private static ImmutableSet<String> dontKeepHeaders = ImmutableSet.of(
      "proxy-connection", "connection", "keep-alive", "transfer-encoding",
      "te", "trailer", "proxy-authorization", "proxy-authenticate",
      "upgrade", "set-cookie", "content-encoding");

  public static JsonObject httpHeadersToJsonObject(NIOHttpHeaders headers) throws IOException {
    JsonObject httpHeaderObject = new JsonObject();
    // iterate entries in the header object
    for (int i = 0; i < headers.getKeyCount(); ++i) {
      String key = headers.getKey(i);
      String value = headers.getValue(i);
      if (key == null && i == 0) {
        // the first entry with a null key is the HTTP status line
        httpHeaderObject.addProperty(ORIGINAL_RESPONSE_CODE_HTTP_HEADER, value);
      } else if (key != null && value != null) {
        if (!dontKeepHeaders.contains(key.toLowerCase())) {
          // and send other ones through
          httpHeaderObject.addProperty(key.toLowerCase(), value);
        }
      }
    }
    return httpHeaderObject;
  }

  private Pair<URL, JsonObject> buildRedirectObject(URL originalURL, CrawlURL value,
      CrawlMetadata metadata, Reporter reporter) throws IOException {
    JsonObject redirectObject = new JsonObject();
    redirectObject.addProperty("source_url", originalURL.toString());
    metadata.getRedirectData().setSourceURL(originalURL.toString());

    String canonicalRedirectURL = canonicalizeURL(value.getRedirectURL());
    if (canonicalRedirectURL == null) {
      reporter.incrCounter(Counters.BAD_REDIRECT_URL, 1);
      return null;
    }
    URL finalURLObj = null;
    try {
      finalURLObj = new URL(canonicalRedirectURL);
    } catch (MalformedURLException e) {
      LOG.error("Malformed URL:" + CCStringUtils.stringifyException(e));
      reporter.incrCounter(Counters.BAD_REDIRECT_URL, 1);
      return null;
    }
    redirectObject.addProperty("http_result", (int) value.getOriginalResultCode());
    metadata.getRedirectData().setHttpResult(value.getOriginalResultCode());
    redirectObject.addProperty("server_ip",
        IPAddressUtils.IntegerToIPAddressString(value.getOriginalServerIP()));
    metadata.getRedirectData().setServerIP(value.getOriginalServerIP());
    redirectObject.add("http_headers",
        httpHeadersToJsonObject(NIOHttpHeaders.parseHttpHeaders(value.getOriginalHeaders())));
    metadata.getRedirectData().setHttpHeaders(value.getOriginalHeaders());
    return new Pair<URL, JsonObject>(finalURLObj, redirectObject);
  }

  private JsonObject parseResultToJsonObject(URL baseURL, ParseResult result,
      HTMLContent htmlMeta, Reporter reporter) throws IOException {
    JsonParser parser = new JsonParser();
    JsonObject objectOut = new JsonObject();

    objectOut.addProperty("type", "html-doc");
    safeSetString(objectOut, "title", result.getTitle());
    if (result.isFieldDirty(ParseResult.Field_TITLE))
      htmlMeta.setTitle(result.getTitle());

    if (result.getMetaTags().size() != 0) {
      JsonArray metaArray = new JsonArray();
      for (HTMLMeta htmlMetaObject : result.getMetaTags()) {
        JsonObject jsonMetaObject = new JsonObject();
        // populate the meta tag based on its attributes
        for (HTMLMetaAttribute attribute : htmlMetaObject.getAttributes()) {
          jsonMetaObject.addProperty(attribute.getName(), attribute.getValue());
        }
        metaArray.add(jsonMetaObject);
        htmlMeta.getMetaTags().add(htmlMetaObject);
      }
      objectOut.add("meta_tags", metaArray);
    }

    if (result.getExtractedLinks().size() != 0) {
      JsonArray linkArray = new JsonArray();
      for (org.commoncrawl.service.parser.Link link : result.getExtractedLinks()) {
        try {
          String canonicalLinkURL = canonicalizeURL(link.getUrl());
          if (canonicalLinkURL == null) {
            reporter.incrCounter(Counters.ERROR_CANONICALIZING_LINK_URL, 1);
          } else {
            JsonObject linkObj = parser.parse(
                new JsonReader(new StringReader(link.getAttributes()))).getAsJsonObject();
            linkObj.addProperty("href", canonicalLinkURL);
            linkArray.add(linkObj);

            HTMLLink linkMeta = new HTMLLink();
            linkMeta.setAttributes(link.getAttributes());
            linkMeta.setHref(canonicalLinkURL);
            htmlMeta.getLinks().add(linkMeta);
          }
        } catch (Exception e) {
          LOG.error("Error Parsing JSON Link Attributes for Link: " + link.getUrl()
              + " in Doc:" + baseURL + " Exception:\n" + CCStringUtils.stringifyException(e));
          reporter.incrCounter(Counters.EXCEPTION_PARSING_LINK_JSON, 1);
        }
      }
      objectOut.add("links", linkArray);
    }
    return objectOut;
  }

  private static String cleanupDescription(Object d) {
    String value = null;
    if (d instanceof Description)
      value = ((Description) d).getValue();
    else if (d instanceof String)
      value = (String) d;
    else if (d instanceof Content)
      value =
          ((Content) d).getValue();
    if (value == null)
      return "";
    // strip any markup from the value ...
    String[] parts = value.split("<[^>]*>");
    StringBuffer buf = new StringBuffer();
    for (String part : parts)
      buf.append(part);
    return buf.toString().trim();
  }

  private static void safeSetDate(JsonObject jsonObj, String propertyName, Date date) {
    if (date != null) {
      jsonObj.addProperty(propertyName, date.getTime());
    }
  }

  private static void setRSSCategories(JsonObject jsonObj, List<TextBytes> metaCategories,
      StringBuffer contentOut, List categories) {
    if (categories.size() != 0) {
      JsonArray jsonArray = new JsonArray();
      for (Object category : categories) {
        String categoryValue = ((Category) category).getValue();
        if (categoryValue != null && categoryValue.length() != 0) {
          safeAppendContentFromString(contentOut, categoryValue);
          jsonArray.add(new JsonPrimitive(categoryValue));
          metaCategories.add(new TextBytes(categoryValue));
        }
      }
      jsonObj.add("categories", jsonArray);
    }
  }

  private static void setAtomCategories(JsonObject jsonObj, List<TextBytes> metaCategoryList,
      StringBuffer contentOut, List categories) {
    if (categories.size() != 0) {
      JsonArray jsonArray = new JsonArray();
      for (Object category : categories) {
        com.sun.syndication.feed.atom.Category categoryObj =
            (com.sun.syndication.feed.atom.Category) category;
        if (categoryObj.getLabel() != null && categoryObj.getLabel().length() != 0) {
          safeAppendContentFromString(contentOut, categoryObj.getLabel());
          jsonArray.add(new JsonPrimitive(categoryObj.getLabel()));
          metaCategoryList.add(new TextBytes(categoryObj.getLabel()));
        }
      }
      jsonObj.add("categories", jsonArray);
    }
  }

  private static void safeSetString(JsonObject jsonObj, String propertyName, String propertyValue) {
    if (propertyValue != null && propertyValue.length() != 0) {
      jsonObj.addProperty(propertyName, propertyValue);
    }
  }

  private static void safeSetInteger(JsonObject jsonObj, String propertyName, int propertyValue) {
    if (propertyValue != -1) {
      jsonObj.addProperty(propertyName, propertyValue);
    }
  }

  private Pair<JsonObject, String> parseHTMLDocument(URL baseURL, String rawHeaders,
      FlexBuffer data, HTMLContent contentMetaOut, Reporter reporter) throws IOException {
    ParseResult resultOut = new ParseResult();
    ParseWorker parseWorker = new ParseWorker();
    parseWorker.parseDocument(resultOut, 0, 0, baseURL, rawHeaders, data);
    if (resultOut.getParseSuccessful()) {
      return new Pair<JsonObject, String>(
          parseResultToJsonObject(baseURL, resultOut, contentMetaOut, reporter),
          resultOut.getText());
    }
    return null;
  }

  private Pair<JsonObject, String> parseHTMLSnippet(URL baseURL, String htmlSnippet,
      HTMLContent contentMetaOut, Reporter reporter) throws IOException {
    ParseResult resultOut = new ParseResult();
    ParseWorker parseWorker = new ParseWorker();
    parseWorker.parsePartialHTMLDocument(resultOut, baseURL, htmlSnippet);
    if (resultOut.getParseSuccessful()) {
      return new Pair<JsonObject, String>(
          parseResultToJsonObject(baseURL, resultOut, contentMetaOut, reporter),
          resultOut.getText());
    }
    return null;
  }

  private static String safeAppendContentFromString(StringBuffer buffer, String content) {
    if (content != null) {
      String contentTrimmed = content.trim();
      if (contentTrimmed.length() != 0) {
        if (buffer.length() != 0)
          buffer.append(" ");
        buffer.append(contentTrimmed);
      }
      return contentTrimmed;
    }
    return null;
  }

  private static String safeAppendContentFromContentObj(StringBuffer buffer, Content content) {
    if (content != null && content.getValue() != null) {
      String contentTrimmed =
          content.getValue().trim();
      if (contentTrimmed.length() != 0) {
        if (buffer.length() != 0)
          buffer.append(" ");
        buffer.append(contentTrimmed);
      }
      return contentTrimmed;
    }
    return null;
  }

  private static void safeAppendLinksFromFeed(JsonObject feedOrItemObj,
      ImmutableMap<String, String> validLinkTypes, List<FeedLink> feedMetaLinks, List links)
      throws IOException {
    for (Object link : links) {
      com.sun.syndication.feed.atom.Link linkObj = (com.sun.syndication.feed.atom.Link) link;
      if (linkObj.getHref() != null && linkObj.getRel() != null) {
        String canonicalHref = canonicalizeURL(linkObj.getHref());
        if (canonicalHref == null) {
          LOG.error("Failed to Canonicalize Link URL:" + linkObj.getHref());
        } else {
          if (validLinkTypes.keySet().contains(linkObj.getRel())) {
            JsonObject jsonLink = new JsonObject();
            FeedLink metaLink = new FeedLink();

            safeSetString(jsonLink, "type", linkObj.getType());
            if (linkObj.getType() != null)
              metaLink.setType(linkObj.getType());
            safeSetString(jsonLink, "href", canonicalHref);
            metaLink.setHref(canonicalHref);
            safeSetString(jsonLink, "rel", linkObj.getRel());
            metaLink.setRel(linkObj.getRel());
            safeSetString(jsonLink, "title", linkObj.getTitle());
            if (linkObj.getTitle() != null)
              metaLink.setTitle(linkObj.getTitle());
            feedMetaLinks.add(metaLink);

            // if a link of this rel type was already emitted, coalesce the
            // values into an array
            String linkName = validLinkTypes.get(linkObj.getRel());
            JsonElement existing = feedOrItemObj.get(linkName);
            if (existing != null) {
              JsonArray array = null;
              if (!existing.isJsonArray()) {
                array = new JsonArray();
                array.add(existing);
                feedOrItemObj.remove(linkName);
                feedOrItemObj.add(linkName, array);
              } else {
                array = existing.getAsJsonArray();
              }
              array.add(jsonLink);
            } else {
              feedOrItemObj.add(linkName, jsonLink);
            }
          }
        }
      }
    }
  }

  private static void safeAppendAuthorsFromFeed(JsonObject feedOrItemObj,
      List<FeedAuthor> metaAuthorList, List authors) throws IOException {
    if (authors.size() != 0) {
      JsonArray authorArray = new JsonArray();
      for (Object author : authors) {
        com.sun.syndication.feed.atom.Person authorObj =
            (com.sun.syndication.feed.atom.Person) author;
        if (authorObj.getName() != null) {
          JsonObject jsonAuthor = new JsonObject();
          FeedAuthor metaAuthor = new FeedAuthor();
          String canonicalURL = canonicalizeURL(authorObj.getUrl());
          safeSetString(jsonAuthor, "name", authorObj.getName());
          metaAuthor.setName(authorObj.getName());
          if (canonicalURL != null) {
            safeSetString(jsonAuthor, "url", canonicalURL);
            metaAuthor.setUrl(canonicalURL);
          }
          authorArray.add(jsonAuthor);
          metaAuthorList.add(metaAuthor);
        }
      }
      feedOrItemObj.add("authors", authorArray);
    }
  }

  private static void safeAppendLinkFromString(JsonObject jsonObj, List<FeedLink> metaLinks,
      String propertyName, String linkValue) throws IOException {
    if (linkValue != null && linkValue.length() != 0) {
      String canonicalURL = canonicalizeURL(linkValue);
      if (canonicalURL != null) {
        JsonObject jsonLink = new JsonObject();
        FeedLink metaLink = new FeedLink();
        jsonLink.addProperty("href", canonicalURL);
        metaLink.setHref(canonicalURL);
        jsonObj.add(propertyName, jsonLink);
        metaLinks.add(metaLink);
      }
      // TODO: REPORT FAILURE
    }
  }

  private Pair<JsonObject, String> rssFeedToJson(URL url, Channel channelObject,
      FeedContent feedMeta, Reporter reporter) throws IOException {
    JsonObject rssObject = new JsonObject();
    StringBuffer contentOut = new StringBuffer();

    rssObject.addProperty("type", "rss-feed");
    feedMeta.setType(FeedContent.Type.RSS);

    String feedTitle = cleanupDescription(channelObject.getTitle());
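    // Channel-level fields below are mirrored into two sinks: the JSON object
    // (serialized into the textual metadata record) and the FeedContent record
    // (the binary metadata). A hypothetical sketch of the JSON shape this
    // method produces (values illustrative, not from a real feed):
    //   { "type": "rss-feed", "title": "...", "link": { "href": "..." },
    //     "description": "...", "updated": 1325376000000,
    //     "categories": [ "..." ], "generator": "...", "ttl": 60,
    //     "items": [ { "title": "...", "link": { "href": "..." }, ... } ] }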
    rssObject.addProperty("title", safeAppendContentFromString(contentOut, feedTitle));
    if (feedTitle != null)
      feedMeta.setTitle(feedTitle);
    safeAppendLinkFromString(rssObject, feedMeta.getLinks(), "link", channelObject.getLink());
    String feedDesc = cleanupDescription(channelObject.getDescription());
    rssObject.addProperty("description", safeAppendContentFromString(contentOut, feedDesc));
    if (feedDesc != null)
      feedMeta.setDescription(feedDesc);

    if (channelObject.getLastBuildDate() != null) {
      safeSetDate(rssObject, "updated", channelObject.getLastBuildDate());
      feedMeta.setUpdated(channelObject.getLastBuildDate().getTime());
    } else if (channelObject.getPubDate() != null) {
      safeSetDate(rssObject, "updated", channelObject.getPubDate());
      feedMeta.setUpdated(channelObject.getPubDate().getTime());
    }
    setRSSCategories(rssObject, feedMeta.getCategories(), contentOut,
        channelObject.getCategories());
    safeSetString(rssObject, "generator", channelObject.getGenerator());
    if (channelObject.getGenerator() != null)
      feedMeta.setGenerator(channelObject.getGenerator());
    safeSetInteger(rssObject, "ttl", channelObject.getTtl());
    if (channelObject.getTtl() != -1)
      feedMeta.setTtl(channelObject.getTtl());

    JsonArray itemArray = new JsonArray();
    for (Object itemObj : channelObject.getItems()) {
      Item item = (Item) itemObj;
      JsonObject itemObject = new JsonObject();
      FeedItem metaItem = new FeedItem();

      String itemTitle = cleanupDescription(item.getTitle());
      itemObject.addProperty("title", safeAppendContentFromString(contentOut, itemTitle));
      if (itemTitle != null)
        metaItem.setTitle(itemTitle);
      String itemDesc = cleanupDescription(item.getDescription());
      itemObject.addProperty("description", safeAppendContentFromString(contentOut, itemDesc));
      if (itemDesc != null)
        metaItem.setDescription(itemDesc);
      safeAppendLinkFromString(itemObject, metaItem.getLinks(), "link", item.getLink());
      safeSetString(itemObject, "author", item.getAuthor());
      if (item.getAuthor() != null) {
        FeedAuthor metaAuthor = new FeedAuthor();
        metaAuthor.setName(item.getAuthor());
        metaItem.getAuthors().add(metaAuthor);
      }
      setRSSCategories(itemObject, metaItem.getCategories(), contentOut, item.getCategories());
      safeSetString(itemObject, "comments", item.getComments());
      safeSetDate(itemObject, "published", item.getPubDate());
      if (item.getPubDate() != null)
        metaItem.setPublished(item.getPubDate().getTime());
      if (item.getGuid() != null) {
        safeSetString(itemObject, "guid", item.getGuid().getValue());
        if (item.getGuid().getValue() != null)
          metaItem.setGuid(item.getGuid().getValue());
      }
      if (item.getContent() != null && item.getContent().getValue() != null) {
        if (item.getContent().getType() == null || item.getContent().getType().contains("html")) {
          HTMLContent metaContent = new HTMLContent();
          Pair<JsonObject, String> contentTuple =
              parseHTMLSnippet(url, item.getContent().getValue(), metaContent, reporter);
          // guard against a failed snippet parse (parseHTMLSnippet returns null)
          if (contentTuple != null) {
            metaItem.getEmbeddedLinks().addAll(metaContent.getLinks());
            if (contentTuple.e0 != null) {
              itemObject.add("content", contentTuple.e0);
            }
            if (contentTuple.e1 != null && contentTuple.e1.length() != 0) {
              safeAppendContentFromString(contentOut, contentTuple.e1);
            }
          }
        }
      }
      itemArray.add(itemObject);
    }
    rssObject.add("items", itemArray);
    return new Pair<JsonObject, String>(rssObject, contentOut.toString());
  }

  static ImmutableMap<String, String> validFeedLinks =
      new ImmutableMap.Builder<String, String>()
          .put("alternate", "link")
          .build();

  static ImmutableMap<String, String> feedEntryLinks =
      new ImmutableMap.Builder<String, String>()
          .put("alternate", "link")
          .put("self", "self")
          .put("replies", "replies")
          .build();

  private Pair<JsonObject, String> atomFeedToJson(URL url, Feed feedObject,
      FeedContent feedMeta, Reporter reporter) throws IOException {
    JsonObject jsonFeed = new JsonObject();
    StringBuffer contentOut = new StringBuffer();

    jsonFeed.addProperty("type", "atom-feed");
    feedMeta.setType(FeedContent.Type.ATOM);

    String title = cleanupDescription(feedObject.getTitle());
    jsonFeed.addProperty("title", safeAppendContentFromString(contentOut, title));
    if (title != null)
      feedMeta.setTitle(title);
    safeAppendLinksFromFeed(jsonFeed, validFeedLinks, feedMeta.getLinks(),
        feedObject.getAlternateLinks());
    safeAppendAuthorsFromFeed(jsonFeed, feedMeta.getAuthors(), feedObject.getAuthors());
    if (feedObject.getGenerator() != null) {
      safeSetString(jsonFeed, "generator", feedObject.getGenerator().getValue());
      if (feedObject.getGenerator().getValue() != null) {
        feedMeta.setGenerator(feedObject.getGenerator().getValue());
      }
    }
    safeSetDate(jsonFeed, "updated", feedObject.getUpdated());
    if (feedObject.getUpdated() != null) {
      feedMeta.setUpdated(feedObject.getUpdated().getTime());
    }
    setAtomCategories(jsonFeed, feedMeta.getCategories(), contentOut, feedObject.getCategories());

    JsonArray itemArray = new JsonArray();
    for (Object entry : feedObject.getEntries()) {
      Entry entryObj = (Entry) entry;
      JsonObject jsonEntry = new JsonObject();
      FeedItem metaItem = new FeedItem();

      String itemTitle = cleanupDescription(entryObj.getTitle());
      jsonEntry.addProperty("title", safeAppendContentFromString(contentOut, itemTitle));
      if (itemTitle != null)
        metaItem.setTitle(itemTitle);
      String itemDesc = cleanupDescription(entryObj.getSummary());
      jsonEntry.addProperty("description", safeAppendContentFromString(contentOut, itemDesc));
      if (itemDesc != null)
        metaItem.setDescription(itemDesc);
      safeSetDate(jsonEntry, "published", entryObj.getPublished());
      if (entryObj.getPublished() != null)
        metaItem.setPublished(entryObj.getPublished().getTime());
      safeSetDate(jsonEntry, "updated", entryObj.getUpdated());
      if (entryObj.getUpdated() != null)
        metaItem.setUpdated(entryObj.getUpdated().getTime());
      safeAppendLinksFromFeed(jsonEntry, feedEntryLinks, metaItem.getLinks(),
          entryObj.getAlternateLinks());
      safeAppendLinksFromFeed(jsonEntry, feedEntryLinks, metaItem.getLinks(),
          entryObj.getOtherLinks());
      safeAppendAuthorsFromFeed(jsonEntry, metaItem.getAuthors(), entryObj.getAuthors());
      setAtomCategories(jsonEntry, metaItem.getCategories(), contentOut, entryObj.getCategories());

      for (Object content : entryObj.getContents()) {
        com.sun.syndication.feed.atom.Content contentObj =
            (com.sun.syndication.feed.atom.Content) content;
        if (contentObj.getValue() != null && contentObj.getValue().length() != 0) {
          if (contentObj.getType() == null || contentObj.getType().contains("html")) {
            HTMLContent metaContent = new HTMLContent();
            Pair<JsonObject, String> contentTuple =
                parseHTMLSnippet(url, contentObj.getValue(), metaContent, reporter);
            // guard against a failed snippet parse (parseHTMLSnippet returns null)
            if (contentTuple != null) {
              metaItem.getEmbeddedLinks().addAll(metaContent.getLinks());
              if (contentTuple.e0 != null) {
                // coalesce multiple content objects into an array
                if (jsonEntry.has("content")) {
                  JsonArray array = null;
                  JsonElement existing = jsonEntry.get("content");
                  if (!existing.isJsonArray()) {
                    array = new JsonArray();
                    array.add(existing);
                    jsonEntry.remove("content");
                    jsonEntry.add("content", array);
                  } else {
                    array = existing.getAsJsonArray();
                  }
                  array.add(contentTuple.e0);
                } else {
                  jsonEntry.add("content", contentTuple.e0);
                }
              }
              if (contentTuple.e1 != null && contentTuple.e1.length() != 0) {
                safeAppendContentFromString(contentOut, contentTuple.e1);
              }
            }
          }
        }
      }
      itemArray.add(jsonEntry);
    }
    jsonFeed.add("items", itemArray);
    return new Pair<JsonObject, String>(jsonFeed, contentOut.toString());
  }
  private static final String feedEntryEnd = "</entry>";
  private static final String feedItemEnd = "</item>";

  private Pair<JsonObject, String> parseFeedDocument(URL baseURL, String rawHeaders,
      String feedContent, FeedContent feedMeta, boolean truncatedDocument, Reporter reporter)
      throws IOException {
    if (truncatedDocument) {
      LOG.warn("Fixing Up Truncated Doc:" + baseURL);
      // chop the document back to the last complete entry/item and close the
      // enclosing feed/channel elements so the XML parser has a chance
      int indexOfEntryEnd = feedContent.lastIndexOf(feedEntryEnd);
      if (indexOfEntryEnd != -1) {
        feedContent = feedContent.substring(0, indexOfEntryEnd + feedEntryEnd.length());
        feedContent += "</feed>";
      } else {
        int indexOfItemEnd = feedContent.lastIndexOf(feedItemEnd);
        if (indexOfItemEnd != -1) {
          feedContent = feedContent.substring(0, indexOfItemEnd + feedItemEnd.length());
          feedContent += "</channel></rss>";
        }
      }
    }

    InputSource source = new InputSource(new StringReader(feedContent));
    WireFeedInput input = new WireFeedInput();
    Pair<JsonObject, String> resultTuple = null;
    try {
      WireFeed feed = input.build(source);
      if (feed != null) {
        if (feed instanceof Channel) {
          reporter.incrCounter(Counters.TRYING_RSS_FEED_PARSER, 1);
          resultTuple = rssFeedToJson(baseURL, (Channel) feed, feedMeta, reporter);
          reporter.incrCounter(Counters.GOT_RSS_FEED, 1);
        } else if (feed instanceof Feed) {
          resultTuple = atomFeedToJson(baseURL, (Feed) feed, feedMeta, reporter);
          reporter.incrCounter(Counters.GOT_ATOM_FEED, 1);
        } else {
          reporter.incrCounter(Counters.FAILED_TO_ID_FEED, 1);
          LOG.error("Failed to ID Feed:" + baseURL);
        }
      }
    } catch (Exception e) {
      reporter.incrCounter(Counters.EXCEPTION_DURING_FEED_PARSE, 1);
      LOG.error("Failed to parse Feed:" + baseURL + "\n ContentLen:" + feedContent.length()
          + "\n with Exception:" + CCStringUtils.stringifyException(e));
    }
    return resultTuple;
  }

  private Pair<String, Pair<TextBytes, FlexBuffer>> populateContentMetadata(URL finalURL,
      CrawlURL value, Reporter reporter, JsonObject metadata, CrawlMetadata crawlMeta)
      throws IOException {

    FlexBuffer contentOut = null;
    String textOut = null;

    NIOHttpHeaders finalHeaders = NIOHttpHeaders.parseHttpHeaders(value.getHeaders());
    CrawlURLMetadata urlMetadata = new CrawlURLMetadata();

    // extract information from http headers ...
    HttpHeaderInfoExtractor.parseHeaders(finalHeaders, urlMetadata);
    // get the mime type ...
    String normalizedMimeType = urlMetadata.isFieldDirty(CrawlURLMetadata.Field_CONTENTTYPE)
        ? urlMetadata.getContentType() : "text/html";
    metadata.addProperty("mime_type", normalizedMimeType);
    crawlMeta.setMimeType(normalizedMimeType);
    // get download size ...
    int downloadSize = value.getContentRaw().getCount();
    // set original content len ...
    metadata.addProperty("download_size", downloadSize);
    crawlMeta.setDownloadSize(downloadSize);
    // set truncation flag
    if ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
      metadata.addProperty("download_truncated", true);
      crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.Download_Truncated);
    }

    if (downloadSize > 0) {
      // get content type, charset and encoding
      String encoding = finalHeaders.findValue("Content-Encoding");
      boolean isGZIP = false;
      if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
        isGZIP = true;
      }
      byte[] contentBytes = value.getContentRaw().getReadOnlyBytes();
      int contentLen = value.getContentRaw().getCount();
      // assume we are going to output original data ...
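      // Content pipeline from this point on: (1) start with the raw fetched
      // bytes, (2) if Content-Encoding was gzip, swap in the best-effort
      // gunzipped bytes (bounded by CrawlEnvironment.GUNZIP_SIZE_LIMIT),
      // (3) md5 whatever bytes survive, and (4) for text mime types, charset
      // decode the bytes and hand the text to the HTML or feed parser.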
      contentOut = new FlexBuffer(contentBytes, 0, contentLen);

      if (isGZIP) {
        metadata.addProperty("content_is_gzip", isGZIP);
        crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.ContentWas_GZIP);

        UnzipResult unzipResult = null;
        try {
          // LOG.info("BEFORE GUNZIP");
          unzipResult = GZIPUtils.unzipBestEffort(contentBytes, 0, contentLen,
              CrawlEnvironment.GUNZIP_SIZE_LIMIT);
        } catch (Exception e) {
          LOG.error(CCStringUtils.stringifyException(e));
        }

        if (unzipResult != null && unzipResult.data != null) {
          if (unzipResult.wasTruncated) {
            LOG.warn("Truncated Document During GZIP:" + finalURL);
            reporter.incrCounter(Counters.GUNZIP_DATA_TRUNCATED, 1);
          }
          contentBytes = unzipResult.data.get();
          contentLen = unzipResult.data.getCount();
          metadata.addProperty("gunzip_content_len", unzipResult.data.getCount());
          crawlMeta.setGunzipSize(unzipResult.data.getCount());
          // update content out ...
          contentOut = new FlexBuffer(contentBytes, 0, contentLen);
        } else {
          metadata.addProperty("gunzip_failed", true);
          crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.GUNZIP_Failed);
          reporter.incrCounter(Counters.GUNZIP_FAILED, 1);
          contentBytes = null;
          contentLen = 0;
          contentOut = null;
        }
        // LOG.info("AFTER GUNZIP");
      }

      if (contentBytes != null) {
        // ok compute an md5 hash
        MD5Hash md5Hash = MD5Hash.digest(contentBytes, 0, contentLen);
        metadata.addProperty("md5", md5Hash.toString());
        crawlMeta.setMd5(new FlexBuffer(md5Hash.getDigest(), 0, md5Hash.getDigest().length));

        // get normalized mime type
        if (MimeTypeFilter.isTextType(normalizedMimeType)) {
          // ok time to decode the data into ucs2 ...
          Pair<Pair<Integer, Charset>, String> decodeResult =
              CharsetUtils.bestEffortDecodeBytes(value.getHeaders(), contentBytes, 0, contentLen);
          // ok write out decode metadata
          metadata.addProperty("charset_detected", decodeResult.e0.e1.toString());
          crawlMeta.setCharsetDetected(decodeResult.e0.e1.toString());
          metadata.addProperty("charset_detector", decodeResult.e0.e0);
          crawlMeta.setCharsetDetector(decodeResult.e0.e0);
          // add appropriate http header (for detected charset)
          finalHeaders.add(Constants.ARCFileHeader_DetectedCharset, decodeResult.e0.e1.toString());
          // get the content
          String textContent = decodeResult.e1;
          // compute simhash
          long simhash = SimHash.computeOptimizedSimHashForString(textContent);
          metadata.addProperty("text_simhash", simhash);
          crawlMeta.setTextSimHash(simhash);

          // figure out simplified mime type ...
          MimeTypeDisposition mimeTypeDisposition =
              MimeTypeFilter.checkMimeTypeDisposition(normalizedMimeType);

          boolean parseComplete = false;
          Pair<JsonObject, String> tupleOut = null;

          // write it out
          if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_HTML) {
            // LOG.info("Parsing:" + finalURL.toString() + " Headers:" + value.getHeaders()
            //     + " ContentLen:" + contentLen);
            // ok parse as html
            tupleOut = parseHTMLDocument(finalURL, value.getHeaders(),
                new FlexBuffer(contentBytes, 0, contentLen), crawlMeta.getHtmlContent(), reporter);
            if (tupleOut == null) {
              reporter.incrCounter(Counters.FAILED_TO_PARSE_HTML, 1);
              LOG.error("Unable to Parse as HTML:" + finalURL.toString());
              mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
            } else {
              reporter.incrCounter(Counters.PARSED_HTML_DOC, 1);
              metadata.addProperty("parsed_as", "html");
              crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
              parseComplete = true;
            }
          }

          if (!parseComplete
              && (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED
                  || mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML)) {
            // ok try parse this document as a feed ...
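            // Disposition fallback ladder: a failed HTML parse demotes
            // ACCEPT_HTML to ACCEPT_TEXT; a failed feed parse demotes
            // ACCEPT_FEED (and, below, ACCEPT_XML) to ACCEPT_TEXT. The
            // ACCEPT_TEXT branch itself is currently a no-op (see the TODO
            // below), so such documents get metadata and raw content but no
            // parsed "content" object or extracted text.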
            tupleOut = parseFeedDocument(finalURL, value.getHeaders(), textContent,
                crawlMeta.getFeedContent(),
                ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0), reporter);
            if (tupleOut == null) {
              if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED) {
                reporter.incrCounter(Counters.FAILED_TO_PARSE_FEED_URL, 1);
                // TODO:HACK
                // LOG.info("Failed to Parse:" + finalURL + " RawContentLen:"
                //     + value.getContentRaw().getCount() + " ContentLen:" + contentLen
                //     + " Metadata:" + metadata.toString());
                mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
              }
            } else {
              reporter.incrCounter(Counters.PARSED_FEED_URL, 1);
              metadata.addProperty("parsed_as", "feed");
              crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
              parseComplete = true;
            }
          }

          if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML) {
            reporter.incrCounter(Counters.FAILED_TO_PARSE_XML_AS_FEED, 1);
            mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
          }

          if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_TEXT) {
            // LOG.info("Identified URL" + finalURL + " w/ mimetype:" + normalizedMimeType + " as text");
            // TODO: FIX THIS BUT PUNT FOR NOW :-(
            // tupleOut = new Pair<JsonObject,String>(null,textContent);
          }

          if (tupleOut != null) {
            if (tupleOut.e0 != null) {
              metadata.add("content", tupleOut.e0);
            }
            textOut = tupleOut.e1;
          }
        }
      }
    }
    return new Pair<String, Pair<TextBytes, FlexBuffer>>(textOut,
        new Pair<TextBytes, FlexBuffer>(new TextBytes(finalHeaders.toString()), contentOut));
  }

  static void safeSetJsonPropertyFromJsonProperty(JsonObject destinationObj,
      String destinationProperty, JsonElement sourceObj, String sourceProperty)
      throws IOException {
    if (sourceObj != null && sourceObj.isJsonObject()) {
      JsonElement sourceElement = sourceObj.getAsJsonObject().get(sourceProperty);
      if (sourceElement != null) {
        destinationObj.add(destinationProperty, sourceElement);
      }
    }
  }

  private static String canonicalizeURL(String sourceURL) throws IOException {
    if (sourceURL != null) {
      GoogleURL urlObject = new GoogleURL(sourceURL);
      return URLUtils.canonicalizeURL(urlObject, false);
    }
    return null;
  }

  int mapCalls = 0;

  @Override
  public void map(Text sourceURL, CrawlURL value,
      OutputCollector<Text, ParseOutput> output, Reporter reporter) throws IOException {
    // if not marked for early termination .. check if we reached that condition ...
    if (System.currentTimeMillis() > _killTime) {
      LOG.error("Expended Max Allowed Time for Mapper! Progress was at:" + _lastProgressValue);
      // bail from the task ...
      _terminatedEarly = true;
    }
    // if terminated early, just skip processing
    if (_terminatedEarly) {
      LOG.error("Mapper Already Terminated Early!");
      return;
    }
    // ok we are still good to go ...
    else {
      // OK, disable this whole code path since we turned off speculative execution for now ...
      // every 10 map calls ... check with tdc to see if we should fast fail this mapper ...
      // (_taskDataClient may be null when running standalone via main(),
      //  where configure() is never called)
      if (++mapCalls % 10 == 0 && _taskDataClient != null) {
        String badTaskDataValue = _taskDataClient.queryTaskData(BAD_TASK_TASKDATA_KEY);
        if (badTaskDataValue != null && badTaskDataValue.length() != 0) {
          throw new IOException("Fast Failing Blacklisted (by TDC) Mapper");
        }
      }
    }

    if (sourceURL.getLength() == 0) {
      LOG.error("Hit NULL URL. Original URL:" + value.getRedirectURL());
      return;
    }

    try {
      // allocate parse output
      ParseOutput parseOutput = new ParseOutput();
      // initialize segment id in output upfront ...
      parseOutput.setDestSegmentId(_segmentId);
      // json object out ...
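      // A hypothetical sketch of the metadata JSON assembled below for a
      // successful, non-redirected HTML fetch (all values illustrative):
      //   {
      //     "attempt_time": 1335312000000,
      //     "disposition": "SUCCESS",
      //     "server_ip": "10.0.0.1",
      //     "http_result": 200,
      //     "http_headers": { "response": "HTTP/1.0 200 OK", "content-type": "text/html" },
      //     "content_len": 12345,
      //     "mime_type": "text/html",
      //     "download_size": 12345,
      //     "md5": "9e107d9d372bb6826bd81d3542a419d6",
      //     "charset_detected": "UTF-8", "charset_detector": 1,
      //     "text_simhash": 1234567890123456789,
      //     "parsed_as": "html",
      //     "content": { "type": "html-doc", "title": "...", "links": [ ... ] }
      //   }
      // Redirected fetches additionally carry a "redirect_from" object, and
      // failed fetches carry "failure_reason" / "failure_detail" instead of
      // the http fields.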
      JsonObject jsonObj = new JsonObject();
      // and create a crawl metadata
      CrawlMetadata metadata = parseOutput.getCrawlMetadata();
      // and content (if available) ...
      Pair<String, Pair<TextBytes, FlexBuffer>> contentOut = null;

      // canonicalize the url (minimally)
      String canonicalURL = canonicalizeURL(sourceURL.toString());
      // if canonicalization failed ... bail early
      if (canonicalURL == null) {
        reporter.incrCounter(Counters.MALFORMED_FINAL_URL, 1);
        return;
      }
      URL originalURL = null;
      try {
        originalURL = new URL(canonicalURL);
      } catch (MalformedURLException e) {
        LOG.error("Malformed URL:" + CCStringUtils.stringifyException(e));
        reporter.incrCounter(Counters.MALFORMED_FINAL_URL, 1);
        return;
      }

      if (originalURL.getPath().endsWith("/robots.txt")) {
        reporter.incrCounter(Counters.SKIPPING_ROBOTS_TXT, 1);
        return;
      }

      URL finalURL = originalURL;

      jsonObj.addProperty("attempt_time", value.getLastAttemptTime());
      metadata.setAttemptTime(value.getLastAttemptTime());

      // first step write status
      jsonObj.addProperty("disposition",
          (value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? "SUCCESS" : "FAILURE");
      metadata.setCrawlDisposition(
          (byte) ((value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0 : 1));

      // deal with redirects ...
      if ((value.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
        Pair<URL, JsonObject> redirect = buildRedirectObject(originalURL, value, metadata, reporter);
        if (redirect == null) {
          return;
        }
        jsonObj.add("redirect_from", redirect.e1);
        finalURL = redirect.e0;
      }

      if (value.getLastAttemptResult() == CrawlURL.CrawlResult.FAILURE) {
        jsonObj.addProperty("failure_reason",
            CrawlURL.FailureReason.toString(value.getLastAttemptFailureReason()));
        metadata.setFailureReason(value.getLastAttemptFailureReason());
        jsonObj.addProperty("failure_detail", value.getLastAttemptFailureDetail());
        metadata.setFailureDetail(value.getLastAttemptFailureDetail());
      } else {
        jsonObj.addProperty("server_ip",
            IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
        metadata.setServerIP(value.getServerIP());
        jsonObj.addProperty("http_result", value.getResultCode());
        metadata.setHttpResult(value.getResultCode());
        jsonObj.add("http_headers",
            httpHeadersToJsonObject(NIOHttpHeaders.parseHttpHeaders(value.getHeaders())));
        metadata.setHttpHeaders(value.getHeaders());
        jsonObj.addProperty("content_len", value.getContentRaw().getCount());
        metadata.setContentLength(value.getContentRaw().getCount());

        if (value.getResultCode() >= 200 && value.getResultCode() <= 299
            && value.getContentRaw().getCount() > 0) {
          contentOut = populateContentMetadata(finalURL, value, reporter, jsonObj, metadata);
          if (metadata.isFieldDirty(CrawlMetadata.Field_CHARSETDETECTED)) {
            parseOutput.setDetectedCharset(metadata.getCharsetDetected());
          }
        }
      }

      // ok ... write stuff out ...
      reporter.incrCounter(Counters.WROTE_METADATA_RECORD, 1);

      //////////////////////////////////////////////////////////////
      // echo some stuff to parseOutput ...
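      // Fields duplicated out of the JSON metadata into first-class ParseOutput
      // members, so downstream consumers don't have to re-parse the JSON:
      // mime type, md5, simhash, host IP, and fetch time, plus (when available)
      // text content, final HTTP headers, and raw content.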
      parseOutput.setMetadata(jsonObj.toString());
      JsonElement mimeType = jsonObj.get("mime_type");
      if (mimeType != null) {
        parseOutput.setNormalizedMimeType(mimeType.getAsString());
      }
      JsonElement md5 = jsonObj.get("md5");
      if (md5 != null) {
        MD5Hash hash = new MD5Hash(md5.getAsString());
        byte[] bytes = hash.getDigest();
        parseOutput.setMd5Hash(new FlexBuffer(bytes, 0, bytes.length));
      }
      JsonElement simHash = jsonObj.get("text_simhash");
      if (simHash != null) {
        parseOutput.setSimHash(simHash.getAsLong());
      }
      parseOutput.setHostIPAddress(IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
      parseOutput.setFetchTime(value.getLastAttemptTime());
      ////////////////////////////////////////////////////////////

      if (contentOut != null) {
        if (contentOut.e0 != null) {
          parseOutput.setTextContent(contentOut.e0);
          reporter.incrCounter(Counters.WROTE_TEXT_CONTENT, 1);
        }
        if (contentOut.e1 != null) {
          // directly set the text bytes ...
          parseOutput.getHeadersAsTextBytes().set(contentOut.e1.e0);
          // mark it dirty !!!
          parseOutput.setFieldDirty(ParseOutput.Field_HEADERS);
          // if content available ...
          if (contentOut.e1.e1 != null) {
            parseOutput.setRawContent(contentOut.e1.e1);
          }
          reporter.incrCounter(Counters.WROTE_RAW_CONTENT, 1);
        }
      }

      // buildCompactMetadata(parseOutput,jsonObj,urlMap);

      output.collect(new Text(finalURL.toString()), parseOutput);
    } catch (Exception e) {
      LOG.error("Exception Processing URL:" + sourceURL.toString() + "\n"
          + CCStringUtils.stringifyException(e));
      if (e instanceof IOException)
        reporter.incrCounter(Counters.GOT_UNHANDLED_IO_EXCEPTION, 1);
      else
        reporter.incrCounter(Counters.GOT_UNHANDLED_RUNTIME_EXCEPTION, 1);
    } catch (OutOfMemoryError e) {
      LOG.fatal("Got Out of Memory Error Processing URL:" + sourceURL.toString() + "\n"
          + CCStringUtils.stringifyException(e));
      reporter.incrCounter(Counters.GOT_OUT_OF_MEMORY_ERROR, 1);
      // bail from the remainder of the map task
      _terminatedEarly = true;
    }
  }

  /**
   * inform the tdc (task data client) of task completion status ...
   * @throws IOException
   */
  public void commitTask(Reporter reporter) throws IOException {
    if (_terminatedEarly) {
      OutputCommitter.setTaskDataCommitInfo(BAD_TASK_TASKDATA_KEY, getRemainingSplitInfo());
      // _taskDataClient.updateTaskData(BAD_TASK_TASKDATA_KEY, getRemainingSplitInfo());
      reporter.incrCounter(Counters.PARTIALLY_PROCESSED_SPLIT, 1);
    } else {
      OutputCommitter.setTaskDataCommitInfo(GOOD_TASK_TASKDATA_KEY, getOriginalSplitInfo());
      // _taskDataClient.updateTaskData(GOOD_TASK_TASKDATA_KEY, getOriginalSplitInfo());
      reporter.incrCounter(Counters.FULLY_PROCESSED_SPLIT, 1);
    }
  }

  public void updateProgressAndPosition(double progress, long position) {
    _lastProgressValue = progress;
    _lastPosition = position;
  }

  double _lastProgressValue;
  long _lastPosition = 0L;
  long _segmentId;
  long _startTime;
  long _killTime;
  long _maxRunTime;
  boolean _terminatedEarly = false;
  TaskDataClient _taskDataClient;
  String _splitFile;
  long _splitStartPos;
  long _splitLength;

  /**
   * @return true if this mapper exited early due to a timeout ...
   */
  boolean wasTerminatedEarly() {
    return _terminatedEarly;
  }

  /**
   * @return the original split information
   */
  String getOriginalSplitInfo() {
    return _splitFile + ":" + _splitStartPos + "+" + _splitLength;
  }

  /**
   * @return the unprocessed portion of the split (after early termination)
   */
  String getRemainingSplitInfo() {
    // calculate remaining split length ...
    long splitRemaining = _splitLength - (_lastPosition - _splitStartPos);
    // and return a split for the remaining (unprocessed) portion of the split ...
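    // e.g. (hypothetical values): a split "part-00000:0+1000000" interrupted
    // at position 250000 yields "part-00000:250000+750000,part-00000:0+1000000",
    // i.e. "<file>:<resumePos>+<remaining>,<file>:<origStart>+<origLength>"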
    return _splitFile + ":" + _lastPosition + "+" + splitRemaining + ","
        + getOriginalSplitInfo();
  }

  @Override
  public void configure(JobConf job) {
    LOG.info("LIBRARY PATH:" + System.getenv().get("LD_LIBRARY_PATH"));
    _segmentId = job.getLong("cc.segmet.id", -1L);
    LOG.info("Job Conf says Segment Id is:" + _segmentId);
    _startTime = System.currentTimeMillis();
    _maxRunTime = job.getLong(MAX_MAPPER_RUNTIME_PROPERTY, DEFAULT_MAX_MAPPER_RUNTIME);
    LOG.info("Job Max Runtime (per config) is:" + _maxRunTime);
    _killTime = _startTime + _maxRunTime;
    // initialize the Task Data Client ...
    try {
      _taskDataClient = TaskDataUtils.getTaskDataClientForTask(job);
    } catch (IOException e) {
      LOG.fatal("Unable to Initialize Task Data Client with Error:"
          + CCStringUtils.stringifyException(e));
      // hard fail
      throw new RuntimeException("Unable to Initialize Task Data Client with Error:"
          + CCStringUtils.stringifyException(e));
    }
    _splitFile = job.get("map.input.file");
    _splitStartPos = job.getLong("map.input.start", -1);
    _splitLength = job.getLong("map.input.length", -1);
  }

  @Override
  public void close() throws IOException {
    _taskDataClient.shutdown();
  }

  private static class MockReporter implements Reporter {
    @Override
    public Counter getCounter(Enum<?> name) {
      return null;
    }

    @Override
    public Counter getCounter(String group, String name) {
      return null;
    }

    @Override
    public InputSplit getInputSplit() throws UnsupportedOperationException {
      return null;
    }

    @Override
    public void incrCounter(Enum<?> key, long amount) {}

    @Override
    public void incrCounter(String group, String counter, long amount) {}

    @Override
    public void setStatus(String status) {}

    @Override
    public void progress() {}

    // @Override public float getProgress() { return 0; }
  }

  /**
   * some test code ...
   *
   * @param args
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path pathToCrawlLog = new Path(args[0]);
    FileSystem fs = FileSystem.get(pathToCrawlLog.toUri(), conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, pathToCrawlLog, conf);

    Text url = new Text();
    CrawlURL urlData = new CrawlURL();
    ParserMapper mapper = new ParserMapper();
    // since configure() is never called in this standalone path, push the
    // kill time out so the time-based early-termination check doesn't trip
    mapper._killTime = Long.MAX_VALUE;
    MockReporter reporter = new MockReporter();
    final JsonParser parser = new JsonParser();

    while (reader.next(url, urlData)) {
      mapper.map(url, urlData, new OutputCollector<Text, ParseOutput>() {
        @Override
        public void collect(Text key, ParseOutput value) throws IOException {
          long timeStart = System.currentTimeMillis();
          JsonObject metadata = parser.parse(
              new JsonReader(new StringReader(value.getMetadata()))).getAsJsonObject();
          long timeEnd = System.currentTimeMillis();

          System.out.println("Key:" + key.toString() + " Parse Took:" + (timeEnd - timeStart));
          System.out.println("Key:" + key.toString() + " Metadata Size:"
              + value.getMetadataAsTextBytes().getLength());
          System.out.println("Key:" + key.toString() + " Text-Size:"
              + value.getTextContentAsTextBytes().getLength());
          System.out.println("Key:" + key.toString() + " RAW-Size:"
              + value.getRawContent().getCount());
          System.out.println("Key:" + key.toString() + " Metadata:");
          System.out.println(JSONUtils.prettyPrintJSON(metadata));
          System.out.println("Key:" + key.toString() + " Text:");
          System.out.println(value.getTextContent());
        }
      }, reporter);
    }
    reader.close();
  }
}