/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.queryserver.master;
import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.file.tfile.TFile;
import org.commoncrawl.util.CrawlDatum;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.ArchiveInfo;
import org.commoncrawl.protocol.CrawlDatumAndMetadata;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.protocol.SubDomainMetadata;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.protocol.shared.ArcFileHeaderItem;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.commoncrawl.service.queryserver.ClientQueryInfo;
import org.commoncrawl.service.queryserver.DomainListQueryInfo;
import org.commoncrawl.service.queryserver.DomainURLListQueryInfo;
import org.commoncrawl.service.queryserver.InlinkingDomainInfo;
import org.commoncrawl.service.queryserver.InlinksByDomainQueryInfo;
import org.commoncrawl.service.queryserver.URLLinkDetailQueryInfo;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2.MasterDatabaseIndex.MetadataOut;
import org.commoncrawl.service.queryserver.master.MasterServer.BlockingQueryResult;
import org.commoncrawl.service.queryserver.query.DomainListQuery;
import org.commoncrawl.service.queryserver.query.DomainURLListQuery;
import org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery;
import org.commoncrawl.service.queryserver.query.QueryResult;
import org.commoncrawl.service.queryserver.query.QueryResultRecord;
import org.commoncrawl.service.queryserver.query.URLLinksQuery;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.MurmurHash;
import org.commoncrawl.util.ProtocolStatus;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;
import org.mortbay.jetty.servlet.Context;
import org.mortbay.jetty.servlet.DefaultServlet;
import com.google.gson.stream.JsonWriter;
/**
*
* @author rana
*
*/
public class QueryServerFE {
/** Logger shared by the front end and all of its nested servlet classes. */
private static final Log LOG = LogFactory.getLog(QueryServerFE.class);
/**
 * Master server handle. It is static so the nested static servlet classes can reach it,
 * and it is overwritten every time a QueryServerFE instance is constructed.
 */
private static MasterServer _server;
/** Web application root directory; the img, css and js resource folders are resolved beneath it. */
private File _webAppRoot;
/** Returns the master server last handed to the constructor (null if none has been constructed yet). */
public static MasterServer getServer() { return _server; }
/**
 * Hooks the query front end into the master server's web server.
 * <p>
 * Stores the server in the static {@code _server} slot (read by the nested servlets),
 * registers one servlet per query endpoint, and then exposes the img, css and js
 * folders found under the web application root as static resources.
 *
 * @param server     master server whose web server receives the servlets
 * @param webAppRoot directory that contains the img/, css/ and js/ resource folders
 * @throws IOException declared by the original signature
 */
public QueryServerFE(MasterServer server, File webAppRoot) throws IOException {
  _server = server;
  _webAppRoot = webAppRoot;

  // servlet backed query endpoints
  _server.getWebServer().addServlet("domainListQuery", "/domainListQuery.jsp", DomainListQueryServlet.class);
  _server.getWebServer().addServlet("domainDetail", "/domainDetail.jsp", DomainDataQueryServlet.class);
  _server.getWebServer().addServlet("urlDetail", "/urlDetail.jsp", URLDetailServlet.class);
  _server.getWebServer().addServlet("linkDetail", "/linkDetail.jsp", LinkDetailsServlet.class);
  _server.getWebServer().addServlet("contentDetail", "/getCachedContent.jsp", URLContentServlet.class);
  _server.getWebServer().addServlet("getCrawlList", "/getCrawlList.jsp", CrawlListServlet.class);
  LOG.info("Adding GetInverseByDomain Servet");
  _server.getWebServer().addServlet("getInverseByDomain", "/getInverseLinksByDomain.jsp",
      InverseURLListByRootDomainQueryServlet.class);

  // static resource folders, each served by Jetty's DefaultServlet under its own context
  for (String folder : new String[] { "img", "css", "js" }) {
    Context resourceContext =
        new Context(_server.getWebServer().getContextHandlerCollection(), "/" + folder);
    resourceContext.setResourceBase(_webAppRoot.getAbsolutePath() + "/" + folder + "/");
    resourceContext.addServlet(DefaultServlet.class, "/");
  }
}
/**
 * Servlet that turns a search pattern into a paged list of domains.
 * <p>
 * Request parameters: {@code pattern} (required), {@code page_no} (1-based),
 * {@code page_size}, {@code sort_by}, {@code sort_order} ("ASC" selects ascending,
 * any other value descending, a missing value is treated as "ASC") and
 * {@code render_type} ("x-json" selects the application/x-json content type,
 * anything else text/plain).
 * <p>
 * The pattern is examined in this order:
 * <ol>
 * <li>if it parses as a valid URL, a small JSON object carrying isURL=1 and the URL's host is returned;</li>
 * <li>if it is a valid domain name, the domain metadata is fetched directly from the database index;</li>
 * <li>otherwise it must compile as a regular expression and is sent out as a blocking query.</li>
 * </ol>
 */
@SuppressWarnings("serial")
public static class DomainListQueryServlet extends HttpServlet {
  /**
   * Handles the domain list request described on the class.
   *
   * @throws IOException when a parameter is missing/invalid or the query throws; failures
   *         are re-thrown as an IOException whose message is the stringified exception
   */
  @Override
  protected void doGet(final HttpServletRequest request, final HttpServletResponse response) throws ServletException, IOException {
    //LOG.info("Received Request:" +request.toString());
    try {
      String pattern = request.getParameter("pattern");
      // page_no arrives 1-based and is converted to a zero-based offset
      int page_no = Integer.parseInt(request.getParameter("page_no")) - 1;
      int page_size = Integer.parseInt(request.getParameter("page_size"));
      String sortBy = request.getParameter("sort_by");
      String sortOrder = request.getParameter("sort_order");
      String renderType = request.getParameter("render_type");
      if (pattern == null) {
        throw new IOException("Invalid Search Pattern Specified");
      }
      // a missing sort_order previously caused a NullPointerException below;
      // default to ascending, the same fallback LinkDetailsServlet applies
      if (sortOrder == null) {
        sortOrder = "ASC";
      }
      // ok see if this is a valid url ...
      GoogleURL urlObject = new GoogleURL(pattern);
      if (urlObject.isValid()) {
        // ok, detected a direct url query ...
        // formulate a specific response ...
        PrintWriter writer = response.getWriter();
        JsonWriter jsonWriter = new JsonWriter(writer);
        try {
          jsonWriter.beginObject();
          jsonWriter.name("isURL");
          jsonWriter.value(1);
          jsonWriter.name("domainName");
          jsonWriter.value(urlObject.getHost());
          jsonWriter.endObject();
          return;
        } catch (Exception e) {
          LOG.error(CCStringUtils.stringifyException(e));
          throw new IOException(e);
        }
      }
      // build domain query info
      DomainListQueryInfo queryInfo = new DomainListQueryInfo();
      // set search pattern parameter
      queryInfo.setSearchPattern(pattern);
      // initialize paging info
      ClientQueryInfo clientQueryInfo = new ClientQueryInfo();
      clientQueryInfo.setSortByField(sortBy);
      clientQueryInfo.setPageSize(page_size);
      clientQueryInfo.setPaginationOffset(page_no);
      if (sortOrder.equalsIgnoreCase("ASC"))
        clientQueryInfo.setSortOrder(ClientQueryInfo.SortOrder.ASCENDING);
      else
        clientQueryInfo.setSortOrder(ClientQueryInfo.SortOrder.DESCENDING);
      DomainListQuery query = new DomainListQuery(queryInfo);
      try {
        BlockingQueryResult<Text, SubDomainMetadata> result = null;
        // ok first see if the search pattern is actually a domain name
        if (URLUtils.isValidDomainName(pattern)) {
          LOG.info("Pattern like a valid domain:" + pattern);
          // ok in this case, do a direct query to get domain metadata ...
          SubDomainMetadata metadata = _server.getDatabaseIndex().queryDomainMetadataGivenDomainName(pattern);
          // ok construct a result object
          QueryResult<Text, SubDomainMetadata> resultInner = new QueryResult<Text, SubDomainMetadata>();
          result = new BlockingQueryResult<Text, SubDomainMetadata>(resultInner);
          // the direct lookup counts as a success even when nothing was found (empty row list)
          result.querySucceeded = true;
          if (metadata != null) {
            LOG.info("Found Metadata for Domain:" + pattern);
            // populate it
            resultInner.getResults().add(new QueryResultRecord<Text, SubDomainMetadata>(new Text(pattern), metadata));
            resultInner.setTotalRecordCount(1);
          }
          else {
            LOG.info("Failed to find Metadata for Domain:" + pattern);
          }
        }
        else {
          // ok ... see if this is a regular expression
          try {
            Pattern.compile(pattern);
          }
          catch (PatternSyntaxException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            throw new IOException("Invalid Regular Expression Syntax in Search Pattern!");
          }
          // ok at this point assume pattern is good .. do a parallel query ...
          result = _server.blockingQueryRequest(query,clientQueryInfo);
        }
        if (result.querySucceeded) {
          OutputStream outStream;
          try {
            outStream = response.getOutputStream();
            PrintWriter writer = new PrintWriter(outStream);
            if (renderType != null && renderType.equalsIgnoreCase("x-json")) {
              response.setContentType("application/x-json");
            }
            else {
              response.setContentType("text/plain");
            }
            // grid style payload: total, current (1-based) page and one "cell" array per row
            writer.write("{\"isURL\":0,\"total\":"+result.resultObject.getTotalRecordCount() +",");
            writer.write("\"page\":"+ (page_no + 1) +",");
            writer.write("\"rows\":[");
            int count = 0;
            for (QueryResultRecord<Text,SubDomainMetadata> record : result.resultObject.getResults()) {
              if (count++ != 0)
                writer.write(",");
              writer.write("{ cell:[");
              writer.write(quote(record.getKey().toString()) + ",");
              writer.print(record.getValue().getUrlCount());
              writer.write("] }\n");
            }
            writer.append("]}");
            writer.flush();
            outStream.close();
          }
          catch (IOException e) {
            // a failure while streaming the response is logged but not propagated
            LOG.error(CCStringUtils.stringifyException(e));
          }
        }
        else {
          OutputStream outStream;
          try {
            outStream = response.getOutputStream();
            PrintWriter writer = new PrintWriter(outStream);
            response.setContentType("text/plain");
            writer.append("Query Failed with Error:\n");
            writer.append(result.errorString);
            writer.flush();
            outStream.close();
          }
          catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
          }
        }
      } catch (IOException e) {
        LOG.error("Query Failed with Exception:" + CCStringUtils.stringifyException(e));
        throw e;
      }
    }
    catch (Exception e) {
      throw new IOException(CCStringUtils.stringifyException(e));
    }
  }
}
/**
 * Maps ProtocolStatus codes to the short lowercase names that
 * getStatusStringFromMetadata appends to a status string.
 * Codes without an entry here have no name and are left out by that method.
 */
private static final HashMap<Integer, String> codeToName =
    new HashMap<Integer, String>();
static {
  // Integer.valueOf replaces the deprecated Integer(int) boxing constructor
  codeToName.put(Integer.valueOf(ProtocolStatus.FAILED), "failed");
  codeToName.put(Integer.valueOf(ProtocolStatus.GONE), "gone");
  codeToName.put(Integer.valueOf(ProtocolStatus.MOVED), "moved");
  codeToName.put(Integer.valueOf(ProtocolStatus.TEMP_MOVED), "temp_moved");
  codeToName.put(Integer.valueOf(ProtocolStatus.NOTFOUND), "notfound");
  codeToName.put(Integer.valueOf(ProtocolStatus.RETRY), "retry");
  codeToName.put(Integer.valueOf(ProtocolStatus.EXCEPTION), "exception");
  codeToName.put(Integer.valueOf(ProtocolStatus.ACCESS_DENIED), "access_denied");
  codeToName.put(Integer.valueOf(ProtocolStatus.ROBOTS_DENIED), "robots_denied");
  codeToName.put(Integer.valueOf(ProtocolStatus.REDIR_EXCEEDED), "redir_exceeded");
  codeToName.put(Integer.valueOf(ProtocolStatus.NOTFETCHING), "notfetching");
  codeToName.put(Integer.valueOf(ProtocolStatus.NOTMODIFIED), "notmodified");
}
/**
 * Builds the display status for a crawl record.
 * <p>
 * The base value is the CrawlDatum status name with its first {@code "db_".length()}
 * characters removed. When the protocol status field is set, is greater than
 * ProtocolStatus.SUCCESS and has an entry in codeToName, that entry is appended
 * after a "-" separator.
 *
 * @param metadata record to describe
 * @return the status string, optionally suffixed with "-" plus the protocol status name
 */
public static String getStatusStringFromMetadata(CrawlDatumAndMetadata metadata) {
  String status = CrawlDatum.getStatusName(metadata.getStatus()).substring("db_".length());

  // only a set protocol status above SUCCESS can contribute a suffix
  boolean hasNonSuccessProtocolStatus =
      metadata.isFieldDirty(CrawlDatumAndMetadata.Field_PROTOCOLSTATUS)
          && metadata.getProtocolStatus() > ProtocolStatus.SUCCESS;
  if (!hasNonSuccessProtocolStatus) {
    return status;
  }

  String protocolStatusName = codeToName.get((int)metadata.getProtocolStatus());
  return (protocolStatusName == null) ? status : status + "-" + protocolStatusName;
}
/**
 * Servlet that serves per-domain data. The {@code request_type} parameter selects the mode:
 * <ul>
 * <li>{@code domainStats} - writes one JSON object with the domain's counters;</li>
 * <li>{@code urlList} - runs a blocking DomainURLListQuery and writes a paged row list
 *     (requires page_no, page_size, sort_by and sort_order; page_size is clamped to 1..1000).</li>
 * </ul>
 * The domain is identified either by {@code id} (numeric) or, when no id is given, by
 * {@code name}, which is mapped to an id through the database index.
 * <p>
 * Content type: domainStats answers application/x-json when render_type is absent or
 * "x-json"; urlList answers application/x-json only when render_type is "x-json".
 */
@SuppressWarnings("serial")
public static class DomainDataQueryServlet extends HttpServlet {
  /**
   * Handles the domain data request described on the class.
   *
   * @throws IOException for missing parameters, unknown domains or any failure while
   *         querying; the message is the stringified original exception
   */
  @Override
  protected void doGet(final HttpServletRequest request, final HttpServletResponse response) throws ServletException, IOException {
    LOG.info("Received Request:" +request.toString());
    String errorResult = null;
    try {
      String requestType = request.getParameter("request_type");
      String domainName = request.getParameter("name");
      String renderType = request.getParameter("render_type");
      String domainIdParam = request.getParameter("id");
      // fail with a readable message instead of a bare NullPointerException
      if (requestType == null) {
        errorResult = "request_type parameter not specified!";
        LOG.error(errorResult);
        throw new IOException(errorResult);
      }
      long domainId = -1;
      if (domainIdParam != null) {
        domainId = Long.parseLong(domainIdParam);
      }
      if (requestType.equalsIgnoreCase("domainStats")) {
        if (domainId == -1) {
          // get domain name and map it to a domain id
          domainId = _server.getDatabaseIndex().queryDomainIdGivenDomain(domainName);
        }
        // get domain metadata
        SubDomainMetadata metadata = _server.getDatabaseIndex().queryDomainMetadataGivenDomainId(domainId);
        if (metadata != null) {
          OutputStream outStream = response.getOutputStream();
          PrintWriter writer = new PrintWriter(outStream);
          try {
            if (renderType == null || renderType.equalsIgnoreCase("x-json")) {
              response.setContentType("application/x-json");
            }
            else {
              response.setContentType("text/plain");
            }
            //response.setContentType("text/plain");
            writer.write("{\"name\":"+ quote(metadata.getDomainText()) +",");
            writer.write("\"urls\":"+ metadata.getUrlCount() +",");
            writer.write("\"fetched\":"+ metadata.getFetchedCount() +",");
            writer.write("\"gone\":"+ metadata.getGoneCount() +",");
            writer.write("\"redirectsPerm\":"+ metadata.getRedirectPermCount() +",");
            writer.write("\"redirectsTemp\":"+ metadata.getRedirectTemporaryCount() +",");
            writer.write("\"pageRank\":"+ metadata.getHasPageRankCount() +",");
            writer.write("\"arcfileInfo\":"+ metadata.getHasArcFileInfoCount() +",");
            writer.write("\"outlinkData\":"+ metadata.getHasLinkListCount() +",");
            // last member: no trailing comma, otherwise the object is not valid JSON
            writer.write("\"inlinkData\":"+ metadata.getHasInverseLinkListCount());
            writer.write("}");
            writer.flush();
            outStream.close();
          }
          catch (IOException e) {
            errorResult = CCStringUtils.stringifyException(e);
            LOG.error(errorResult);
            throw e;
          }
        }
        else {
          errorResult = "Unable to locate metadata from Domain:" + domainName;
          LOG.error(errorResult);
          throw new IOException(errorResult);
        }
      }
      else if (requestType.equalsIgnoreCase("urlList")) {
        // build domain query info
        final DomainURLListQueryInfo queryInfo = new DomainURLListQueryInfo();
        if (domainId == -1) {
          // get domain name and map it to a domain id
          domainId = _server.getDatabaseIndex().queryDomainIdGivenDomain(domainName);
        }
        // set it into the query info
        queryInfo.setDomainId(domainId);
        // create query object ...
        DomainURLListQuery query = new DomainURLListQuery(queryInfo);
        // initialize pagination info
        int page_no = -1;
        int page_size = 0;
        String sortBy = "";
        String sortOrder = "";
        // initialize paging info
        ClientQueryInfo clientQueryInfo = new ClientQueryInfo();
        if (request.getParameter("page_no") == null
            || request.getParameter("page_size") == null
            || request.getParameter("sort_by") == null
            || request.getParameter("sort_order") == null) {
          errorResult = "Invalid Pagination Data";
          LOG.error(errorResult);
          throw new IOException(errorResult);
        }
        // page_no arrives 1-based; page_size is clamped to the range 1..1000
        page_no = Integer.parseInt(request.getParameter("page_no")) - 1;
        page_size = Math.min(Math.max(1,Integer.parseInt(request.getParameter("page_size"))),1000);
        sortBy = request.getParameter("sort_by");
        sortOrder = request.getParameter("sort_order");
        clientQueryInfo.setSortByField(sortBy);
        clientQueryInfo.setPageSize(page_size);
        clientQueryInfo.setPaginationOffset(page_no);
        if (sortOrder.equalsIgnoreCase("ASC"))
          clientQueryInfo.setSortOrder(ClientQueryInfo.SortOrder.ASCENDING);
        else
          clientQueryInfo.setSortOrder(ClientQueryInfo.SortOrder.DESCENDING);
        // issue a blocking query request ...
        BlockingQueryResult<URLFPV2, CrawlDatumAndMetadata> result = _server.blockingQueryRequest(query,clientQueryInfo);
        if (result.querySucceeded) {
          OutputStream outStream = response.getOutputStream();
          PrintWriter writer = new PrintWriter(outStream);
          try {
            if (renderType != null && renderType.equalsIgnoreCase("x-json")) {
              response.setContentType("application/x-json");
            }
            else {
              response.setContentType("text/plain");
            }
            writer.write("{\"total\":"+result.resultObject.getTotalRecordCount() +",");
            writer.write("\"page\":"+ (page_no + 1) +",");
            writer.write("\"rows\":[");
            int recordCount = 0;
            for (QueryResultRecord<URLFPV2,CrawlDatumAndMetadata> record : result.resultObject.getResults()) {
              if (recordCount++ != 0)
                writer.write(",");
              writer.write("{ cell:[");
              //URL
              writer.write(quote(record.getValue().getUrl())+",");
              //STATUS
              writer.write(quote(getStatusStringFromMetadata(record.getValue()))+",");
              // FETCHTIME
              if (record.getValue().getMetadata().isFieldDirty(CrawlURLMetadata.Field_LASTFETCHTIMESTAMP)) {
                writer.write(quote(new Date(record.getValue().getMetadata().getLastFetchTimestamp()).toString())+",");
              }
              else {
                // no fetch time: the cell is left empty (kept as-is, clients may depend on it)
                writer.write(",");
              }
              // PAGE RANK
              writer.print(record.getValue().getMetadata().getPageRank());
              writer.write(",");
              //CRAWLNO
              writer.print(record.getValue().getMetadata().getCrawlNumber());
              writer.write(",");
              // PARSENO
              writer.print(record.getValue().getMetadata().getParseNumber());
              writer.write(",");
              // UPLOADNO
              writer.print(record.getValue().getMetadata().getUploadNumber());
              writer.write("]}\n");
            }
            writer.write("]}");
            writer.flush();
            outStream.close();
          }
          catch (IOException e) {
            errorResult = CCStringUtils.stringifyException(e);
            LOG.error(errorResult);
          }
        }
        else {
          errorResult = result.errorString;
        }
      }
      // errors that did not throw (failed query, failed write) are reported as plain text
      if (errorResult != null) {
        response.setContentType("text/plain");
        OutputStream outStream = response.getOutputStream();
        PrintWriter writer = new PrintWriter(outStream);
        writer.write(errorResult);
        writer.close();
        outStream.flush();
      }
    }
    catch (Exception e) {
      throw new IOException(CCStringUtils.stringifyException(e));
    }
  }
}
/**
 * Formatter for the date based path prefix ("yyyy/MM/dd/"). SimpleDateFormat is not
 * thread-safe, so every use is guarded by synchronizing on this instance; the field is
 * final so that the lock object can never be swapped out.
 */
private static final SimpleDateFormat S3_TIMESTAMP_FORMAT = new SimpleDateFormat("yyyy/MM/dd/");
/**
 * Builds the arc file path "yyyy/MM/dd/&lt;partNo&gt;/&lt;date&gt;_&lt;partNo&gt;.arc.gz".
 *
 * @param arcFileDate   value passed to {@code new Date(long)} for the date prefix and also
 *                      used verbatim as the first part of the file name
 * @param arcFilePartNo part number, used as a directory level and in the file name
 * @return the path; the date prefix is rendered in the formatter's (JVM default) time zone
 */
private static String hdfsNameToS3ArcFileName(long arcFileDate, int arcFilePartNo) {
  String arcFileName = Long.toString(arcFileDate) + "_" + arcFilePartNo + ".arc.gz";
  synchronized (S3_TIMESTAMP_FORMAT) {
    return S3_TIMESTAMP_FORMAT.format(new Date(arcFileDate)) + arcFilePartNo + "/" + arcFileName;
  }
}
/**
 * Servlet that dumps everything the master index knows about a single URL.
 * <p>
 * Request parameters: {@code name} (the URL, URL-decoded as UTF-8 before use) and
 * {@code render_type} ("x-json" selects the application/x-json content type, anything
 * else text/plain).
 * <p>
 * The URL is fingerprinted, its metadata is looked up through the database index (with a
 * fallback through hackTryAlternativeRouteToGetMetadata when nothing is found) and the
 * result is written as a brace-delimited list of {@code key:value} pairs with unquoted
 * keys. When no metadata is available the plain text "NO METADATA AVAILABLE!:" is written.
 */
@SuppressWarnings("serial")
public static class URLDetailServlet extends HttpServlet {
  /**
   * Handles the url detail request described on the class.
   *
   * @throws IOException when the name parameter is missing, the URL cannot be
   *         fingerprinted, or the lookup fails; the message is the stringified exception
   */
  @Override
  protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    LOG.info("Received Request:" +request.toString());
    try {
      String urlName = request.getParameter("name");
      if (urlName == null) {
        throw new IOException("name parameter not specified!");
      }
      else {
        urlName = URLDecoder.decode(urlName,"UTF-8");
      }
      // compute fingerprint for given url
      URLFPV2 fingerprint = URLUtils.getURLFPV2FromURL(urlName);
      // ok if null error out
      if (fingerprint == null) {
        throw new IOException("Invalid URL:" + urlName);
      }
      String renderType = request.getParameter("render_type");
      try {
        // ok query the master index for the metadata related to the url
        MetadataOut metadataOut = _server.getDatabaseIndex().queryMetadataAndURLGivenFP(fingerprint);
        //TODO:HACK - REMOVE LATER!!!
        if (metadataOut == null) {
          HackMetadataOut hackedMetadata = hackTryAlternativeRouteToGetMetadata(urlName,fingerprint);
          if (hackedMetadata != null) {
            metadataOut = hackedMetadata.metadataOut;
            // report the url the metadata was actually found under
            urlName = hackedMetadata.alternativeURL;
          }
        }
        if (metadataOut != null && metadataOut.datumAndMetadataBytes.getLength() != 0) {
          // deserialize the stored record
          DataInputBuffer readerStream = new DataInputBuffer();
          CrawlDatumAndMetadata realMetadataObject = new CrawlDatumAndMetadata();
          readerStream.reset(metadataOut.datumAndMetadataBytes.getBytes(),metadataOut.datumAndMetadataBytes.getOffset(),metadataOut.datumAndMetadataBytes.getLength());
          realMetadataObject.readFields(readerStream);
          OutputStream outStream;
          try {
            outStream = response.getOutputStream();
            PrintWriter writer = new PrintWriter(outStream);
            if (renderType != null && renderType.equalsIgnoreCase("x-json")) {
              response.setContentType("application/x-json");
            }
            else {
              response.setContentType("text/plain");
            }
            // every entry is written as key:value followed by "\n," which separates it from the next one
            writer.write("{\n");
            // DOMAIN HASH
            writer.write("domainHash:");
            writer.write(quote(Long.toString(fingerprint.getDomainHash()))+"\n,");
            // FINGERPRINT
            writer.write("urlHash:");
            writer.write(quote(Long.toString(fingerprint.getUrlHash()))+"\n,");
            // CANONICAL URL
            writer.write("canonicalURL:");
            writer.write(quote(urlName)+"\n,");
            //STATUS
            writer.write("status:");
            writer.write(quote(getStatusStringFromMetadata(realMetadataObject))+"\n,");
            //CONTENT LOCATION
            //writer.write("contentLocation:");
            //writer.write(quote(data.getMetadata().getContentFileNameAndPos())+"\n,");
            writer.write("hasArcFileData:");
            writer.print(realMetadataObject.getMetadata().getArchiveInfo().size() != 0 ? 1 : 0 );
            writer.write("\n,");
            if (realMetadataObject.getMetadata().getArchiveInfo().size() != 0) {
              // sort ascending by arc file date so the last element is the most recent archive entry
              Collections.sort(realMetadataObject.getMetadata().getArchiveInfo(), new Comparator<ArchiveInfo> () {
                @Override
                public int compare(ArchiveInfo o1, ArchiveInfo o2) {
                  return (o1.getArcfileDate() < o2.getArcfileDate()) ? -1 : (o1.getArcfileDate() > o2.getArcfileDate()) ? 1 : 0;
                }
              });
              ArchiveInfo info = realMetadataObject.getMetadata().getArchiveInfo().get(realMetadataObject.getMetadata().getArchiveInfo().size()-1);
              //ARC FILE DATE
              writer.write("arcFileDate:");
              writer.print(info.getArcfileDate());
              writer.write("\n,");
              //ARC FILE INDEX
              // fixed: this entry used to emit the arc file date a second time
              writer.write("arcFileIndex:");
              writer.print(info.getArcfileIndex());
              writer.write("\n,");
              //ARC FILE OFFSET
              writer.write("arcFileOffset:");
              writer.print(info.getArcfileOffset());
              writer.write("\n,");
              // ARC FILE PATH
              writer.write("arcFilePath:");
              writer.print(quote(hdfsNameToS3ArcFileName(info.getArcfileDate(), info.getArcfileIndex())));
              writer.write("\n,");
              // ARC FILE SIZE
              writer.write("arcFileCompressedSize:");
              writer.print(info.getCompressedSize());
              writer.write("\n,");
            }
            //CONTENT TYPE
            // NOTE(review): unlike the other string values this one is wrapped in quotes by hand
            // and therefore not escaped -- confirm whether quote() should be used here too
            writer.write("contentType:");
            writer.write("\"" + realMetadataObject.getMetadata().getContentType() + "\"");
            writer.write("\n,");
            writer.write("hasFetchMetadata:");
            writer.print(realMetadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_LASTFETCHTIMESTAMP) ? 1 : 0 );
            writer.write("\n,");
            if (realMetadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_LASTFETCHTIMESTAMP)) {
              // FETCHTIME
              writer.write("lastFetchTime:");
              writer.print(realMetadataObject.getMetadata().getLastFetchTimestamp());
              writer.write("\n,");
              //LAST FETCH SIZE
              writer.write("lastFetchSize:");
              writer.print(realMetadataObject.getMetadata().getLastFetchSize());
              writer.write("\n,");
            }
            // PAGE RANK
            writer.write("pageRank:");
            writer.print(realMetadataObject.getMetadata().getPageRank());
            writer.write("\n,");
            if (realMetadataObject.getRedirectLocation().length() !=0) {
              writer.write("RedirectLocation:");
              writer.print(quote(realMetadataObject.getRedirectLocation()));
              writer.write("\n,");
              String hostName = URLUtils.fastGetHostFromURL(realMetadataObject.getRedirectLocation());
              if (hostName == null)
                hostName = "";
              writer.write("RedirectDomain:");
              writer.print(quote(hostName));
              writer.write("\n,");
            }
            if (realMetadataObject.getModifiedTime() > 0) {
              writer.write("metadataLastModified:");
              writer.print(realMetadataObject.getModifiedTime());
              writer.write("\n,");
            }
            // optional http header derived fields, emitted only when set on the record
            if (realMetadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_HTTPDATE)) {
              writer.write("httpDate:");
              writer.print(realMetadataObject.getMetadata().getHttpDate());
              writer.write("\n,");
            }
            if (realMetadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_LASTMODIFIEDTIME)) {
              writer.write("httpLastModified:");
              writer.print(realMetadataObject.getMetadata().getLastModifiedTime());
              writer.write("\n,");
            }
            if (realMetadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_MAXAGE)) {
              writer.write("maxAge:");
              writer.print(realMetadataObject.getMetadata().getMaxAge());
              writer.write("\n,");
            }
            if (realMetadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_EXPIRES)) {
              writer.write("httpExpires:");
              writer.print(realMetadataObject.getMetadata().getExpires());
              writer.write("\n,");
            }
            if (realMetadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_ETAG)) {
              writer.write("httpETag:");
              writer.print(quote(realMetadataObject.getMetadata().getETag()));
              writer.write("\n,");
            }
            if (realMetadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_CACHECONTROLFLAGS) && realMetadataObject.getMetadata().getCacheControlFlags() != 0 ) {
              // render the cache control bit mask as a space separated list of flag names
              writer.write("httpCacheFlags:");
              String flags = "";
              if ((CrawlURLMetadata.CacheControlFlags.NO_CACHE & realMetadataObject.getMetadata().getCacheControlFlags()) != 0) {
                flags += "nocache ";
              }
              if ((CrawlURLMetadata.CacheControlFlags.NO_STORE & realMetadataObject.getMetadata().getCacheControlFlags()) != 0) {
                flags += "nostore ";
              }
              if ((CrawlURLMetadata.CacheControlFlags.VARY & realMetadataObject.getMetadata().getCacheControlFlags()) != 0) {
                flags += "vary ";
              }
              if ((CrawlURLMetadata.CacheControlFlags.MUST_REVALIDATE & realMetadataObject.getMetadata().getCacheControlFlags()) != 0) {
                flags += "must_revalidate ";
              }
              if ((CrawlURLMetadata.CacheControlFlags.PRIVATE & realMetadataObject.getMetadata().getCacheControlFlags()) != 0) {
                flags += "private ";
              }
              writer.print(quote(flags));
              writer.write("\n,");
            }
            writer.write("hasOutlinkData:");
            writer.print(realMetadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_LINKDBOFFSET) ? 1 : 0 );
            writer.write("\n,");
            if (realMetadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_LINKDBOFFSET)) {
              // LINKDBTIMESTAMP
              writer.write("linkdbTS:");
              writer.print(realMetadataObject.getMetadata().getLinkDBTimestamp());
              writer.write("\n,");
              // LINKDBTIMEFILENO
              writer.write("linkdbFileNo:");
              writer.print(realMetadataObject.getMetadata().getLinkDBFileNo());
              writer.write("\n,");
              // LINKDBPOFFSET
              writer.write("linkdbOffset:");
              writer.print(realMetadataObject.getMetadata().getLinkDBOffset());
              writer.write("\n,");
            }
            // last unconditional entry: no separator after it unless inlink details follow
            writer.write("hasInlinkData:");
            writer.print(realMetadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_INVERSEDBOFFSET) ? 1 : 0 );
            writer.write("\n");
            if (realMetadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_INVERSEDBOFFSET)) {
              writer.write(",");
              // INVLINKDBTIMESTAMP
              writer.write("InverseLinkdbTS:");
              writer.print(realMetadataObject.getMetadata().getInverseDBTimestamp());
              writer.write("\n,");
              // LINKDBTIMEFILENO
              writer.write("InverseLinkdbFileNo:");
              writer.print(realMetadataObject.getMetadata().getInverseDBFileNo());
              writer.write("\n,");
              // LINKDBPOFFSET
              writer.write("InverseLinkdbOffset:");
              writer.print(realMetadataObject.getMetadata().getInverseDBOffset());
            }
            writer.write("}\n");
            writer.flush();
            outStream.close();
          }
          catch (IOException e) {
            // a failure while streaming the response is logged but not propagated
            LOG.error(CCStringUtils.stringifyException(e));
          }
        }
        else {
          OutputStream outStream;
          try {
            outStream = response.getOutputStream();
            PrintWriter writer = new PrintWriter(outStream);
            response.setContentType("text/plain");
            writer.append("NO METADATA AVAILABLE!:\n");
            writer.flush();
            outStream.close();
          }
          catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
          }
        }
      } catch (IOException e) {
        LOG.error("Query Failed with Exception:" + CCStringUtils.stringifyException(e));
        throw e;
      }
    }
    catch (Exception e) {
      throw new IOException(CCStringUtils.stringifyException(e));
    }
  }
}
@SuppressWarnings("serial")
public static class LinkDetailsServlet extends HttpServlet {
/**
 * Runs a link query for one URL and renders the result.
 * <p>
 * Request parameters: name (the URL, URL-decoded as UTF-8), query_type (matched
 * case-insensitively against the string form of the four URLLinkDetailQueryInfo.QueryType
 * values), render_type, page_no (1-based), page_size, sort_by and sort_order
 * (defaults to "ASC" when absent).
 * <p>
 * For every query type except INVERSE_BY_DOMAIN_DETAIL_QUERY a link db location
 * (linkdbTS / linkdbFileNo / linkdbOffset) is needed; it is taken from the request when all
 * three parameters are present, otherwise from the URL's metadata in the master index.
 * INVERSE_BY_DOMAIN_DETAIL_QUERY instead requires urlCount and urlDataOffset.
 * <p>
 * Responds with status 404 when name or query_type is missing/unrecognized, or when the
 * required location information cannot be obtained. Any exception or failed query is
 * logged and reported through response.sendError(500, ...).
 */
@Override
protected void doGet(final HttpServletRequest request, final HttpServletResponse response)throws ServletException, IOException {
LOG.info("Received Request:" +request.toString());
String errorResult = null;
try {
String urlName = request.getParameter("name");
String renderType = request.getParameter("render_type");
String queryTypeStr = request.getParameter("query_type");
// -1 marks "no recognized query type"
int queryType = -1;
if (queryTypeStr != null) {
if (queryTypeStr.equalsIgnoreCase(URLLinkDetailQueryInfo.QueryType.toString(URLLinkDetailQueryInfo.QueryType.LINKS_QUERY))) {
queryType = URLLinkDetailQueryInfo.QueryType.LINKS_QUERY;
}
else if (queryTypeStr.equalsIgnoreCase(URLLinkDetailQueryInfo.QueryType.toString(URLLinkDetailQueryInfo.QueryType.INVERSE_QUERY))) {
queryType = URLLinkDetailQueryInfo.QueryType.INVERSE_QUERY;
}
else if (queryTypeStr.equalsIgnoreCase(URLLinkDetailQueryInfo.QueryType.toString(URLLinkDetailQueryInfo.QueryType.INVERSE_BY_DOMAIN_QUERY))) {
queryType = URLLinkDetailQueryInfo.QueryType.INVERSE_BY_DOMAIN_QUERY;
}
else if (queryTypeStr.equalsIgnoreCase(URLLinkDetailQueryInfo.QueryType.toString(URLLinkDetailQueryInfo.QueryType.INVERSE_BY_DOMAIN_DETAIL_QUERY))) {
queryType = URLLinkDetailQueryInfo.QueryType.INVERSE_BY_DOMAIN_DETAIL_QUERY;
}
}
// build domain query info
final URLLinkDetailQueryInfo queryInfo = new URLLinkDetailQueryInfo();
// set query type
queryInfo.setQueryType(queryType);
// reject requests without a url or with an unrecognized query type
if (urlName == null || urlName.length() ==0 || queryType == -1) {
LOG.error("urlName:" + urlName + " query_type:" + queryTypeStr);
response.setStatus(404);
return;
}
urlName = URLDecoder.decode(urlName,"UTF-8");
URLFPV2 fingerprint = URLUtils.getURLFPV2FromURL(urlName);
if (fingerprint == null) {
throw new IOException("Invalid URL Passed Into Query: " + urlName);
}
queryInfo.setTargetURLFP(fingerprint);
if (queryType != URLLinkDetailQueryInfo.QueryType.INVERSE_BY_DOMAIN_DETAIL_QUERY) {
// we need metadata location information ...
// check to see the link db information is present in the request
String linkdbTS = request.getParameter("linkdbTS");
String linkdbFileNo = request.getParameter("linkdbFileNo");
String linkdbOffset = request.getParameter("linkdbOffset");
if (linkdbTS == null || linkdbFileNo == null || linkdbOffset == null) {
//LOG.info("LinkDB Location Hint not present in query. Issuing URL Detail Query to obtain information");
// need to query the local master index for the metadata
MetadataOut metadataOut = _server.getDatabaseIndex().queryMetadataAndURLGivenFP(fingerprint);
//TODO: HACK - REMOVE LATER !!!
if (metadataOut == null) {
HackMetadataOut hacked = hackTryAlternativeRouteToGetMetadata(urlName,fingerprint);
if (hacked != null) {
metadataOut = hacked.metadataOut;
}
}
if (metadataOut == null || metadataOut.datumAndMetadataBytes.getLength() == 0) {
//LOG.info("Unable to Obtain URLDetail for source url:" + urlName);
response.setStatus(404);
return;
}
else {
// deserialize the stored record to read the link db location from it
DataInputBuffer inputBuffer = new DataInputBuffer();
inputBuffer.reset(
metadataOut.datumAndMetadataBytes.getBytes(),
metadataOut.datumAndMetadataBytes.getOffset(),
metadataOut.datumAndMetadataBytes.getLength());
CrawlDatumAndMetadata metadata = new CrawlDatumAndMetadata();
metadata.readFields(inputBuffer);
// outgoing link queries use the link db location, every other type the inverse db location
if (queryType == URLLinkDetailQueryInfo.QueryType.LINKS_QUERY) {
if (metadata.getMetadata().isFieldDirty(CrawlURLMetadata.Field_LINKDBFILENO)) {
linkdbTS = Long.toString(metadata.getMetadata().getLinkDBTimestamp());
linkdbFileNo = Integer.toString(metadata.getMetadata().getLinkDBFileNo());
linkdbOffset = Long.toString(metadata.getMetadata().getLinkDBOffset());
}
}
else {
if (metadata.getMetadata().isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) {
linkdbTS = Long.toString(metadata.getMetadata().getInverseDBTimestamp());
linkdbFileNo = Integer.toString(metadata.getMetadata().getInverseDBFileNo());
linkdbOffset = Long.toString(metadata.getMetadata().getInverseDBOffset());
}
}
}
}
// still incomplete after the metadata lookup: nothing to query against
if (linkdbTS == null || linkdbFileNo == null || linkdbOffset == null) {
LOG.error("No LinkDB Information found for URL:" + urlName);
response.setStatus(404);
return;
}
queryInfo.setLinkDBTS(Long.parseLong(linkdbTS));
queryInfo.setLinkDBFileNo(Integer.parseInt(linkdbFileNo));
queryInfo.setLinkDBOffset(Long.parseLong(linkdbOffset));
}
// NOTE(review): this condition is always true here, since it is the exact complement of the if above
else if (queryType == URLLinkDetailQueryInfo.QueryType.INVERSE_BY_DOMAIN_DETAIL_QUERY) {
String urlCount = request.getParameter("urlCount");
String urlDataOffset = request.getParameter("urlDataOffset");
if (urlCount == null || urlDataOffset == null) {
LOG.error("No Inverse Link Metadata Found for:" + urlName);
response.setStatus(404);
return;
}
queryInfo.setInlinkDomainURLCount(Integer.parseInt(urlCount));
queryInfo.setUrlDataOffset(Long.parseLong(urlDataOffset));
}
//LOG.info("URLFP is:" + searchFingerprint.getUrlHash());
// initialize paging info
ClientQueryInfo clientQueryInfo = new ClientQueryInfo();
// page_no arrives 1-based and is converted to a zero-based offset; a missing or
// non-numeric page_no/page_size throws and ends up in the generic 500 handling below
int page_no = Integer.parseInt(request.getParameter("page_no")) - 1;
int page_size = Integer.parseInt(request.getParameter("page_size"));
String sortBy = request.getParameter("sort_by");
String sortOrder = request.getParameter("sort_order");
if (sortOrder == null) {
sortOrder="ASC";
}
clientQueryInfo.setSortByField(sortBy);
clientQueryInfo.setPageSize(page_size);
clientQueryInfo.setPaginationOffset(page_no);
if (sortOrder.equalsIgnoreCase("ASC"))
clientQueryInfo.setSortOrder(ClientQueryInfo.SortOrder.ASCENDING);
else
clientQueryInfo.setSortOrder(ClientQueryInfo.SortOrder.DESCENDING);
// allocate query object ...
URLLinksQuery query = new URLLinksQuery(queryInfo);
//LOG.info("Starting Blocking URLLinksQuery request");
// and send it through ....
BlockingQueryResult<Writable,Writable> result = _server.blockingQueryRequest(query,clientQueryInfo);
if (result.querySucceeded) {
try {
// pick the renderer by query type; the by-domain detail query shares the plain links renderer
if (queryType == URLLinkDetailQueryInfo.QueryType.LINKS_QUERY
|| queryType == URLLinkDetailQueryInfo.QueryType.INVERSE_QUERY) {
produceLinksQueryResults(request,response,renderType,result,page_no);
}
else if (queryType == URLLinkDetailQueryInfo.QueryType.INVERSE_BY_DOMAIN_QUERY) {
produceInverseLinksByDomainQueryResults(request,response,renderType,result,page_no);
}
else if (queryType == URLLinkDetailQueryInfo.QueryType.INVERSE_BY_DOMAIN_DETAIL_QUERY) {
produceLinksQueryResults(request,response,renderType,result,page_no);
}
}
catch (IOException e) {
errorResult = CCStringUtils.stringifyException(e);
LOG.error(errorResult);
}
}
else {
LOG.error("Blocking URLLinksQuery request failed");
errorResult = result.errorString;
}
}
catch (Exception e) {
// any failure above is turned into an error string and reported below
errorResult = CCStringUtils.stringifyException(e);
}
if (errorResult != null) {
LOG.error("LinkDetailQuery failed with error:" + errorResult);
response.sendError(500, errorResult);
}
}
/**
 * Placeholder renderer for inverse-links-by-domain detail query results.
 *
 * The body is empty, so calling this method writes nothing to the response.
 * The query-type dispatch visible earlier in this class sends
 * INVERSE_BY_DOMAIN_DETAIL_QUERY results to produceLinksQueryResults instead
 * of calling this method.
 *
 * @param request    the servlet request (unused)
 * @param response   the servlet response (unused)
 * @param renderType the requested render type (unused)
 * @param result     the completed query result (unused)
 * @param pageNo     the page number (unused)
 * @throws IOException declared but never thrown by the current empty body
 */
private void produceInverseLinksByDomainDetailQueryResults(
HttpServletRequest request, HttpServletResponse response,
String renderType, BlockingQueryResult<Writable, Writable> result,
int pageNo)throws IOException {
// intentionally empty: no output is produced
}
/**
 * Renders one page of an inverse-links-by-domain query as a grid payload.
 *
 * The payload has the shape
 * {"total":T,"page":P,"rows":[{ cell:[key,urlCount,urlDataPos]}, ...]}
 * where key is the quoted string form of the record key and the two numbers
 * come from the record's InlinkingDomainInfo value. The content type is
 * "application/x-json" when renderType equals "x-json" (case-insensitive) and
 * "text/plain" otherwise. The response output stream is closed on completion.
 *
 * @param request    the servlet request (not read by this method)
 * @param response   the servlet response that receives the payload
 * @param renderType requested render type; may be null
 * @param result     the completed query result to render
 * @param page_no    zero-based page index; reported to the client as page_no + 1
 * @throws IOException if the response stream cannot be obtained or closed
 */
private void produceInverseLinksByDomainQueryResults(
    HttpServletRequest request, HttpServletResponse response,
    String renderType, BlockingQueryResult<Writable, Writable> result,
    int page_no)throws IOException {
  OutputStream responseStream = response.getOutputStream();
  PrintWriter out = new PrintWriter(responseStream);

  boolean jsonRequested = renderType != null && renderType.equalsIgnoreCase("x-json");
  response.setContentType(jsonRequested ? "application/x-json" : "text/plain");

  // envelope header
  out.write("{\"total\":"+result.resultObject.getTotalRecordCount() +",");
  out.write("\"page\":"+ (page_no + 1) +",");
  out.write("\"rows\":[");

  boolean firstRow = true;
  for (QueryResultRecord<Writable,Writable> row : result.resultObject.getResults()) {
    // rows are comma separated; no separator before the first one
    if (!firstRow) {
      out.write(",");
    }
    firstRow = false;

    out.write("{ cell:[");
    // domain / key column
    out.write(quote(row.getKey().toString())+",");
    InlinkingDomainInfo info = (InlinkingDomainInfo)row.getValue();
    // string concatenation is deliberate: write(int) would emit a single char
    out.write(info.getUrlCount()+",");
    out.write(Long.toString(info.getUrlDataPos()));
    out.write("]}\n");
  }

  out.write("]}");
  out.flush();
  responseStream.close();
}
}
/**
 * Renders one page of a link query (outgoing, inverse, or by-domain detail)
 * as a grid payload.
 *
 * The payload has the shape
 * {"total":T,"page":P,"rows":[{ cell:[url,pageRank,status,host]}, ...]}.
 * Each row is built from the record's CrawlDatumAndMetadata value: the quoted
 * URL, the page rank, the quoted status string from
 * getStatusStringFromMetadata, and the quoted host name (empty string when
 * the host cannot be extracted). The content type is "application/x-json"
 * when renderType equals "x-json" (case-insensitive), "text/plain" otherwise.
 * The response output stream is closed on completion.
 *
 * @param request    the servlet request (not read by this method)
 * @param response   the servlet response that receives the payload
 * @param renderType requested render type; may be null
 * @param result     the completed query result to render
 * @param page_no    zero-based page index; reported to the client as page_no + 1
 * @throws IOException if the response stream cannot be obtained or closed
 */
private static void produceLinksQueryResults(final HttpServletRequest request, final HttpServletResponse response,String renderType,BlockingQueryResult<Writable,Writable> result,int page_no)throws IOException {
  OutputStream responseStream = response.getOutputStream();
  PrintWriter out = new PrintWriter(responseStream);

  String contentType = "text/plain";
  if (renderType != null && renderType.equalsIgnoreCase("x-json")) {
    contentType = "application/x-json";
  }
  response.setContentType(contentType);

  // envelope header
  out.write("{\"total\":"+result.resultObject.getTotalRecordCount() +",");
  out.write("\"page\":"+ (page_no + 1) +",");
  out.write("\"rows\":[");

  boolean firstRow = true;
  for (QueryResultRecord<Writable,Writable> row : result.resultObject.getResults()) {
    if (!firstRow) {
      out.write(",");
    }
    firstRow = false;

    out.write("{ cell:[");
    CrawlDatumAndMetadata datum = (CrawlDatumAndMetadata)row.getValue();

    // URL
    out.write(quote(datum.getUrl())+",");
    // page rank
    float pageRank = datum.getMetadata().getPageRank();
    out.print(pageRank);
    out.print(",");
    // status
    out.print(quote(getStatusStringFromMetadata(datum)));
    out.print(",");
    // domain name (empty when the host cannot be extracted)
    String hostName = URLUtils.fastGetHostFromURL(datum.getUrl());
    if (hostName == null) {
      hostName = "";
    }
    out.print(quote(hostName));
    out.write("]}\n");
  }

  out.write("]}");
  out.flush();
  responseStream.close();
}
/**
 * Rebuilds a URL string from the components of a parsed GoogleURL, producing
 * an alternative spelling of the same URL.
 *
 * The rebuilt URL is scheme://[user[:password]@]host[:port][path][?query] with
 * these adjustments:
 * - a single trailing "." is removed from the host;
 * - "www." is prepended when the host does not already start with "www." and
 *   URLUtils.extractRootDomainName reports the host as its own root domain;
 * - port "80" is omitted;
 * - the path is cut at the first ';';
 * - no fragment is appended.
 *
 * NOTE(review): components are compared to GoogleURL.emptyString by identity
 * (!=), which presumably relies on GoogleURL handing back that exact sentinel
 * instance for absent components - confirm against GoogleURL.
 *
 * @param urlObject the parsed URL to rebuild
 * @return the rebuilt URL string
 * @throws MalformedURLException declared by the signature; nothing in this body throws it directly
 */
public static String reverseCanonicalizeURL(GoogleURL urlObject)throws MalformedURLException {
  StringBuilder rebuilt = new StringBuilder();
  rebuilt.append(urlObject.getScheme()).append("://");

  // optional credentials: user[:password]@
  if (urlObject.getUserName() != GoogleURL.emptyString) {
    rebuilt.append(urlObject.getUserName());
    if (urlObject.getPassword() != GoogleURL.emptyString) {
      rebuilt.append(":").append(urlObject.getPassword());
    }
    rebuilt.append("@");
  }

  // host: drop a trailing dot, then add "www." to a bare root domain
  String host = urlObject.getHost();
  if (host.endsWith(".")) {
    host = host.substring(0,host.length() -1);
  }
  if (!host.startsWith("www.")) {
    // hack: only a host that is exactly its own root domain gets the prefix
    String rootName = URLUtils.extractRootDomainName(host);
    if (host.equals(rootName)) {
      host = "www." + host;
    }
  }
  rebuilt.append(host);

  // port, unless absent or 80
  String port = urlObject.getPort();
  if (port != GoogleURL.emptyString && !port.equals("80")) {
    rebuilt.append(":").append(port);
  }

  // path, truncated at the first ';'
  String path = urlObject.getPath();
  if (path != GoogleURL.emptyString) {
    int semiColonPos = path.indexOf(';');
    rebuilt.append(semiColonPos != -1 ? path.substring(0,semiColonPos) : path);
  }

  // query string
  String query = urlObject.getQuery();
  if (query != GoogleURL.emptyString) {
    rebuilt.append("?").append(query);
  }

  return rebuilt.toString();
}
/**
 * Simple holder pairing a metadata lookup result with the alternative URL
 * string that was used to obtain it.
 */
static class HackMetadataOut {
  /** The alternative URL string associated with the metadata. */
  String alternativeURL;
  /** The metadata record associated with the alternative URL. */
  MetadataOut metadataOut;

  /**
   * @param alternativeURL the alternative URL string
   * @param metadataOut    the metadata record found for that URL
   */
  HackMetadataOut(String alternativeURL,MetadataOut metadataOut) {
    this.metadataOut = metadataOut;
    this.alternativeURL = alternativeURL;
  }
}
/**
 * Fallback metadata lookup that retries with alternative spellings of a URL
 * when the lookup by its original fingerprint found nothing.
 *
 * Two alternatives are tried in order, each only when its fingerprint differs
 * from the one already tried by the caller:
 * 1. URLUtils.canonicalizeURL(urlName, true) (per the original author's note,
 *    the canonical form with a leading www stripped);
 * 2. the string produced by reverseCanonicalizeURL, when it differs from the
 *    canonical form.
 *
 * @param urlName     the URL as supplied by the client
 * @param fingerprint the fingerprint that was already looked up without success
 * @return the alternative URL together with its metadata, or null when
 *         neither alternative yields metadata
 * @throws IOException if URL handling or the index lookup fails
 */
public static HackMetadataOut hackTryAlternativeRouteToGetMetadata(String urlName,URLFPV2 fingerprint)throws IOException {
  // first alternative: the canonical form
  String canonicalURL = URLUtils.canonicalizeURL(urlName, true);
  URLFPV2 canonicalFP = URLUtils.getURLFPV2FromCanonicalURL(canonicalURL);
  if (canonicalFP != null && canonicalFP.compareTo(fingerprint) != 0) {
    MetadataOut metadataOut = _server.getDatabaseIndex().queryMetadataAndURLGivenFP(canonicalFP);
    if (metadataOut != null) {
      return new HackMetadataOut(canonicalURL, metadataOut);
    }
  }

  // second alternative: the reverse-canonical form
  GoogleURL urlObject = new GoogleURL(urlName);
  String reverseCanonical = reverseCanonicalizeURL(urlObject);
  // equals() instead of compareTo() so a null canonicalURL cannot raise an NPE here
  if (!reverseCanonical.equals(canonicalURL)) {
    URLFPV2 reverseCanonicalFP = URLUtils.getURLFPV2FromCanonicalURL(reverseCanonical);
    if (reverseCanonicalFP != null && reverseCanonicalFP.compareTo(fingerprint) != 0) {
      MetadataOut metadataOut = _server.getDatabaseIndex().queryMetadataAndURLGivenFP(reverseCanonicalFP);
      if (metadataOut != null) {
        // the result was previously constructed and discarded, so this route never succeeded
        return new HackMetadataOut(reverseCanonical, metadataOut);
      }
    }
  }

  // nothing found through either alternative
  return null;
}
/**
 * Servlet that returns the archived content of a URL.
 *
 * The "name" request parameter is the URL. Its metadata is looked up in the
 * database index (with a fallback through hackTryAlternativeRouteToGetMetadata),
 * the archive entry with the greatest arc file date is chosen, the item is
 * fetched through S3Helper.retrieveArcFileItem, and its stored headers and
 * content bytes are written to the response. A 404 is sent when no metadata,
 * no archive info, or no archive item is available. Any exception raised
 * while handling the request is rethrown as an IOException whose message is
 * the stringified original exception.
 */
@SuppressWarnings("serial")
public static class URLContentServlet extends HttpServlet {

  /** Returns true for stored headers that must not be replayed to the client. */
  private static boolean isSuppressedHeader(String headerName) {
    return headerName.equalsIgnoreCase("content-encoding")
        || headerName.equalsIgnoreCase("transfer-encoding")
        || headerName.equalsIgnoreCase("content-length");
  }

  /** Copies the item's stored headers to the response, skipping empty keys and suppressed headers. */
  private static void copyStoredHeaders(ArcFileItem item, HttpServletResponse response) {
    for (ArcFileHeaderItem headerItem : item.getHeaderItems()) {
      if (headerItem.getItemKey().length() == 0) {
        continue;
      }
      if (isSuppressedHeader(headerItem.getItemKey())) {
        LOG.info("*** Skipping Content Encoding Header Field:" + headerItem.getItemValue());
      }
      else {
        response.addHeader(headerItem.getItemKey(),headerItem.getItemValue());
      }
    }
  }

  /** Logs every archive entry and returns the one with the greatest arc file date (first one wins ties). */
  private static ArchiveInfo selectLatestArchiveInfo(CrawlDatumAndMetadata metadata) {
    ArchiveInfo latest = null;
    for (ArchiveInfo info : metadata.getMetadata().getArchiveInfo()) {
      LOG.info("***Found INFO:" + info.getArcfileDate()+"-"+info.getArcfileIndex()+"-"+info.getArcfileOffset());
      if (latest == null || latest.getArcfileDate() < info.getArcfileDate()) {
        latest = info;
      }
    }
    return latest;
  }

  @Override
  protected void doGet(HttpServletRequest request, HttpServletResponse response)throws ServletException, IOException {
    LOG.info("Received Request:" +request.toString());
    try {
      String urlName = request.getParameter("name");
      if (urlName == null) {
        throw new IOException("name parameter not specified!");
      }

      // parse the url into a fingerprint
      URLFPV2 fingerprint = URLUtils.getURLFPV2FromURL(urlName);
      if (fingerprint == null) {
        throw new IOException("Invalid URL:" + urlName);
      }

      // look up metadata via the index
      MetadataOut metadataOut = _server.getDatabaseIndex().queryMetadataAndURLGivenFP(fingerprint);
      //TODO:HACK - REMOVE LATER!!!
      if (metadataOut == null) {
        HackMetadataOut hacked = hackTryAlternativeRouteToGetMetadata(urlName,fingerprint);
        if (hacked != null) {
          metadataOut = hacked.metadataOut;
        }
      }
      if (metadataOut == null || metadataOut.datumAndMetadataBytes.getLength() == 0) {
        response.sendError(404);
        return;
      }

      // deserialize the datum + metadata record
      CrawlDatumAndMetadata metadata = new CrawlDatumAndMetadata();
      DataInputBuffer inputBuffer = new DataInputBuffer();
      inputBuffer.reset(
          metadataOut.datumAndMetadataBytes.getBytes(),
          metadataOut.datumAndMetadataBytes.getOffset(),
          metadataOut.datumAndMetadataBytes.getLength());
      metadata.readFields(inputBuffer);

      if (metadata.getMetadata().getArchiveInfo().size() == 0) {
        response.sendError(404);
        return;
      }

      ArchiveInfo infoToUse = selectLatestArchiveInfo(metadata);
      if (infoToUse == null) {
        response.sendError(404);
        return;
      }

      try {
        ArcFileItem item = S3Helper.retrieveArcFileItem(infoToUse, _server.getEventLoop());
        if (item == null) {
          response.sendError(404);
          return;
        }

        // replay stored headers, then state the length of the bytes actually sent
        copyStoredHeaders(item, response);
        response.addHeader("content-length", Integer.toString(item.getContent().getCount()));

        try {
          OutputStream outStream = response.getOutputStream();
          outStream.write(item.getContent().getReadOnlyBytes(),item.getContent().getOffset(),item.getContent().getCount());
          outStream.flush();
          outStream.close();
        }
        catch (IOException e) {
          LOG.error(CCStringUtils.stringifyException(e));
          throw e;
        }
      }
      catch (IOException e) {
        LOG.error("Query Failed with Exception:" + CCStringUtils.stringifyException(e));
        throw e;
      }
    }
    catch (Exception e) {
      throw new IOException(CCStringUtils.stringifyException(e));
    }
  }
}
/**
 * Finds the newest timestamp-named entry directly under a root path.
 *
 * Every child of rootPath (glob "*") whose name parses as a long is treated
 * as a candidate timestamp and the greatest one is returned. Children whose
 * names are not numeric are logged and skipped instead of aborting the whole
 * lookup with a NumberFormatException. A null listing is treated as empty.
 *
 * @param rootPath the directory whose children are named by timestamp
 * @return the selected timestamp, or -1 when no candidate was found
 * @throws IOException if the file system listing fails
 */
private static long findLatestDatabaseTimestamp(Path rootPath)throws IOException {
  FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
  FileStatus candidates[] = fs.globStatus(new Path(rootPath,"*"));
  long candidateTimestamp = -1L;
  // guard against a null listing so it is treated the same as "no candidates"
  if (candidates != null) {
    for (FileStatus candidate : candidates) {
      LOG.info("Found Seed Candidate:" + candidate.getPath());
      long timestamp;
      try {
        timestamp = Long.parseLong(candidate.getPath().getName());
      }
      catch (NumberFormatException e) {
        // not a timestamp directory; ignore it
        LOG.warn("Skipping Non-Timestamp Candidate:" + candidate.getPath());
        continue;
      }
      if (candidateTimestamp == -1 || candidateTimestamp < timestamp) {
        candidateTimestamp = timestamp;
      }
    }
  }
  LOG.info("Selected Candidate is:"+ candidateTimestamp);
  return candidateTimestamp;
}
/**
 * Servlet that prints, as plain text, the crawl list entries recorded for a
 * domain in the latest generator analysis output.
 *
 * The "name" request parameter (URL-decoded as UTF-8) is the domain name. The
 * domain id is looked up in the database index, mapped to a shard, and the
 * shard's TFile under crawl/generator/prgenerator/&lt;timestamp&gt;/analysis is
 * scanned for that domain id. Each (position, url hash) pair found there is
 * resolved to metadata and printed in position order. Any failure is rethrown
 * as an IOException whose message is the stringified original exception.
 */
@SuppressWarnings("serial")
public static class CrawlListServlet extends HttpServlet {
  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }

  /**
   * Formats a shard id as the zero-padded part-file suffix.
   * NumberFormat instances are not safe for concurrent use and this one is
   * shared by every request thread, so access is serialized.
   */
  private static String formatShardId(int shardId) {
    synchronized (NUMBER_FORMAT) {
      return NUMBER_FORMAT.format(shardId);
    }
  }

  @Override
  protected void doGet(HttpServletRequest request, HttpServletResponse response)throws ServletException, IOException {
    // hack
    String crawlerNames[] = {
      "ccc01-01",
      "ccc01-02",
      "ccc02-01",
      "ccc02-02",
      "ccc03-01",
      "ccc03-02",
      "ccc04-01",
      "ccc04-02"
    };
    LOG.info("Received Request:" +request.toString());
    try {
      String domainName = request.getParameter("name");
      if (domainName == null) {
        throw new IOException("name parameter not specified!");
      }
      else {
        domainName = URLDecoder.decode(domainName,"UTF-8");
      }
      // look up the domain id for the given domain name
      long domainId = _server.getDatabaseIndex().queryDomainIdGivenDomain(domainName);
      // figure out shard id ...
      //TODO: FIX HACKED SHARD EXTRACTION
      int shardId = (MurmurHash.hashLong(domainId,1) & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS;
      int crawlerIndex = Math.abs((int)(domainId % crawlerNames.length));
      // default generator path
      Path generatorPath = new Path("crawl/generator/prgenerator");
      // get latest timestamp
      long timestamp = findLatestDatabaseTimestamp(generatorPath);
      if (timestamp == -1) {
        throw new IOException("Timestamp Not Found!");
      }
      // build path to sharded data ...
      Path shardedDataLocation = new Path("crawl/generator/prgenerator/" + timestamp + "/analysis/part-" + formatShardId(shardId));
      LOG.info("Looking Up CrawlAnalysis File At:" + shardedDataLocation);
      response.setContentType("text/plain");
      PrintWriter writer = new PrintWriter(response.getWriter());
      URLFPV2 queryFP = new URLFPV2();
      queryFP.setDomainHash(domainId);
      // ok open file
      FSDataInputStream indexInputStream = CrawlEnvironment.getDefaultFileSystem().open(shardedDataLocation);
      try {
        TFile.Reader reader = new TFile.Reader(indexInputStream,CrawlEnvironment.getDefaultFileSystem().getFileStatus(shardedDataLocation).getLen(),CrawlEnvironment.getHadoopConfig());
        try {
          TFile.Reader.Scanner scanner = reader.createScanner();
          try {
            // the TFile key is the domain id written as a long
            DataOutputBuffer keyBuffer = new DataOutputBuffer();
            keyBuffer.writeLong(domainId);
            if (scanner.seekTo(keyBuffer.getData(),0,keyBuffer.getLength())) {
              DataInputStream valueStream = scanner.entry().getValueStream();
              // the value is a run of (vint position, long url hash) pairs; sort by position
              TreeMap<Integer,MetadataOut> sortedMap = new TreeMap<Integer,MetadataOut>();
              while (valueStream.available() != 0) {
                int position = WritableUtils.readVInt(valueStream);
                long urlHash = valueStream.readLong();
                queryFP.setUrlHash(urlHash);
                MetadataOut metadata = _server.getDatabaseIndex().queryMetadataAndURLGivenFP(queryFP);
                sortedMap.put(position, metadata);
              }
              for (Map.Entry<Integer,MetadataOut> entry : sortedMap.entrySet()) {
                MetadataOut metadata = entry.getValue();
                writer.print("Queue: [" + crawlerNames[crawlerIndex] + "]POS:" + entry.getKey());
                if (metadata != null) {
                  writer.print(" ,URL:" + metadata.url.toString());
                  writer.print(" ,PageRank:" + metadata.pageRank);
                  writer.print(" ,FetchStatus:" + CrawlDatum.getStatusName(metadata.fetchStatus));
                }
                else {
                  // the index had no metadata for this url hash; report it
                  // instead of failing the whole listing with an NPE
                  writer.print(" ,URL:<<BAD URL>>");
                }
                writer.print("\n");
              }
            }
            else {
              writer.print("ERROR:Unable to Locate Data for Domain:" + domainName + " DH:" + domainId + "\n");
            }
          }
          finally {
            scanner.close();
          }
        }
        finally {
          reader.close();
        }
      }
      finally {
        indexInputStream.close();
        writer.flush();
      }
    }
    catch (Exception e) {
      throw new IOException(CCStringUtils.stringifyException(e));
    }
  }
}
/**
 * Servlet that lists, one page at a time, the URLs linking into a root domain.
 *
 * Required request parameters: "domain", "page_no" (1-based), "page_size" and
 * "sort_order" ("ASC", case-insensitive, selects ascending; anything else
 * descending). The domain is reduced to its root domain before querying.
 *
 * On success the response is text/plain with the shape
 * {"total":T,"page":P,"rows":[[url,pageRank,resolvedUrl], ...]}
 * where url and pageRank are decoded from the record key and resolvedUrl is
 * the URL the index returns for the record's fingerprint value
 * ("&lt;&lt;BAD URL&gt;&gt;" when it cannot be resolved). On query failure the
 * error string is written as plain text.
 */
@SuppressWarnings("serial")
public static class InverseURLListByRootDomainQueryServlet extends HttpServlet {
  @Override
  protected void doGet(final HttpServletRequest request, final HttpServletResponse response)throws ServletException, IOException {
    //LOG.info("Received Request:" +request.toString());
    // Every parameter is mandatory. The check must be a pure OR chain: with
    // a trailing "&&" a request missing only page_size or only sort_order
    // slipped through and failed later with NumberFormatException / NPE.
    if (request.getParameter("domain") == null || request.getParameter("page_no") == null ||
        request.getParameter("page_size") == null || request.getParameter("sort_order") == null) {
      throw new IOException("Missing Required Parameters");
    }
    try {
      String domain = request.getParameter("domain");
      // page_no arrives 1-based; the query uses a 0-based offset
      int page_no = Integer.parseInt(request.getParameter("page_no")) - 1;
      int page_size = Integer.parseInt(request.getParameter("page_size"));
      String sortOrder = request.getParameter("sort_order");
      // build domain query info
      InlinksByDomainQueryInfo queryInfo = new InlinksByDomainQueryInfo();
      if (URLUtils.isValidDomainName(domain)) {
        String rootDomain = URLUtils.extractRootDomainName(domain);
        if (rootDomain != null) {
          // set search pattern parameter
          queryInfo.setDomainName(rootDomain);
        }
      }
      // an unset domain name means the input was invalid or had no root domain
      if (queryInfo.getDomainName().length() == 0) {
        throw new IOException("Invalid Domain Name:" + domain);
      }
      // initialize paging info
      ClientQueryInfo clientQueryInfo = new ClientQueryInfo();
      clientQueryInfo.setPageSize(page_size);
      clientQueryInfo.setPaginationOffset(page_no);
      if (sortOrder.equalsIgnoreCase("ASC"))
        clientQueryInfo.setSortOrder(ClientQueryInfo.SortOrder.ASCENDING);
      else
        clientQueryInfo.setSortOrder(ClientQueryInfo.SortOrder.DESCENDING);
      InverseLinksByDomainQuery query = new InverseLinksByDomainQuery(queryInfo);
      try {
        BlockingQueryResult<FlexBuffer, URLFPV2> result = _server.blockingQueryRequest(query, clientQueryInfo);
        if (result.querySucceeded) {
          OutputStream outStream;
          try {
            outStream = response.getOutputStream();
            PrintWriter writer = new PrintWriter(outStream);
            //response.setContentType("application/x-json");
            response.setContentType("text/plain");
            writer.write("{\"total\":"+result.resultObject.getTotalRecordCount() +",");
            writer.write("\"page\":"+ (page_no + 1) +",");
            writer.write("\"rows\":[");
            int count = 0;
            DataInputBuffer inputReader = new DataInputBuffer();
            TextBytes text = new TextBytes();
            for (QueryResultRecord<FlexBuffer, URLFPV2> record : result.resultObject.getResults()) {
              if (count++ != 0)
                writer.write(",");
              // initialize the stream reader
              inputReader.reset(record.getKey().get(),record.getKey().getOffset(),record.getKey().getCount());
              // skip target fp
              inputReader.readLong();
              // capture rank
              float pageRank = inputReader.readFloat();
              // capture incoming url ...
              int textSize = WritableUtils.readVInt(inputReader);
              // initialize text: the url bytes start at the reader's current position
              text.set(record.getKey().get(),inputReader.getPosition(),textSize);
              writer.write("[");
              writer.print(quote(text.toString()));
              writer.print(',');
              writer.print(pageRank);
              writer.print(',');
              MetadataOut metadata = null;
              try {
                metadata = _server.getDatabaseIndex().queryMetadataAndURLGivenFP(record.getValue());
              }
              catch (IOException e ) {
                // best effort: a failed lookup is reported as a bad url below
                LOG.error(CCStringUtils.stringifyException(e));
              }
              if (metadata != null) {
                writer.print(quote(metadata.url.toString()));
              }
              else {
                writer.print(quote("<<BAD URL>>"));
              }
              writer.write("]\n");
            }
            writer.append("]}");
            writer.flush();
            outStream.close();
          }
          catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
          }
        }
        else {
          OutputStream outStream;
          try {
            outStream = response.getOutputStream();
            PrintWriter writer = new PrintWriter(outStream);
            response.setContentType("text/plain");
            writer.append("Query Failed with Error:\n");
            writer.append(result.errorString);
            writer.flush();
            outStream.close();
          }
          catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
          }
        }
      } catch (IOException e) {
        LOG.error("Query Failed with Exception:" + CCStringUtils.stringifyException(e));
        throw e;
      }
    }
    catch (Exception e) {
      throw new IOException(CCStringUtils.stringifyException(e));
    }
  }
}
/**
 * Wraps a string in double quotes, escaping it for use inside JSON text.
 *
 * Backslash and double quote are backslash-escaped; backspace, tab, newline,
 * form feed and carriage return use their short escapes; a '/' that directly
 * follows '&lt;' is escaped so the result can be embedded in HTML; characters
 * below U+0020, in U+0080..U+009F, or in U+2000..U+20FF are written as
 * four-digit \\uXXXX escapes. All other characters are copied unchanged.
 *
 * @param string the text to quote; may be null
 * @return the quoted text; two double quotes for a null or empty input
 */
public static String quote(String string) {
  if (string == null || string.length() == 0) {
    return "\"\"";
  }

  final int length = string.length();
  StringBuilder quoted = new StringBuilder(length + 4);
  quoted.append('"');

  char previous = 0;
  for (int index = 0; index < length; index++) {
    char current = string.charAt(index);

    if (current == '\\' || current == '"') {
      quoted.append('\\').append(current);
    }
    else if (current == '/') {
      // only "</" needs the extra backslash
      if (previous == '<') {
        quoted.append('\\');
      }
      quoted.append(current);
    }
    else if (current == '\b') {
      quoted.append("\\b");
    }
    else if (current == '\t') {
      quoted.append("\\t");
    }
    else if (current == '\n') {
      quoted.append("\\n");
    }
    else if (current == '\f') {
      quoted.append("\\f");
    }
    else if (current == '\r') {
      quoted.append("\\r");
    }
    else {
      boolean needsUnicodeEscape = current < ' '
          || (current >= '\u0080' && current < '\u00a0')
          || (current >= '\u2000' && current < '\u2100');
      if (needsUnicodeEscape) {
        // left-pad the hex value to four digits
        String hex = "000" + Integer.toHexString(current);
        quoted.append("\\u").append(hex.substring(hex.length() - 4));
      }
      else {
        quoted.append(current);
      }
    }

    previous = current;
  }

  quoted.append('"');
  return quoted.toString();
}
}