/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.listcrawler;
import java.io.IOException;
import java.io.PrintWriter;
import java.security.GeneralSecurityException;
import java.security.InvalidKeyException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.concurrent.Semaphore;
import javax.crypto.BadPaddingException;
import javax.crypto.Cipher;
import javax.crypto.IllegalBlockSizeException;
import javax.crypto.NoSuchPaddingException;
import javax.crypto.spec.SecretKeySpec;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.server.AsyncWebServerRequest;
import org.commoncrawl.server.CommonCrawlServer;
import org.commoncrawl.service.listcrawler.CrawlListDatabaseRecord;
import org.commoncrawl.service.listcrawler.CrawlListDomainItem;
import org.commoncrawl.service.listcrawler.CrawlListMetadata;
import org.commoncrawl.service.listcrawler.CrawlList.QueueState;
import org.commoncrawl.util.CCStringUtils;
import com.google.gson.stream.JsonWriter;
@SuppressWarnings("serial")
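/**
* Servlet backing the crawl lists UI. It answers GET requests with JSON
* describing a customer's crawl lists, their sub-domains, and per-domain
* crawl statistics.
*
* @author rana
*
*/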
public class CrawlListsUI extends HttpServlet {
/** Commons-logging logger for this servlet. */
public static final Log LOG = LogFactory.getLog(CrawlListsUI.class);
/** Prefix a decrypted customer key must start with; decryptUserKey strips it before returning. */
static String salt = "#$@!1Z";
/**
* 16-byte key handed to the AES cipher in decryptUserKey.
* NOTE(review): the key is hard-coded in source; consider loading it from configuration.
*/
static byte secretKey[] = {(byte)0xcd,(byte)0xe7,(byte)0xe9,(byte)0x9d,(byte)0xb4,(byte)0x84,(byte)0xc5,0x2f,0x49,(byte)0xee,0x16,(byte)0xb1,0x12,(byte)0xa6,(byte)0xef,(byte)0xb7};
/**
 * Decrypts a hex-encoded, AES-encrypted customer key and strips the leading salt.
 *
 * @param userKey hex string carrying the encrypted key; may be {@code null}
 * @return the plain-text key with the salt prefix removed, or {@code null} when
 *         {@code userKey} is {@code null}, has an odd length, cannot be decrypted
 *         with the built-in secret, or does not start with the expected salt
 * @throws IllegalStateException if the AES cipher cannot be obtained or initialised
 *         with the built-in key (a server-side problem, not a bad user key)
 */
public static String decryptUserKey(String userKey) {
  // a missing request parameter reaches us as null; treat it like any other bad key
  if (userKey == null || userKey.length() % 2 != 0) {
    return null;
  }
  byte[] cipherText = hexStringToByteArray(userKey);
  if (cipherText == null) {
    return null;
  }
  try {
    Cipher cipher = Cipher.getInstance("AES");
    cipher.init(Cipher.DECRYPT_MODE, new SecretKeySpec(secretKey, "AES"));
    // NOTE(review): decoded with the platform default charset, exactly as before;
    // confirm what charset the encrypting side uses before pinning one here.
    String plainText = new String(cipher.doFinal(cipherText));
    return plainText.startsWith(salt) ? plainText.substring(salt.length()) : null;
  } catch (IllegalBlockSizeException e) {
    // input was not produced by our cipher: simply not a valid key
    return null;
  } catch (BadPaddingException e) {
    // wrong key material or tampered input: simply not a valid key
    return null;
  } catch (GeneralSecurityException e) {
    // missing algorithm/padding or a rejected secret key is a deployment fault
    throw new IllegalStateException("AES decryption is not available", e);
  }
}
/**
 * Decodes a string of hexadecimal digit pairs into the bytes they represent.
 *
 * @param s the hex string, two digits per byte, upper or lower case; may be {@code null}
 * @return the decoded bytes (empty for an empty string), or {@code null} when
 *         {@code s} is {@code null}, has an odd length, or contains a character
 *         that is not a hexadecimal digit
 */
public static byte[] hexStringToByteArray(String s) {
  if (s == null || s.length() % 2 != 0) {
    return null;
  }
  final int len = s.length();
  final byte[] data = new byte[len / 2];
  for (int i = 0; i < len; i += 2) {
    final int hi = Character.digit(s.charAt(i), 16);
    final int lo = Character.digit(s.charAt(i + 1), 16);
    // Character.digit signals a non-hex character with -1; reject instead of
    // folding the -1 into a garbage byte
    if (hi < 0 || lo < 0) {
      return null;
    }
    data[i / 2] = (byte) ((hi << 4) + lo);
  }
  return data;
}
/**
* Mutable status holder handed to the request helpers of this servlet; the
* helpers set the result code to SC_OK once they have written a response, and
* doGet turns any other code into an error response.
*/
public static class HttpResult {
/** HTTP status code to report; a fresh instance starts at SC_OK. */
public int _resultCode = HttpServletResponse.SC_OK;
/** Message passed to sendError together with the status code; empty by default. */
public String _resultDesc = "";
}
/**
 * Serves the JSON queries issued by the crawl lists UI.
 *
 * <p>Every request must carry {@code listId} (numeric), {@code reqType} (one of
 * {@code listLists}, {@code subDomainCount}, {@code subDomainList},
 * {@code listDetails}, {@code domainDetails}) and {@code customerId} (encrypted,
 * see {@link #decryptUserKey(String)}). {@code subDomainList} additionally needs
 * numeric {@code offset} and {@code count}; {@code domainDetails} needs
 * {@code domainId}. Missing, malformed or undecryptable parameters and unknown
 * request types are answered with {@code 400 Bad Request}.
 *
 * @param req the incoming request
 * @param resp the response the selected helper writes its JSON to
 * @throws ServletException declared by the servlet contract
 * @throws IOException if a helper fails while writing the response
 */
@Override
protected void doGet(final HttpServletRequest req, final HttpServletResponse resp) throws ServletException, IOException {
  final HttpResult result = new HttpResult();
  // assume failure until a helper reports success
  result._resultCode = HttpServletResponse.SC_BAD_REQUEST;
  resp.setContentType("application/json");

  final String reqType = req.getParameter("reqType");
  final Long listId = parseLongOrNull(req.getParameter("listId"));
  final String encCustomerId = req.getParameter("customerId");
  // never hand a missing parameter to decryptUserKey
  final String customerId = (encCustomerId != null) ? decryptUserKey(encCustomerId) : null;

  if (reqType != null && listId != null && customerId != null) {
    if (reqType.equals("listLists")) {
      getListsForCustomer(customerId, resp, result);
    } else if (reqType.equals("subDomainCount")) {
      getSubDomainCount(customerId, listId.longValue(), resp, result);
    } else if (reqType.equals("subDomainList")) {
      final Integer offset = parseIntOrNull(req.getParameter("offset"));
      final Integer count = parseIntOrNull(req.getParameter("count"));
      if (offset != null && count != null) {
        getDomainListForListId(customerId, listId.longValue(), offset.intValue(), count.intValue(), resp, result);
      }
    } else if (reqType.equals("listDetails")) {
      getListDetails(customerId, listId.longValue(), resp, result);
    } else if (reqType.equals("domainDetails")) {
      final String domainId = req.getParameter("domainId");
      if (domainId != null) {
        getDomainDetailForDomain(customerId, listId.longValue(), domainId, resp, result);
      }
    }
  }

  // A helper may already have sent its own error (e.g. 403 Forbidden); calling
  // sendError again on a committed response would throw IllegalStateException.
  if (result._resultCode != HttpServletResponse.SC_OK && !resp.isCommitted()) {
    resp.sendError(result._resultCode, result._resultDesc);
  }
}

/** Parses a decimal long request parameter; returns null when it is absent or malformed. */
private static Long parseLongOrNull(String value) {
  if (value == null) {
    return null;
  }
  try {
    return Long.valueOf(Long.parseLong(value));
  } catch (NumberFormatException e) {
    return null;
  }
}

/** Parses a decimal int request parameter; returns null when it is absent or malformed. */
private static Integer parseIntOrNull(String value) {
  if (value == null) {
    return null;
  }
  try {
    return Integer.valueOf(Integer.parseInt(value));
  } catch (NumberFormatException e) {
    return null;
  }
}
/**
 * Condenses the counters of a metadata record into the summary item used by
 * the UI responses.
 *
 * <p>The error figure is the sum of the timeout, IO-exception, DNS and "other"
 * error counters; the crawled figure is the sum of the HTTP 200/403/404/500/other
 * counters. The in-cache count is always reported as zero.
 *
 * @param metadata the counters to summarise
 * @return a freshly created item carrying the aggregated figures
 */
private static CrawlListDomainItem buildListSummary(CrawlListMetadata metadata) {
  // successful fetches
  int okResponses = 0;
  okResponses += metadata.getHttp200Count();

  // urls skipped because robots.txt disallowed them
  int robotsBlocked = 0;
  robotsBlocked += metadata.getRobotsExcludedCount();

  // every failure category folds into a single error figure
  int failures = 0;
  failures += metadata.getTimeoutErrorCount();
  failures += metadata.getIOExceptionCount();
  failures += metadata.getDNSErrorCount();
  failures += metadata.getOtherErrorCount();

  // any url that produced an HTTP status counts as crawled
  int crawled = 0;
  crawled += metadata.getHttp200Count();
  crawled += metadata.getHttp403Count();
  crawled += metadata.getHttp404Count();
  crawled += metadata.getHttp500Count();
  crawled += metadata.getHttpOtherCount();

  CrawlListDomainItem summary = new CrawlListDomainItem();
  summary.setUrlCount(metadata.getUrlCount());
  summary.setUrlsCrawled(crawled);
  summary.setHttp200Count(okResponses);
  summary.setInCacheItemsCount(0);
  summary.setRobotsExcludedCount(robotsBlocked);
  summary.setErrorCount(failures);
  summary.setQueuedCount(metadata.getQueuedItemCount());
  return summary;
}
/**
 * Writes the crawl statistics of one sub-domain of a list as JSON of the form
 * {@code {"items":[["http200",n],["http403",n],...]}}; categories whose count is
 * zero are omitted, and a final {@code "remaining"} row is added when urls are
 * still unaccounted for.
 *
 * <p>The work is dispatched through the server's async web request mechanism.
 * If the list does not belong to the customer a 403 is sent on {@code resp}.
 * If the list is missing, not loaded, or has no metadata for the domain, nothing
 * is written and {@code result} is left untouched.
 *
 * @param customerId decrypted id of the requesting customer
 * @param listId id of the crawl list
 * @param domainName sub-domain whose statistics are requested
 * @param resp response to write to
 * @param result set to {@code SC_OK} once the JSON has been written
 * @throws IOException if writing the response fails
 */
public static void getDomainDetailForDomain(final String customerId,final long listId,final String domainName,final HttpServletResponse resp,final HttpResult result)throws IOException {
  final CommonCrawlServer server = CommonCrawlServer.getServerSingleton();
  server.dispatchAsyncWebRequest(new AsyncWebServerRequest("",resp.getWriter()) {
    @Override
    public boolean handleRequest(Semaphore completionSemaphore) throws IOException {
      ProxyServer proxyServer = (ProxyServer)server;
      if (!proxyServer.doesListBelongToCustomer(listId, customerId)) {
        resp.sendError(HttpServletResponse.SC_FORBIDDEN);
        return false;
      }
      CrawlList list = proxyServer.getCrawlHistoryManager().getList(listId);
      if (list == null || !list.isListLoaded()) {
        return false;
      }
      CrawlListMetadata metadata = list.getSubDomainMetadataByDomain(domainName);
      if (metadata == null) {
        return false;
      }
      CrawlListDomainItem item = buildListSummary(metadata);
      JsonWriter jsonWriter = new JsonWriter(resp.getWriter());
      try {
        // direct and redirect outcomes are reported together per category
        int http403Count = metadata.getHttp403Count() + metadata.getRedirectHttp403Count();
        int http404Count = metadata.getHttp404Count() + metadata.getRedirectHttp404Count();
        int http500Count = metadata.getHttp500Count() + metadata.getRedirectHttp500Count();
        int httpOtherCount = metadata.getHttpOtherCount() + metadata.getRedirectHttpOtherCount();
        // calculate errors; "other" errors are not reported as their own row and
        // therefore end up in the remaining bucket below
        int timeoutErrorCount = metadata.getTimeoutErrorCount() + metadata.getRedirectTimeoutErrorCount();
        int ioexceptionErrorCount = metadata.getIOExceptionCount() + metadata.getRedirectIOExceptionCount();

        jsonWriter.beginObject();
        jsonWriter.name("items");
        jsonWriter.beginArray();
        writeNonZeroStat(jsonWriter, "http200", item.getHttp200Count());
        writeNonZeroStat(jsonWriter, "http403", http403Count);
        writeNonZeroStat(jsonWriter, "http404", http404Count);
        writeNonZeroStat(jsonWriter, "http500", http500Count);
        writeNonZeroStat(jsonWriter, "httpOther", httpOtherCount);
        writeNonZeroStat(jsonWriter, "inCache", item.getInCacheItemsCount());
        writeNonZeroStat(jsonWriter, "robotsExcluded", item.getRobotsExcludedCount());
        writeNonZeroStat(jsonWriter, "timeouts", timeoutErrorCount);
        writeNonZeroStat(jsonWriter, "exceptions", ioexceptionErrorCount);

        // whatever is not covered by a reported category is still outstanding
        int remainingItems = metadata.getUrlCount();
        remainingItems -= item.getHttp200Count();
        remainingItems -= http403Count;
        remainingItems -= http404Count;
        remainingItems -= http500Count;
        remainingItems -= httpOtherCount;
        remainingItems -= item.getRobotsExcludedCount();
        remainingItems -= (timeoutErrorCount + ioexceptionErrorCount);
        if (remainingItems > 0) {
          jsonWriter.beginArray().value("remaining").value(remainingItems).endArray();
        }
        jsonWriter.endArray();
        jsonWriter.endObject();
        result._resultCode = HttpServletResponse.SC_OK;
      } catch (IOException e) {
        // already the declared type; no need to wrap it a second time
        throw e;
      } catch (RuntimeException e) {
        throw new IOException(e);
      }
      return false;
    }
  });
}

/** Emits a {@code [label, count]} row unless the count is zero. */
private static void writeNonZeroStat(JsonWriter jsonWriter, String label, long count) throws IOException {
  if (count != 0) {
    jsonWriter.beginArray().value(label).value(count).endArray();
  }
}
/**
 * Writes one JSON row per crawl list owned by the customer, as
 * {@code {"items":[[...],[...]]}}, ordered by descending list id.
 *
 * <p>The request is dispatched through the server's async web request mechanism;
 * when the customer has at least one list the rows are produced on a worker
 * thread, which releases the completion semaphore when it is done.
 *
 * <p>NOTE(review): when the customer has no lists nothing is written and
 * {@code result} keeps its incoming code, so the caller reports an error instead
 * of an empty list; confirm this is intended.
 *
 * @param customerId decrypted id of the requesting customer
 * @param resp response to write to
 * @param result set to {@code SC_OK} once all rows have been written
 * @throws IOException if the response writer cannot be obtained
 */
private static void getListsForCustomer(final String customerId,final HttpServletResponse resp,final HttpResult result) throws IOException {
  final CommonCrawlServer server = CommonCrawlServer.getServerSingleton();
  server.dispatchAsyncWebRequest(new AsyncWebServerRequest("",resp.getWriter()) {
    @Override
    public boolean handleRequest(final Semaphore completionSemaphore)throws IOException {
      final ProxyServer proxyServer = (ProxyServer)server;
      LOG.info("Getting List for Customer:" + customerId);
      final Collection<CrawlListDatabaseRecord> recordSet = proxyServer.getListInfoForCustomerId(customerId).values();
      final ArrayList<CrawlListDatabaseRecord> sortedSet = new ArrayList<CrawlListDatabaseRecord>(recordSet);
      // highest list id first (presumably newest first; verify ids grow over time)
      Collections.sort(sortedSet,new Comparator<CrawlListDatabaseRecord>() {
        @Override
        public int compare(CrawlListDatabaseRecord o1,CrawlListDatabaseRecord o2) {
          final long id1 = o1.getListId();
          final long id2 = o2.getListId();
          // equal ids must compare as 0 to honour the Comparator contract
          if (id1 == id2) {
            return 0;
          }
          return (id1 > id2) ? -1 : 1;
        }
      });
      LOG.info("Found:" + sortedSet.size() + " Lists for Customer:" + customerId);
      if (sortedSet.size() == 0) {
        return false;
      }
      Thread thread = new Thread(new Runnable() {
        @Override
        public void run() {
          LOG.info("Running Worker Thread");
          try {
            PrintWriter writer = resp.getWriter();
            JsonWriter jsonWriter = new JsonWriter(writer);
            jsonWriter.beginObject();
            jsonWriter.name("items");
            jsonWriter.beginArray();
            for (CrawlListDatabaseRecord listRecord : sortedSet) {
              // get the list
              CrawlList list = proxyServer.getCrawlHistoryManager().getList(listRecord.getListId());
              if (list == null) {
                LOG.error("DID NOT Find List Object for List:" + listRecord.getListId() + " Name:" + listRecord.getListName() + " FileName:" + listRecord.getSourceFileName() + " TempFile:" + listRecord.getTempFileName());
                continue;
              }
              String queueState = "W";
              if (list.isListLoaded()) {
                // NOTE(review): ERROR maps to the same code as QUEUED ("L"); confirm
                if (list.getQueuedState() == QueueState.QUEUEING)
                  queueState = "Q";
                else if (list.getQueuedState() == QueueState.QUEUED)
                  queueState = "L";
                else if (list.getQueuedState() == QueueState.ERROR)
                  queueState = "L";
                else
                  queueState = "?";
                CrawlListMetadata metadata = list.getMetadata();
                CrawlListDomainItem summary = buildListSummary(metadata);
                // populate identification info
                summary.setListId(list.getListId());
                summary.setListName(listRecord.getListName());
                jsonWriter.beginArray();
                jsonWriter.value(summary.getListId());
                jsonWriter.value(queueState);
                jsonWriter.value(summary.getListName());
                jsonWriter.value(list.getSubDomainItemCount());
                jsonWriter.value(summary.getUrlCount());
                jsonWriter.value(summary.getUrlsCrawled());
                jsonWriter.value(summary.getHttp200Count());
                jsonWriter.value(summary.getRobotsExcludedCount());
                jsonWriter.value(summary.getErrorCount());
                jsonWriter.value(summary.getQueuedCount());
                jsonWriter.endArray();
              }
              else if (list.getLoadState() == CrawlList.LoadState.QUEUED_FOR_LOADING) {
                jsonWriter.beginArray();
                jsonWriter.value(list.getListId());
                jsonWriter.value(queueState);
                jsonWriter.value("<B>Queued:</B>" + listRecord.getListName());
                for (int i = 0; i < 7; i++) {
                  jsonWriter.value(0);
                }
                jsonWriter.endArray();
              }
              else if (list.getLoadState() == CrawlList.LoadState.REALLY_LOADING) {
                // NOTE(review): this row has no queue-state column, so it is one
                // value shorter than the rows above; confirm the UI expects that
                jsonWriter.beginArray();
                jsonWriter.value(list.getListId());
                jsonWriter.value("<B>Loading:</B>" + listRecord.getListName());
                for (int i = 0; i < 7; i++) {
                  jsonWriter.value(0);
                }
                jsonWriter.endArray();
              }
              else if (list.getLoadState() == CrawlList.LoadState.ERROR) {
                // NOTE(review): nine values here as well ("ERR" and no list name);
                // confirm the UI expects that
                jsonWriter.beginArray();
                jsonWriter.value(list.getListId());
                jsonWriter.value("ERR");
                for (int i = 0; i < 7; i++) {
                  jsonWriter.value(0);
                }
                jsonWriter.endArray();
              }
            }
            jsonWriter.endArray();
            jsonWriter.endObject();
            LOG.info("Done");
            result._resultCode = HttpServletResponse.SC_OK;
          } catch (Exception e) {
            // nothing can propagate out of the worker thread; log and fall through
            LOG.error(CCStringUtils.stringifyException(e));
          }
          finally {
            LOG.error("DONE");
            // always release the semaphore handed to handleRequest, success or not
            completionSemaphore.release();
          }
        }
      });
      LOG.info("Spawning Worker Thread");
      thread.start();
      // true: completion is signalled later by the worker via the semaphore
      return true;
    }
  });
}
/**
 * Writes the aggregated counters of a crawl list as a JSON object with the keys
 * {@code total}, {@code crawled}, {@code http200}, {@code inCache},
 * {@code robotsExcluded}, {@code error} and {@code queued}.
 *
 * <p>The work is dispatched through the server's async web request mechanism.
 * If the list does not belong to the customer a 403 is sent on {@code resp}; if
 * the list is missing or not loaded nothing is written and {@code result} is
 * left untouched.
 *
 * @param customerId decrypted id of the requesting customer
 * @param listId id of the crawl list
 * @param resp response to write to
 * @param result set to {@code SC_OK} once the JSON has been written
 * @throws IOException if writing the response fails
 */
public static void getListDetails(final String customerId,final long listId,final HttpServletResponse resp,final HttpResult result) throws IOException {
  final CommonCrawlServer server = CommonCrawlServer.getServerSingleton();
  server.dispatchAsyncWebRequest(new AsyncWebServerRequest("",resp.getWriter()) {
    @Override
    public boolean handleRequest(Semaphore completionSemaphore) throws IOException {
      ProxyServer proxyServer = (ProxyServer)server;
      if (!proxyServer.doesListBelongToCustomer(listId, customerId)) {
        resp.sendError(HttpServletResponse.SC_FORBIDDEN);
        return false;
      }
      CrawlList list = proxyServer.getCrawlHistoryManager().getList(listId);
      if (list != null && list.isListLoaded()) {
        CrawlListMetadata metadata = list.getMetadata();
        CrawlListDomainItem item = buildListSummary(metadata);
        PrintWriter writer = resp.getWriter();
        // keys are quoted and every pair is comma separated so the body is valid
        // JSON (the separator between "error" and "queued" used to be missing)
        writer.println(
            "{ "
            + "\"total\":" + item.getUrlCount() + ","
            + "\"crawled\":" + item.getUrlsCrawled() + ","
            + "\"http200\":" + item.getHttp200Count() + ","
            + "\"inCache\":" + item.getInCacheItemsCount() + ","
            + "\"robotsExcluded\":" + item.getRobotsExcludedCount() + ","
            + "\"error\":" + item.getErrorCount() + ","
            + "\"queued\":" + item.getQueuedCount() + "}");
        result._resultCode = HttpServletResponse.SC_OK;
      }
      return false;
    }
  });
}
/**
 * Writes the number of sub-domains of a crawl list as the JSON object
 * {@code {"itemCount":n}}.
 *
 * <p>The work is dispatched through the server's async web request mechanism.
 * If the list does not belong to the customer a 403 is sent on {@code resp}; if
 * the list is missing or not loaded nothing is written and {@code result} is
 * left untouched.
 *
 * @param customerId decrypted id of the requesting customer
 * @param listId id of the crawl list
 * @param resp response to write to
 * @param result set to {@code SC_OK} once the JSON has been written
 * @throws IOException if writing the response fails
 */
public static void getSubDomainCount(final String customerId,final long listId,final HttpServletResponse resp,final HttpResult result) throws IOException {
  final CommonCrawlServer server = CommonCrawlServer.getServerSingleton();
  server.dispatchAsyncWebRequest(new AsyncWebServerRequest("",resp.getWriter()) {
    @Override
    public boolean handleRequest(Semaphore completionSemaphore) throws IOException {
      ProxyServer proxyServer = (ProxyServer)server;
      if (!proxyServer.doesListBelongToCustomer(listId, customerId)) {
        resp.sendError(HttpServletResponse.SC_FORBIDDEN);
        return false;
      }
      CrawlList list = proxyServer.getCrawlHistoryManager().getList(listId);
      if (list != null && list.isListLoaded()) {
        PrintWriter writer = resp.getWriter();
        // the key is quoted so the body is valid JSON, matching the declared content type
        writer.println(
            "{ "
            + "\"itemCount\":" + list.getSubDomainItemCount()
            + "}");
        result._resultCode = HttpServletResponse.SC_OK;
      }
      return false;
    }
  });
}
/**
 * Writes one page of a crawl list's sub-domains as JSON:
 * {@code {"items":[[domain,urlCount,urlsCrawled,queuedCount,hashCode],...],"remainingItems":n}},
 * where {@code remainingItems} is the list's total url count minus the url
 * counts of the rows on this page.
 *
 * <p>The work is dispatched through the server's async web request mechanism.
 * A list that does not belong to the customer is answered with 403; a missing or
 * unloaded list gets a plain-text marker and leaves {@code result} untouched.
 *
 * @param customerId decrypted id of the requesting customer
 * @param listId id of the crawl list
 * @param offset offset passed to the list's sub-domain lookup
 * @param count count passed to the list's sub-domain lookup
 * @param resp response to write to
 * @param result set to {@code SC_OK} once the JSON has been written
 * @throws IOException if writing the response fails
 */
public static void getDomainListForListId(final String customerId,final long listId,final int offset,final int count,final HttpServletResponse resp,final HttpResult result) throws IOException {
  final CommonCrawlServer server = CommonCrawlServer.getServerSingleton();
  server.dispatchAsyncWebRequest(new AsyncWebServerRequest("",resp.getWriter()) {
    @Override
    public boolean handleRequest(Semaphore completionSemaphore) throws IOException {
      final ProxyServer proxy = (ProxyServer)server;

      // ownership check comes first
      if (!proxy.doesListBelongToCustomer(listId, customerId)) {
        resp.sendError(HttpServletResponse.SC_FORBIDDEN);
        return false;
      }

      final CrawlList crawlList = proxy.getCrawlHistoryManager().getList(listId);
      if (crawlList == null || !crawlList.isListLoaded()) {
        resp.getWriter().print("Crawl List NULL!!");
        return false;
      }

      final JsonWriter json = new JsonWriter(resp.getWriter());
      try {
        json.beginObject();
        json.name("items");
        json.beginArray();
        // running total of the url counts emitted on this page
        int urlsOnPage = 0;
        for (CrawlListDomainItem domain : crawlList.getSubDomainList(offset,count)) {
          json.beginArray();
          json.value(domain.getDomainName());
          json.value(domain.getUrlCount());
          urlsOnPage += domain.getUrlCount();
          json.value(domain.getUrlsCrawled());
          json.value(domain.getQueuedCount());
          json.value(domain.getHashCode());
          json.endArray();
        }
        json.endArray();
        json.name("remainingItems").value(crawlList.getMetadata().getUrlCount() - urlsOnPage);
        json.endObject();
        result._resultCode = HttpServletResponse.SC_OK;
      }
      catch (Exception e) {
        throw new IOException(e);
      }
      return false;
    }
  });
}
}