package org.commoncrawl.service.statscollector; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.ImmutableSortedMap; import com.google.common.collect.ImmutableSortedSet; import com.google.common.collect.Lists; import com.google.common.collect.Multimap; import com.google.common.collect.SortedSetMultimap; import com.google.common.collect.TreeMultimap; import com.google.visualization.datasource.Capabilities; import com.google.visualization.datasource.DataSourceHelper; import com.google.visualization.datasource.DataSourceRequest; import com.google.visualization.datasource.QueryPair; import com.google.visualization.datasource.base.DataSourceException; import com.google.visualization.datasource.base.ReasonType; import com.google.visualization.datasource.base.ResponseStatus; import com.google.visualization.datasource.base.StatusType; import com.google.visualization.datasource.base.TypeMismatchException; import com.google.visualization.datasource.datatable.ColumnDescription; import com.google.visualization.datasource.datatable.DataTable; import com.google.visualization.datasource.datatable.TableCell; import com.google.visualization.datasource.datatable.TableRow; import com.google.visualization.datasource.datatable.value.DateTimeValue; import com.google.visualization.datasource.datatable.value.DateValue; import com.google.visualization.datasource.datatable.value.TimeOfDayValue; import com.google.visualization.datasource.datatable.value.Value; import com.google.visualization.datasource.datatable.value.ValueType; import com.google.visualization.datasource.query.AbstractColumn; import com.google.visualization.datasource.query.Query; import com.ibm.icu.util.GregorianCalendar; import com.ibm.icu.util.TimeZone; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.commoncrawl.async.CallbackWithResult; import org.commoncrawl.async.Timer; import org.commoncrawl.rpc.base.shared.RPCStruct; import org.commoncrawl.service.statscollector.CrawlerStats; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.RPCStructIntrospector; import org.commoncrawl.util.time.Day; import org.commoncrawl.util.time.Hour; import java.io.IOException; import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.Map.Entry; import java.util.concurrent.Semaphore; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; public class CrawlerStatsQuery extends HttpServlet { /** * The log used throughout the data source library. */ private static final Log LOG = LogFactory.getLog(CrawlerStatsQuery.class.getName()); private static RPCStructIntrospector _crawlStatsHelper = new RPCStructIntrospector(CrawlerStats.class); private static final int HourlyTimestampColumnIndex = 1; private static final int DailyTimestampColumnIndex = 2; private static final ColumnDescription[] URLSTATS_TABLE_COLUMNS = new ColumnDescription[] { new ColumnDescription("timeframe", ValueType.TEXT, "timeframe"), new ColumnDescription("tshourly", ValueType.DATETIME, "datetime"), new ColumnDescription("tsdaily", ValueType.DATE, "datetime"), new ColumnDescription("queued",ValueType.NUMBER,"queued"), new ColumnDescription("loading",ValueType.NUMBER,"loading"), new ColumnDescription("crawled",ValueType.NUMBER,"crawled"), new ColumnDescription("success",ValueType.NUMBER,"success"), new ColumnDescription("failed",ValueType.NUMBER,"failed"), new ColumnDescription("HTTP200",ValueType.NUMBER,"(20x)"), new ColumnDescription("HTTP300",ValueType.NUMBER,"(30x)"), new ColumnDescription("HTTP301",ValueType.NUMBER,"(301)"), new ColumnDescription("HTTP302",ValueType.NUMBER,"(302)"), new ColumnDescription("HTTP304",ValueType.NUMBER,"(304)"), new ColumnDescription("HTTP400",ValueType.NUMBER,"(40x)"), new ColumnDescription("HTTP403",ValueType.NUMBER,"(403)"), new ColumnDescription("HTTP404",ValueType.NUMBER,"(404)"), new ColumnDescription("HTTP500",ValueType.NUMBER,"(50x)"), new ColumnDescription("HTTPOther",ValueType.NUMBER,"Other"), new ColumnDescription("ERRUNKNOWN",ValueType.NUMBER,"Uknown Error"), new ColumnDescription("ERRUnknownProtocol",ValueType.NUMBER,"Unk Proto"), new ColumnDescription("ERRMalformedURL",ValueType.NUMBER,"Malform URL"), new ColumnDescription("ERRTimeout",ValueType.NUMBER,"Timeout"), new ColumnDescription("ERRDNSFailure",ValueType.NUMBER,"DNS Failed"), new ColumnDescription("ERRResolverFailure",ValueType.NUMBER,"Resolver Fail"), new ColumnDescription("ERRIOException",ValueType.NUMBER,"IO Excep"), new ColumnDescription("ERRRobotsExcluded",ValueType.NUMBER,"Robots Excl"), new ColumnDescription("ERRNoData",ValueType.NUMBER,"No Data"), new ColumnDescription("ERRRobotsParseError",ValueType.NUMBER,"Robots Parse Err"), new ColumnDescription("ERRRedirectFailed",ValueType.NUMBER,"Redirect Fail"), new ColumnDescription("ERRRuntimeError",ValueType.NUMBER,"Runtime Err"), new ColumnDescription("ERRConnectTimeout",ValueType.NUMBER,"Connect Timeout"), new ColumnDescription("ERRBlackListedHost",ValueType.NUMBER,"BlackListed Host"), new ColumnDescription("ERRBlackListedURL",ValueType.NUMBER,"BlackListed URL"), new ColumnDescription("ERRTooManyErrors",ValueType.NUMBER,"TooMany Errors"), new ColumnDescription("ERRInCache",ValueType.NUMBER,"InCache"), new ColumnDescription("ERRInvalidResponseCode",ValueType.NUMBER,"Invalid Resp"), new ColumnDescription("ERRBadRedirectData",ValueType.NUMBER,"Bad Redirect"), new ColumnDescription("HTTP301_1HOP",ValueType.NUMBER,"HTTP301-ONE-HOP"), new ColumnDescription("HTTP301_2HOPS",ValueType.NUMBER,"HTTP301-TWO-HOPS"), new ColumnDescription("HTTP301_3HOPS",ValueType.NUMBER,"HTTP301-THREE-HOPS"), new ColumnDescription("HTTP301_GT3HOPS",ValueType.NUMBER,"HTTP301-GT_3-HOPS"), }; private static ImmutableMap<String,String> columnNameToFieldNameMap = new ImmutableMap.Builder() .put("queued","urlsInFetcherQueue") .put("loading","urlsInLoaderQueue") .put("crawled","urlsProcessed") .put("success","urlsSucceeded") .put("failed","urlsFailed") .put("HTTP200","http200Count") .put("HTTP300","http300Count") .put("HTTP301","http301Count") .put("HTTP302","http302Count") .put("HTTP304","http304Count") .put("HTTP400","http400Count") .put("HTTP403","http403Count") .put("HTTP404","http404Count") .put("HTTP500","http500Count") .put("HTTPOther","httpOtherCount") .put("ERRUNKNOWN","httpErrorUNKNOWN") .put("ERRUnknownProtocol","httpErrorUnknownProtocol") .put("ERRMalformedURL","httpErrorMalformedURL") .put("ERRTimeout","httpErrorTimeout") .put("ERRDNSFailure","httpErrorDNSFailure") .put("ERRResolverFailure","httpErrorResolverFailure") .put("ERRIOException","httpErrorIOException") .put("ERRRobotsExcluded","httpErrorRobotsExcluded") .put("ERRNoData","httpErrorNoData") .put("ERRRobotsParseError","httpErrorRobotsParseError") .put("ERRRedirectFailed","httpErrorRedirectFailed") .put("ERRRuntimeError","httpErrorRuntimeError") .put("ERRConnectTimeout","httpErrorConnectTimeout") .put("ERRBlackListedHost","httpErrorBlackListedHost") .put("ERRBlackListedURL","httpErrorBlackListedURL") .put("ERRTooManyErrors","httpErrorTooManyErrors") .put("ERRInCache","httpErrorInCache") .put("ERRInvalidResponseCode","httpErrorInvalidResponseCode") .put("ERRBadRedirectData","httpErrorBadRedirectData") .put("HTTP301_1HOP","redirectResultAfter1Hops") .put("HTTP301_2HOPS","redirectResultAfter2Hops") .put("HTTP301_3HOPS","redirectResultAfter3Hops") .put("HTTP301_GT3HOPS","redirectResultAfterGT3Hops") .build(); private static ImmutableSet<String> tablesWithCrawlerStats = new ImmutableSet.Builder() .add("urlspersec") .add("mbytes") .add("downloadsz") .build(); private static ImmutableSet<String> isAverageStatTable = new ImmutableSet.Builder() .add("downloadsz") .build(); private static ImmutableMap<String,String> crawlerStatTableToField = new ImmutableMap.Builder() .put("urlspersec", "urlsPerSecond") .put("mbytes", "mbytesDownPerSecond") .put("downloadsz", "averageDownloadSize") .build(); private static ImmutableSet<String> excludedCrawlers = new ImmutableSet.Builder() .add("ccn01-PROXY-Dbg") .add("ccn01-PROXY-Prod") .build(); @Override protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws IOException { LOG.info("Got Request:" + req.toString()); DataSourceRequest dsRequest = null; try { // Extract the request parameters. dsRequest = new DataSourceRequest(req); // NOTE: If you want to work in restricted mode, which means that only // requests from the same domain can access the data source, you should // uncomment the following call. // // DataSourceHelper.verifyAccessApproved(dsRequest); // Split the query. QueryPair query = DataSourceHelper.splitQuery(dsRequest.getQuery(), Capabilities.SELECT); // Generate the data table. DataTable data = generateMyDataTable(query.getDataSourceQuery(), req); // Apply the completion query to the data table. DataTable newData = DataSourceHelper.applyQuery(query.getCompletionQuery(), data, dsRequest.getUserLocale()); newData.setCustomProperty("tableid", req.getParameter("tableId")); DataSourceHelper.setServletResponse(newData, dsRequest, resp); } catch (RuntimeException rte) { LOG.error("A runtime exception has occured", rte); ResponseStatus status = new ResponseStatus(StatusType.ERROR, ReasonType.INTERNAL_ERROR, rte.getMessage()); if (dsRequest == null) { dsRequest = DataSourceRequest.getDefaultDataSourceRequest(req); } DataSourceHelper.setServletErrorResponse(status, dsRequest, resp); } catch (DataSourceException e) { if (dsRequest != null) { DataSourceHelper.setServletErrorResponse(e, dsRequest, resp); } else { DataSourceHelper.setServletErrorResponse(e, req, resp); } } } /** * Returns true if the given column name is requested in the given query. * If the query is empty, all columnNames returns true. * * @param query The given query. * @param columnName The requested column name. * * @return True if the given column name is requested in the given query. */ private boolean isColumnRequested(Query query, String columnName) { // If the query is empty return true. if (query.isEmpty()) { return true; } List<AbstractColumn> columns = query.getSelection().getColumns(); for (AbstractColumn column : columns) { if (column.getId().equalsIgnoreCase(columnName)) { return true; } } return false; } /** * Generates a data table - according to the provided tableId url parameter. * * @param query The query to operate on the underlying data. * @param req The HttpServeltRequest. * * @return The generated data table. */ private DataTable generateMyDataTable(Query query, HttpServletRequest req) throws TypeMismatchException { String tableID = req.getParameter("tableId"); if ((tableID != null) && (tableID.endsWith("daily") || tableID.endsWith("hourly"))) { LOG.info("calling generate table stats"); return generateTableStatsTable(query,tableID); } LOG.error("Received request for unknown table:" + tableID); return null; } void addCellToRow(CrawlerStats struct,TableRow row,ColumnDescription column,boolean isHouryValue) { String columnName = column.getId(); if (columnName.equals("timeframe")) { if (isHouryValue) { row.addCell(new Hour(new Date(struct.getTimestamp())).toString()); } else { row.addCell(new Day(new Date(struct.getTimestamp())).toString()); } } else if (columnName.equals("tsdaily")) { Date date = new Date(struct.getTimestamp()); GregorianCalendar calendar = new GregorianCalendar(TimeZone.getTimeZone("GMT")); calendar.setTime(date); row.addCell(new DateValue(calendar)); } else if (columnName.equals("tshourly")) { Date date = new Date(struct.getTimestamp()); GregorianCalendar calendar = new GregorianCalendar(TimeZone.getTimeZone("GMT")); calendar.setTime(date); row.addCell(new DateTimeValue(calendar)); } else { String propertyName = columnNameToFieldNameMap.get(columnName); if (propertyName != null) { switch (column.getType()) { case BOOLEAN: row.addCell(_crawlStatsHelper.getDoubleValueGivenName(struct, propertyName) == 1.0); break; case NUMBER: row.addCell(Math.max(0.0, _crawlStatsHelper.getDoubleValueGivenName(struct, propertyName))); break; case TEXT: row.addCell(_crawlStatsHelper.getStringValueGivenName(struct, propertyName)); break; case DATE: { long date = (long)_crawlStatsHelper.getDoubleValueGivenName(struct, propertyName); GregorianCalendar calendar = new GregorianCalendar(TimeZone.getTimeZone("GMT")); calendar.setTime(new Date(date)); row.addCell(new DateValue(calendar)); } break; case DATETIME: { long date = (long)_crawlStatsHelper.getDoubleValueGivenName(struct, propertyName); GregorianCalendar calendar = new GregorianCalendar(TimeZone.getTimeZone("GMT")); calendar.setTime(new Date(date)); row.addCell(new DateTimeValue(calendar)); }break; case TIMEOFDAY: { long date = (long)_crawlStatsHelper.getDoubleValueGivenName(struct, propertyName); GregorianCalendar calendar = new GregorianCalendar(TimeZone.getTimeZone("GMT")); calendar.setTime(new Date(date)); row.addCell(new TimeOfDayValue(calendar)); } break; } } else { LOG.error("propertyName for Column Name:" + columnName + " not found!"); } } } /** * Returns urlstats table * * * @param query The selection query. * * @return A data table of url stats by crawler */ private DataTable generateTableStatsTable(Query query,final String tableID) throws TypeMismatchException { final Multimap<Date,CrawlerStats> values = ArrayListMultimap.create(); ImmutableSortedSet.Builder<String> builder = ImmutableSortedSet.naturalOrder(); LOG.info("Collecting Collection Names"); // populate the map .. synchronized (CrawlStatsCollectorService._statsCollectionMap) { for (String collectionName : CrawlStatsCollectorService._statsCollectionMap.keySet()) { LOG.info("Encountered Collection:" + collectionName); if (collectionName.startsWith(CrawlerStatsCollection.GROUP_KEY)) { String uniqueName = StatsLogManager.getUniqueKeyGivenName(collectionName); if (!excludedCrawlers.contains(uniqueName)) { LOG.info("Adding Collection:" + collectionName); builder.add(uniqueName); } } } } final ImmutableSet<String> crawlerNamesSet = builder.build(); LOG.info("Key Set Size is:" + crawlerNamesSet.size()); if (crawlerNamesSet.size() != 0) { LOG.info("Queueing urlstats collection for:" + crawlerNamesSet.size() + " rows"); // ok setup semaphore ... final Semaphore blockingSemaphore = new Semaphore(-(crawlerNamesSet.size() - 1)); // and schedule an async request ... CrawlStatsCollectorService.getSingleton().getEventLoop().setTimer(new Timer(0,false,new Timer.Callback() { @Override public void timerFired(Timer timer) { LOG.info("Async Timer Event Fired"); if (tableID.endsWith("hourly")) { LOG.info("Table Type is hourly - Collecting Hourly Stats"); try { synchronized (CrawlStatsCollectorService._statsCollectionMap) { for (Entry<String,StatsCollection> item : CrawlStatsCollectorService._statsCollectionMap.entrySet()) { if (crawlerNamesSet.contains(StatsLogManager.getUniqueKeyGivenName(item.getKey()))) { item.getValue().collectHourlyStats(values); } } } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } finally { LOG.info("Releasing Semaphores"); blockingSemaphore.release(crawlerNamesSet.size()); } } else { LOG.info("Table Type is daily - Collecting Daily Stats"); synchronized (CrawlStatsCollectorService._statsCollectionMap) { for (Entry<String,StatsCollection> item : CrawlStatsCollectorService._statsCollectionMap.entrySet()) { if (crawlerNamesSet.contains(StatsLogManager.getUniqueKeyGivenName(item.getKey()))) { try { item.getValue().collectDailyStats(values, new CallbackWithResult<Boolean>() { @Override public void execute(Boolean result) { LOG.info("Daily Query Completed"); blockingSemaphore.release(); } }); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); blockingSemaphore.release(); } } } } } } })); LOG.info("Waiting for Async Completion of request"); // ok wait for async response .. blockingSemaphore.acquireUninterruptibly(); // create new data table instance DataTable data = new DataTable(); // set table id property data.setCustomProperty("tableid", tableID); // figure out hourly or daily boolean hourly = tableID.endsWith("hourly"); String tableNameParts[] = tableID.split("-"); // if this is a crawler specific table if (tablesWithCrawlerStats.contains(tableNameParts[0])) { // get corresponding property name in rpc struct String propertyName = crawlerStatTableToField.get(tableNameParts[0]); // pivot on crawler name ... if (hourly) { data.addColumn(URLSTATS_TABLE_COLUMNS[HourlyTimestampColumnIndex]); } else { data.addColumn(URLSTATS_TABLE_COLUMNS[DailyTimestampColumnIndex]); } // add total column data.addColumn(new ColumnDescription("allcrawlers", ValueType.NUMBER, "all crawlers")); // add crawler names as columns ... for (String crawlerName : crawlerNamesSet) { data.addColumn(new ColumnDescription(crawlerName, ValueType.NUMBER, crawlerName)); } // now walk date values in order for (Date date : values.keySet()) { // add new row for each timestamp TableRow row = new TableRow(); // add timestamp cell if (hourly) { GregorianCalendar calendar = new GregorianCalendar(TimeZone.getTimeZone("GMT")); calendar.setTime(date); row.addCell(new DateTimeValue(calendar)); } else { GregorianCalendar calendar = new GregorianCalendar(TimeZone.getTimeZone("GMT")); calendar.setTime(date); row.addCell(new DateValue(calendar)); } double aggregateValue = 0.0; // now extract values sorted by crawler for this date HashMap<String,CrawlerStats> crawlStatsByCrawler = new HashMap<String,CrawlerStats>(); for (CrawlerStats stats : values.get(date)) { LOG.info("Got Daily Stats for Date:" + date + " Crawler:" + stats.getCrawlerName()); crawlStatsByCrawler.put(stats.getCrawlerName(),stats); aggregateValue += _crawlStatsHelper.getDoubleValueGivenName(stats, propertyName); } if (isAverageStatTable.contains(tableNameParts[0])) { aggregateValue /= values.get(date).size(); } // add aggregate results row.addCell(aggregateValue); // now walk in crawler name order for (String crawlerName : crawlerNamesSet) { CrawlerStats crawlerStats = crawlStatsByCrawler.get(crawlerName); if (crawlerStats != null) { row.addCell((double)_crawlStatsHelper.getDoubleValueGivenName(crawlerStats, propertyName)); } else { row.addCell((double)0.0); } } data.addRow(row); } } else { ImmutableSortedSet.Builder<CrawlerStats> aggregatedStatsBuilder = new ImmutableSortedSet.Builder<CrawlerStats>(new Comparator<CrawlerStats>() { @Override public int compare(CrawlerStats o1, CrawlerStats o2) { return o1.compareTo(o2); } }); LOG.info("Request Completed. Building Aggregated Stats"); // ok aggregate stats for (Date date : values.keySet()) { // aggregate values for date CrawlerStats finalStat = CrawlerStatsCollection.combineValues(values.get(date)); // set final timestamp finalStat.setTimestamp(date.getTime()); // add to builder aggregatedStatsBuilder.add(finalStat); } // build final set ... ImmutableSortedSet<CrawlerStats> aggregatedStats = aggregatedStatsBuilder.build(); LOG.info("Aggregation Completed Result Count:" + aggregatedStats.size()); // get requested columns List<ColumnDescription> requiredColumns = getRequiredColumns(query,URLSTATS_TABLE_COLUMNS); // add columns to data table data.addColumns(requiredColumns); // now collect results for (CrawlerStats object : aggregatedStats) { TableRow row = new TableRow(); for (ColumnDescription selectionColumn : requiredColumns) { addCellToRow(object,row,selectionColumn,hourly); } LOG.info("Added Row:" + row.toString() + " " + row.getCell(0).getValue().toString()); int cellIndex = 0; for (TableCell cell : row.getCells()) { LOG.info("Cell:" + cellIndex++ + " Value:" + cell.getValue()); } data.addRow(row); } } return data; } return null; } /** * Returns a list of required columns based on the query and the actual * columns. * * @param query The user selection query. * @param availableColumns The list of possible columns. * * @return A List of required columns for the requested data table. */ private List<ColumnDescription> getRequiredColumns(Query query, ColumnDescription[] availableColumns) { // Required columns List<ColumnDescription> requiredColumns = Lists.newArrayList(); for (ColumnDescription column : availableColumns) { if (isColumnRequested(query, column.getId())) { requiredColumns.add(column); } } return requiredColumns; } }