/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.queryserver.query;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.util.Vector;
import java.util.concurrent.Semaphore;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter;
import org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator;
import org.commoncrawl.hadoop.mergeutils.RawKeyValueComparator;
import org.commoncrawl.hadoop.mergeutils.SequenceFileMerger;
import org.commoncrawl.hadoop.mergeutils.SequenceFileReader;
import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter;
import org.commoncrawl.protocol.SubDomainMetadata;
import org.commoncrawl.service.queryserver.DomainListQueryInfo;
import org.commoncrawl.service.queryserver.ShardIndexHostNameTuple;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2;
import org.commoncrawl.service.queryserver.index.PositionBasedSequenceFileIndex;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2.MasterDatabaseIndex;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2.SlaveDatabaseIndex;
import org.commoncrawl.util.CCStringUtils;

/**
 *
 * @author rana
 *
 */
public class DomainListQuery extends Query<DomainListQueryInfo, Text, SubDomainMetadata> {

  private static final Log LOG = LogFactory.getLog(DomainListQuery.class);

  public final static String SORT_BY_NAME = "NAME";
  public final static String SORT_BY_URL_COUNT = "URLCOUNT";

  private final String getOutputFileNameBasedOnSortByField(String sortByField) throws IOException {
    if (sortByField.equals(SORT_BY_NAME)) {
      return "DATA_" + SORT_BY_NAME;
    } else if (sortByField.equals(SORT_BY_URL_COUNT)) {
      return "DATA_" + SORT_BY_URL_COUNT;
    }
    throw new IOException(sortByField + " is an INVALID SORT FIELD");
  }

  private final String getMergedResultsFileName() {
    return "DATA_" + SORT_BY_NAME;
  }

  public DomainListQuery() {
  }

  public DomainListQuery(DomainListQueryInfo queryInfo) {
    setQueryData(queryInfo);
  }

  @Override
  public String getCanonicalId() {
    return encodePatternAsFilename("DLQ:" + Query.encodePatternAsFilename(getQueryData().getSearchPattern()));
  }
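  /**
   * Runs the local phase of the query: merges the per-shard remote part files
   * into a single name-sorted result file on the local file system, re-sorts
   * that file by the requested sort field when necessary, and returns the
   * number of records in the final result file.
   */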
  @Override
  protected long executeLocal(FileSystem remoteFileSystem, Configuration conf,
      DatabaseIndexV2.MasterDatabaseIndex index, EventLoop eventLoop, File tempFirDir,
      QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> requestObject) throws IOException {

    Path mergeResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject) + getMergedResultsFileName());

    LOG.info("Execute Local called for Query:" + getQueryId() + " MergeResultsPath is:" + mergeResultsPath);

    // get a local file system object
    FileSystem localFileSystem = FileSystem.getLocal(conf);

    //LOG.info("Executing LocalQuery - checking if MergedFile:" + mergeResultsPath + " Exists");
    // if source merged results path does not exist ...
    if (!localFileSystem.exists(mergeResultsPath)) {

      LOG.info("Execute Local for Query:" + getQueryId() + " Source MergeFile:" + mergeResultsPath + " Not Found. Checking for parts files");

      // collect parts ...
      Vector<Path> parts = new Vector<Path>();

      FileStatus fileStatusArray[] = remoteFileSystem.globStatus(new Path(getHDFSQueryResultsPath(), "part-*"));

      if (fileStatusArray.length == 0) {
        LOG.error("Execute Local for Query:" + getQueryId() + " FAILED. No Parts Files Found!");
        throw new IOException("Remote Component Part Files Not Found");
      }

      for (FileStatus part : fileStatusArray) {
        //LOG.info("Found Part:" + part);
        parts.add(part.getPath());
      }

      LOG.info("Execute Local for Query:" + getQueryId() + " Initializing Merger");

      SequenceFileSpillWriter<Text, SubDomainMetadata> mergedFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
          localFileSystem, conf, mergeResultsPath, Text.class, SubDomainMetadata.class,
          new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
              PositionBasedSequenceFileIndex.getIndexNameFromBaseName(mergeResultsPath)), false);

      try {
        SequenceFileMerger<Text, SubDomainMetadata> merger = new SequenceFileMerger<Text, SubDomainMetadata>(
            remoteFileSystem,
            conf,
            parts,
            mergedFileSpillWriter,
            Text.class,
            SubDomainMetadata.class,

            new RawKeyValueComparator<Text, SubDomainMetadata>() {

              DataInputBuffer key1Stream = new DataInputBuffer();
              DataInputBuffer key2Stream = new DataInputBuffer();

              @Override
              public int compareRaw(byte[] key1Data, int key1Offset, int key1Length,
                  byte[] key2Data, int key2Offset, int key2Length,
                  byte[] value1Data, int value1Offset, int value1Length,
                  byte[] value2Data, int value2Offset, int value2Length) throws IOException {

                key1Stream.reset(key1Data, key1Offset, key1Length);
                key2Stream.reset(key2Data, key2Offset, key2Length);

                WritableUtils.readVInt(key1Stream);
                WritableUtils.readVInt(key2Stream);

                return BytesWritable.Comparator.compareBytes(
                    key1Data, key1Stream.getPosition(), key1Length - key1Stream.getPosition(),
                    key2Data, key2Stream.getPosition(), key2Length - key2Stream.getPosition());
              }

              @Override
              public int compare(Text key1, SubDomainMetadata value1, Text key2, SubDomainMetadata value2) {
                return key1.compareTo(key2);
              }

            });

        try {
          LOG.info("Execute Local for Query:" + getQueryId() + " Running Merger");
          merger.mergeAndSpill(null);
          LOG.info("Execute Local for Query:" + getQueryId() + " Merge Successful. Deleting Merge Inputs");
          for (Path inputPath : parts) {
            remoteFileSystem.delete(inputPath, false);
          }
        } catch (IOException e) {
          LOG.error("Execute Local for Query:" + getQueryId() + " Merge Failed with Exception:" + CCStringUtils.stringifyException(e));
          throw e;
        } finally {
          LOG.info("** CLOSING MERGER");
          merger.close();
        }
      } finally {
        LOG.info("** FLUSHING SPILLWRITER");
        mergedFileSpillWriter.close();
      }
    }

    // now check for query specific merge file ...
    Path queryResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject)
        + getOutputFileNameBasedOnSortByField(requestObject.getClientQueryInfo().getSortByField()));

    LOG.info("Execute Local for Query:" + getQueryId() + " Checking for QueryResultsPath:" + queryResultsPath);

    if (!localFileSystem.exists(queryResultsPath)) {

      LOG.info("Execute Local for Query:" + getQueryId() + " Results File:" + queryResultsPath + " does not exist. Running sort and merge process");

      LOG.info("Execute Local for Query:" + getQueryId() + " Allocating SpillWriter with output to:" + queryResultsPath);

      // allocate a spill writer ...
      SequenceFileSpillWriter<Text, SubDomainMetadata> sortedResultsFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
          localFileSystem, conf, queryResultsPath, Text.class, SubDomainMetadata.class,
          new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
              PositionBasedSequenceFileIndex.getIndexNameFromBaseName(queryResultsPath)), false);

      try {

        LOG.info("Execute Local for Query:" + getQueryId() + " Allocating MergeSortSpillWriter");

        // and connect it to the merge spill writer ...
        MergeSortSpillWriter<Text, SubDomainMetadata> mergeSortSpillWriter = new MergeSortSpillWriter<Text, SubDomainMetadata>(
            conf,
            sortedResultsFileSpillWriter,
            localFileSystem,
            new Path(tempFirDir.getAbsolutePath()),
            /*
            new RawKeyValueComparator<Text,SubDomainMetadata>() {

              SubDomainMetadata value1 = new SubDomainMetadata();
              SubDomainMetadata value2 = new SubDomainMetadata();

              @Override
              public int compare(Text key1, SubDomainMetadata value1, Text key2, SubDomainMetadata value2) {
                return value1.getUrlCount() - value2.getUrlCount();
              }

              @Override
              public int compareRaw(byte[] key1Data, int key1Offset, int key1Length,
                  byte[] key2Data, int key2Offset, int key2Length,
                  byte[] value1Data, int value1Offset, int value1Length,
                  byte[] value2Data, int value2Offset, int value2Length) throws IOException {

                value1.clear();
                value2.clear();

                value1.readFields(new DataInputStream(new ByteArrayInputStream(value1Data, value1Offset, value1Length)));
                value2.readFields(new DataInputStream(new ByteArrayInputStream(value2Data, value2Offset, value2Length)));

                return compare(null, value1, null, value2);
              }
            },
            */
            new OptimizedKeyGeneratorAndComparator<Text, SubDomainMetadata>() {

              @Override
              public void generateOptimizedKeyForPair(
                  Text key, SubDomainMetadata value,
                  org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey optimizedKeyOut)
                  throws IOException {
                optimizedKeyOut.setLongKeyValue(value.getUrlCount());
              }

              @Override
              public int getGeneratedKeyType() {
                return OptimizedKey.KEY_TYPE_LONG;
              }
            },
            Text.class, SubDomainMetadata.class, false, null);

        try {

          // create a vector representing the single input segment
          Vector<Path> singleInputSegment = new Vector<Path>();

          LOG.info("Execute Local for Query:" + getQueryId() + " Adding MergeResultsPath:" + mergeResultsPath + " as input for Merger");
          singleInputSegment.add(mergeResultsPath);

          // create a SequenceFileReader
          SequenceFileReader<Text, SubDomainMetadata> mergeSegmentReader = new SequenceFileReader<Text, SubDomainMetadata>(
              localFileSystem, conf, singleInputSegment, mergeSortSpillWriter, Text.class, SubDomainMetadata.class);

          try {
            LOG.info("Execute Local for Query:" + getQueryId() + " calling readAndSpill");
            mergeSegmentReader.readAndSpill();
            LOG.info("Execute Local for Query:" + getQueryId() + " readAndSpill finished");
          } finally {
            if (mergeSegmentReader != null) {
              mergeSegmentReader.close();
            }
          }
        } finally {
          if (mergeSortSpillWriter != null) {
            mergeSortSpillWriter.close();
          }
        }
      } finally {
        if (sortedResultsFileSpillWriter != null) {
          sortedResultsFileSpillWriter.close();
        }
      }
    }

    //LOG.info("Allocating SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath);
    PositionBasedSequenceFileIndex<Text, SubDomainMetadata> indexFile = new PositionBasedSequenceFileIndex<Text, SubDomainMetadata>(
        localFileSystem, queryResultsPath, Text.class, SubDomainMetadata.class);
    //LOG.info("SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath + " returned record count:" + indexFile.getRecordCount());

    return indexFile.getRecordCount();
  }

  @Override
  public void getCachedResults(
      FileSystem fileSystem,
      Configuration conf,
      EventLoop eventLoop,
      MasterDatabaseIndex masterIndex,
      QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> theClientRequest,
      QueryCompletionCallback<DomainListQueryInfo, Text, SubDomainMetadata> callback) throws IOException {

    LOG.info("getCachedResults called for Query:" + getQueryId());
    /*
    LOG.info("Retrieving Cached Results for Query:" + theClientRequest.getClientQueryInfo().getClientQueryId());
    LOG.info("Sort Field:" + theClientRequest.getClientQueryInfo().getSortByField());
    LOG.info("Sort Order:" + theClientRequest.getClientQueryInfo().getSortOrder());
    LOG.info("Pagination Offset:" + theClientRequest.getClientQueryInfo().getPaginationOffset());
    LOG.info("Page Size:" + theClientRequest.getClientQueryInfo().getPageSize());
    */
    FileSystem localFileSystem = FileSystem.getLocal(conf);

    String sortByField = theClientRequest.getClientQueryInfo().getSortByField();

    if (sortByField.equalsIgnoreCase(SORT_BY_NAME) || sortByField.equalsIgnoreCase(SORT_BY_URL_COUNT)) {

      Path outputFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest)
          + getOutputFileNameBasedOnSortByField(theClientRequest.getClientQueryInfo().getSortByField()));

      //LOG.info("Initializing index reader for outputFile:" + outputFileName);
      Path indexFileName = PositionBasedSequenceFileIndex.getIndexNameFromBaseName(outputFileName);
      //LOG.info("Index FileName is:" + indexFileName);
      PositionBasedSequenceFileIndex<Text, SubDomainMetadata> index = new PositionBasedSequenceFileIndex<Text, SubDomainMetadata>(
          localFileSystem, indexFileName, Text.class, SubDomainMetadata.class);

      QueryResult<Text, SubDomainMetadata> resultOut = new QueryResult<Text, SubDomainMetadata>();

      LOG.info("getCachedResults called for Query:" + getQueryId() + " Calling ReadPaginationResults");
      index.readPaginatedResults(localFileSystem, conf,
          theClientRequest.getClientQueryInfo().getSortOrder(),
          theClientRequest.getClientQueryInfo().getPaginationOffset(),
          theClientRequest.getClientQueryInfo().getPageSize(),
          resultOut);
      LOG.info("getCachedResults called for Query:" + getQueryId() + ". Initiating getCachedResults Callback");
      callback.queryComplete(theClientRequest, resultOut);
    }
  }
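  /**
   * Runs the remote phase of the query: for every shard this slave is
   * responsible for, a worker thread scans the domain index for the search
   * pattern and spills matching sub-domain records into a per-shard part file
   * on HDFS. Returns the cumulative record count across all shards.
   */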
  @Override
  protected long executeRemote(
      final FileSystem fileSystem,
      final Configuration conf,
      EventLoop eventLoop,
      SlaveDatabaseIndex instanceIndex,
      File tempFirDir,
      QueryProgressCallback<DomainListQueryInfo, Text, SubDomainMetadata> progressCallback) throws IOException {

    int shardsProcessed = 0;

    // ok create a semaphore for the number of shards we are going to query ...
    final Semaphore semaphore = new Semaphore(-(getCommonQueryInfo().getRelevantShardIds().size() - 1));
    // and create a record count array
    final long recordCounts[] = new long[getCommonQueryInfo().getRelevantShardIds().size()];
    final IOException exceptions[] = new IOException[getCommonQueryInfo().getRelevantShardIds().size()];

    int threadIdx = 0;

    // ok dispatch queries for each shard we are responsible for ...
    for (int shardId : getCommonQueryInfo().getRelevantShardIds()) {

      final int currentShardId = shardId;
      final int currentThreadIdx = threadIdx++;

      Thread subQueryThread = new Thread(new Runnable() {

        @Override
        public void run() {

          Path shardOutputPath = getHDFSQueryResultsFilePathForShard(currentShardId);
          LOG.info("Execute Remote for Query:" + getQueryId() + " for shardId:" + currentShardId + " Creating spill file @:" + shardOutputPath);

          try {
            // create SequenceFile Spill Writer ...
            SequenceFileSpillWriter<Text, SubDomainMetadata> spillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
                fileSystem, conf, shardOutputPath, Text.class, SubDomainMetadata.class, null, true);
            try {
              LOG.info("Execute Remote for Query:" + getQueryId() + " calling executeDomainListQuery on index");
              // scan index for matching patterns ... spill into writer ...
              recordCounts[currentThreadIdx] += _slaveDatabaseIndex.queryDomainsGivenPattern(
                  getQueryData().getSearchPattern(), currentShardId, spillWriter);
              LOG.info("Execute Remote for Query:" + getQueryId() + " executeDomainListQuery returned:" + recordCounts[currentThreadIdx]);
            } finally {
              spillWriter.close();
            }
          } catch (IOException e) {
            LOG.error("Execute Remote for Query:" + getQueryId() + " executeDomainListQuery failed with error:" + CCStringUtils.stringifyException(e));
            exceptions[currentThreadIdx] = e;
          } finally {
            // always release the semaphore, even if spill writer creation fails,
            // so the dispatching thread cannot block forever in acquireUninterruptibly
            semaphore.release();
          }
        }
      });

      subQueryThread.start();
    }

    // ok block until all queries are complete
    LOG.info("Query:" + getQueryId() + " Waiting on Worker Threads");
    semaphore.acquireUninterruptibly();
    LOG.info("Query:" + getQueryId() + " All Threads Completed");

    for (IOException e : exceptions) {
      if (e != null) {
        LOG.error("Query:" + getQueryId() + " Failed with Exception:" + CCStringUtils.stringifyException(e));
        throw e;
      }
    }

    long cumulativeRecordCount = 0L;
    for (long recordCount : recordCounts)
      cumulativeRecordCount += recordCount;
    return cumulativeRecordCount;
  }

  @Override
  public boolean cachedResultsAvailable(FileSystem fileSystem, Configuration conf, QueryRequest theClientRequest) throws IOException {

    FileSystem localFileSystem = FileSystem.getLocal(conf);

    Path outputFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest)
        + getOutputFileNameBasedOnSortByField(theClientRequest.getClientQueryInfo().getSortByField()));

    //LOG.info("Cached Results Available called for Query:" + theClientRequest.getSourceQuery().getQueryId() + ". Checking Path:" + outputFileName);
    //Path indexFileName = new Path(outputFileName.toString() + ".IDX");

    boolean result = localFileSystem.exists(outputFileName);
    //LOG.info("Cached Results Available called for Query:" + theClientRequest.getSourceQuery().getQueryId() + ". returning:" + result);
    return result;
  }
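  /**
   * Returns true if any per-shard part file required by this query is not yet
   * present on HDFS, in which case the query must be dispatched to the remote
   * slaves; returns false once all required parts are available.
   */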
  @Override
  public boolean requiresRemoteDispatch(FileSystem fileSystem, Configuration conf,
      ShardMapper shardMapper,
      QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> theClientRequest,
      ArrayList<ShardIndexHostNameTuple> shardIdToHostNameMapping) throws IOException {

    // get shard mappings for index ...
    shardIdToHostNameMapping.addAll(
        shardMapper.mapShardIdsForIndex(DatabaseIndexV2.MasterDatabaseIndex.INDEX_NAME_DOMAIN_NAME_TO_METADATA));

    // create a set representing the collection of parts required to complete this query ...
    Set<String> requiredParts = new HashSet<String>();
    for (ShardIndexHostNameTuple tuple : shardIdToHostNameMapping) {
      requiredParts.add(getPartNameForSlave(tuple.getShardId()));
    }

    // now iterate parts available on hdfs ...
    Path remoteQueryPath = getHDFSQueryResultsPath();
    //LOG.info("Results Path is:" + remoteQueryPath);

    FileStatus availableParts[] = fileSystem.globStatus(new Path(remoteQueryPath, "part-*"));

    for (FileStatus part : availableParts) {
      //LOG.info("Found Path:" + part.getPath());
      requiredParts.remove(part.getPath().getName());
    }

    // now check to see if all parts are available
    if (requiredParts.size() != 0) {
      for (String part : requiredParts) {
        LOG.info("Required remote part:" + part + " NOT available yet.");
      }
      return true;
    } else {
      LOG.info("All parts required for query available.");
      return false;
    }
  }
}