/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.queryserver.query;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.util.Vector;
import java.util.concurrent.Semaphore;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter;
import org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator;
import org.commoncrawl.hadoop.mergeutils.RawKeyValueComparator;
import org.commoncrawl.hadoop.mergeutils.SequenceFileMerger;
import org.commoncrawl.hadoop.mergeutils.SequenceFileReader;
import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter;
import org.commoncrawl.protocol.SubDomainMetadata;
import org.commoncrawl.service.queryserver.DomainListQueryInfo;
import org.commoncrawl.service.queryserver.ShardIndexHostNameTuple;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2;
import org.commoncrawl.service.queryserver.index.PositionBasedSequenceFileIndex;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2.MasterDatabaseIndex;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2.SlaveDatabaseIndex;
import org.commoncrawl.util.CCStringUtils;

/**
 *
 * @author rana
 *
 */
public class DomainListQuery extends Query<DomainListQueryInfo, Text, SubDomainMetadata> {

  private static final Log LOG = LogFactory.getLog(DomainListQuery.class);

  public final static String SORT_BY_NAME = "NAME";
  public final static String SORT_BY_URL_COUNT = "URLCOUNT";

  private final String getOutputFileNameBasedOnSortByField(String sortByField) throws IOException {
    if (sortByField.equals(SORT_BY_NAME)) {
      return "DATA_" + SORT_BY_NAME;
    } else if (sortByField.equals(SORT_BY_URL_COUNT)) {
      return "DATA_" + SORT_BY_URL_COUNT;
    }
    throw new IOException(sortByField + " is an INVALID SORT FIELD");
  }

  private final String getMergedResultsFileName() {
    return "DATA_" + SORT_BY_NAME;
  }

  public DomainListQuery() {
  }

  public DomainListQuery(DomainListQueryInfo queryInfo) {
    setQueryData(queryInfo);
  }

  @Override
  public String getCanonicalId() {
    return encodePatternAsFilename("DLQ:" + Query.encodePatternAsFilename(getQueryData().getSearchPattern()));
  }
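  /**
   * Runs the local phase of the query: merges the per-shard remote part files
   * into a single name-sorted result file on the local file system, re-sorts
   * that file by the requested sort field when necessary, and returns the
   * number of records in the final result file.
   */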
  @Override
  protected long executeLocal(FileSystem remoteFileSystem, Configuration conf,
      DatabaseIndexV2.MasterDatabaseIndex index, EventLoop eventLoop, File tempFirDir,
      QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> requestObject) throws IOException {

    Path mergeResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject) + getMergedResultsFileName());

    LOG.info("Execute Local called for Query:" + getQueryId() + " MergeResultsPath is:" + mergeResultsPath);

    // get a local file system object
    FileSystem localFileSystem = FileSystem.getLocal(conf);

    //LOG.info("Executing LocalQuery - checking if MergedFile:" + mergeResultsPath + " Exists");
    // if source merged results path does not exist ...
    if (!localFileSystem.exists(mergeResultsPath)) {

      LOG.info("Execute Local for Query:" + getQueryId() + " Source MergeFile:" + mergeResultsPath + " Not Found. Checking for parts files");

      // collect parts ...
      Vector<Path> parts = new Vector<Path>();

      FileStatus fileStatusArray[] = remoteFileSystem.globStatus(new Path(getHDFSQueryResultsPath(), "part-*"));

      if (fileStatusArray.length == 0) {
        LOG.error("Execute Local for Query:" + getQueryId() + " FAILED. No Parts Files Found!");
        throw new IOException("Remote Component Part Files Not Found");
      }

      for (FileStatus part : fileStatusArray) {
        //LOG.info("Found Part:" + part);
        parts.add(part.getPath());
      }

      LOG.info("Execute Local for Query:" + getQueryId() + " Initializing Merger");

      SequenceFileSpillWriter<Text, SubDomainMetadata> mergedFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
          localFileSystem, conf, mergeResultsPath, Text.class, SubDomainMetadata.class,
          new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
              PositionBasedSequenceFileIndex.getIndexNameFromBaseName(mergeResultsPath)), false);

      try {
        SequenceFileMerger<Text, SubDomainMetadata> merger = new SequenceFileMerger<Text, SubDomainMetadata>(
            remoteFileSystem,
            conf,
            parts,
            mergedFileSpillWriter,
            Text.class,
            SubDomainMetadata.class,

            new RawKeyValueComparator<Text, SubDomainMetadata>() {

              DataInputBuffer key1Stream = new DataInputBuffer();
              DataInputBuffer key2Stream = new DataInputBuffer();

              @Override
              public int compareRaw(byte[] key1Data, int key1Offset, int key1Length,
                  byte[] key2Data, int key2Offset, int key2Length,
                  byte[] value1Data, int value1Offset, int value1Length,
                  byte[] value2Data, int value2Offset, int value2Length) throws IOException {

                key1Stream.reset(key1Data, key1Offset, key1Length);
                key2Stream.reset(key2Data, key2Offset, key2Length);

                WritableUtils.readVInt(key1Stream);
                WritableUtils.readVInt(key2Stream);

                return BytesWritable.Comparator.compareBytes(
                    key1Data, key1Stream.getPosition(), key1Length - key1Stream.getPosition(),
                    key2Data, key2Stream.getPosition(), key2Length - key2Stream.getPosition());
              }

              @Override
              public int compare(Text key1, SubDomainMetadata value1, Text key2, SubDomainMetadata value2) {
                return key1.compareTo(key2);
              }

            });

        try {
          LOG.info("Execute Local for Query:" + getQueryId() + " Running Merger");
          merger.mergeAndSpill(null);
          LOG.info("Execute Local for Query:" + getQueryId() + " Merge Successful. Deleting Merge Inputs");
          for (Path inputPath : parts) {
            remoteFileSystem.delete(inputPath, false);
          }
        } catch (IOException e) {
          LOG.error("Execute Local for Query:" + getQueryId() + " Merge Failed with Exception:" + CCStringUtils.stringifyException(e));
          throw e;
        } finally {
          LOG.info("** CLOSING MERGER");
          merger.close();
        }
      } finally {
        LOG.info("** FLUSHING SPILLWRITER");
        mergedFileSpillWriter.close();
      }
    }

    // now check for query specific merge file ...
    Path queryResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject)
        + getOutputFileNameBasedOnSortByField(requestObject.getClientQueryInfo().getSortByField()));

    LOG.info("Execute Local for Query:" + getQueryId() + " Checking for QueryResultsPath:" + queryResultsPath);

    if (!localFileSystem.exists(queryResultsPath)) {

      LOG.info("Execute Local for Query:" + getQueryId() + " Results File:" + queryResultsPath + " does not exist. Running sort and merge process");

      LOG.info("Execute Local for Query:" + getQueryId() + " Allocating SpillWriter with output to:" + queryResultsPath);

      // allocate a spill writer ...
      SequenceFileSpillWriter<Text, SubDomainMetadata> sortedResultsFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
          localFileSystem, conf, queryResultsPath, Text.class, SubDomainMetadata.class,
          new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
              PositionBasedSequenceFileIndex.getIndexNameFromBaseName(queryResultsPath)), false);

      try {

        LOG.info("Execute Local for Query:" + getQueryId() + " Allocating MergeSortSpillWriter");

        // and connect it to the merge spill writer ...
        MergeSortSpillWriter<Text, SubDomainMetadata> mergeSortSpillWriter = new MergeSortSpillWriter<Text, SubDomainMetadata>(
            conf,
            sortedResultsFileSpillWriter,
            localFileSystem,
            new Path(tempFirDir.getAbsolutePath()),
            /*
            new RawKeyValueComparator<Text,SubDomainMetadata>() {

              SubDomainMetadata value1 = new SubDomainMetadata();
              SubDomainMetadata value2 = new SubDomainMetadata();

              @Override
              public int compare(Text key1, SubDomainMetadata value1, Text key2, SubDomainMetadata value2) {
                return value1.getUrlCount() - value2.getUrlCount();
              }

              @Override
              public int compareRaw(byte[] key1Data, int key1Offset, int key1Length,
                  byte[] key2Data, int key2Offset, int key2Length,
                  byte[] value1Data, int value1Offset, int value1Length,
                  byte[] value2Data, int value2Offset, int value2Length) throws IOException {

                value1.clear();
                value2.clear();

                value1.readFields(new DataInputStream(new ByteArrayInputStream(value1Data, value1Offset, value1Length)));
                value2.readFields(new DataInputStream(new ByteArrayInputStream(value2Data, value2Offset, value2Length)));

                return compare(null, value1, null, value2);
              }
            },
            */
            new OptimizedKeyGeneratorAndComparator<Text, SubDomainMetadata>() {

              @Override
              public void generateOptimizedKeyForPair(
                  Text key, SubDomainMetadata value,
                  org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey optimizedKeyOut)
                  throws IOException {
                optimizedKeyOut.setLongKeyValue(value.getUrlCount());
              }

              @Override
              public int getGeneratedKeyType() {
                return OptimizedKey.KEY_TYPE_LONG;
              }
            },
            Text.class, SubDomainMetadata.class, false, null);

        try {

          // create a vector representing the single input segment
          Vector<Path> singleInputSegment = new Vector<Path>();

          LOG.info("Execute Local for Query:" + getQueryId() + " Adding MergeResultsPath:" + mergeResultsPath + " as input for Merger");
          singleInputSegment.add(mergeResultsPath);

          // create a SequenceFileReader
          SequenceFileReader<Text, SubDomainMetadata> mergeSegmentReader = new SequenceFileReader<Text, SubDomainMetadata>(
              localFileSystem, conf, singleInputSegment, mergeSortSpillWriter, Text.class, SubDomainMetadata.class);

          try {
            LOG.info("Execute Local for Query:" + getQueryId() + " calling readAndSpill");
            mergeSegmentReader.readAndSpill();
            LOG.info("Execute Local for Query:" + getQueryId() + " readAndSpill finished");
          } finally {
            if (mergeSegmentReader != null) {
              mergeSegmentReader.close();
            }
          }
        } finally {
          if (mergeSortSpillWriter != null) {
            mergeSortSpillWriter.close();
          }
        }
      } finally {
        if (sortedResultsFileSpillWriter != null) {
          sortedResultsFileSpillWriter.close();
        }
      }
    }

    //LOG.info("Allocating SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath);
    PositionBasedSequenceFileIndex<Text, SubDomainMetadata> indexFile = new PositionBasedSequenceFileIndex<Text, SubDomainMetadata>(
        localFileSystem, queryResultsPath, Text.class, SubDomainMetadata.class);
    //LOG.info("SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath + " returned record count:" + indexFile.getRecordCount());

    return indexFile.getRecordCount();
  }

  @Override
  public void getCachedResults(
      FileSystem fileSystem,
      Configuration conf,
      EventLoop eventLoop,
      MasterDatabaseIndex masterIndex,
      QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> theClientRequest,
      QueryCompletionCallback<DomainListQueryInfo, Text, SubDomainMetadata> callback) throws IOException {

    LOG.info("getCachedResults called for Query:" + getQueryId());
    /*
    LOG.info("Retrieving Cached Results for Query:" + theClientRequest.getClientQueryInfo().getClientQueryId());
    LOG.info("Sort Field:" + theClientRequest.getClientQueryInfo().getSortByField());
    LOG.info("Sort Order:" + theClientRequest.getClientQueryInfo().getSortOrder());
    LOG.info("Pagination Offset:" + theClientRequest.getClientQueryInfo().getPaginationOffset());
    LOG.info("Page Size:" + theClientRequest.getClientQueryInfo().getPageSize());
    */
    FileSystem localFileSystem = FileSystem.getLocal(conf);

    String sortByField = theClientRequest.getClientQueryInfo().getSortByField();

    if (sortByField.equalsIgnoreCase(SORT_BY_NAME) || sortByField.equalsIgnoreCase(SORT_BY_URL_COUNT)) {

      Path outputFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest)
          + getOutputFileNameBasedOnSortByField(theClientRequest.getClientQueryInfo().getSortByField()));

      //LOG.info("Initializing index reader for outputFile:" + outputFileName);
      Path indexFileName = PositionBasedSequenceFileIndex.getIndexNameFromBaseName(outputFileName);
      //LOG.info("Index FileName is:" + indexFileName);
      PositionBasedSequenceFileIndex<Text, SubDomainMetadata> index = new PositionBasedSequenceFileIndex<Text, SubDomainMetadata>(
          localFileSystem, indexFileName, Text.class, SubDomainMetadata.class);

      QueryResult<Text, SubDomainMetadata> resultOut = new QueryResult<Text, SubDomainMetadata>();

      LOG.info("getCachedResults called for Query:" + getQueryId() + " Calling ReadPaginationResults");
      index.readPaginatedResults(localFileSystem, conf,
          theClientRequest.getClientQueryInfo().getSortOrder(),
          theClientRequest.getClientQueryInfo().getPaginationOffset(),
          theClientRequest.getClientQueryInfo().getPageSize(),
          resultOut);
      LOG.info("getCachedResults called for Query:" + getQueryId() + ". Initiating getCachedResults Callback");
      callback.queryComplete(theClientRequest, resultOut);
    }
  }
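  /**
   * Runs the remote phase of the query: for every shard this slave is
   * responsible for, a worker thread scans the domain index for the search
   * pattern and spills matching sub-domain records into a per-shard part file
   * on HDFS. Returns the cumulative record count across all shards.
   */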
  @Override
  protected long executeRemote(
      final FileSystem fileSystem,
      final Configuration conf,
      EventLoop eventLoop,
      SlaveDatabaseIndex instanceIndex,
      File tempFirDir,
      QueryProgressCallback<DomainListQueryInfo, Text, SubDomainMetadata> progressCallback) throws IOException {

    int shardsProcessed = 0;

    // ok create a semaphore for the number of shards we are going to query ...
    final Semaphore semaphore = new Semaphore(-(getCommonQueryInfo().getRelevantShardIds().size() - 1));
    // and create a record count array
    final long recordCounts[] = new long[getCommonQueryInfo().getRelevantShardIds().size()];
    final IOException exceptions[] = new IOException[getCommonQueryInfo().getRelevantShardIds().size()];

    int threadIdx = 0;

    // ok dispatch queries for each shard we are responsible for ...
    for (int shardId : getCommonQueryInfo().getRelevantShardIds()) {

      final int currentShardId = shardId;
      final int currentThreadIdx = threadIdx++;

      Thread subQueryThread = new Thread(new Runnable() {

        @Override
        public void run() {

          Path shardOutputPath = getHDFSQueryResultsFilePathForShard(currentShardId);
          LOG.info("Execute Remote for Query:" + getQueryId() + " for shardId:" + currentShardId + " Creating spill file @:" + shardOutputPath);

          try {
            // create SequenceFile Spill Writer ...
            SequenceFileSpillWriter<Text, SubDomainMetadata> spillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
                fileSystem, conf, shardOutputPath, Text.class, SubDomainMetadata.class, null, true);
            try {
              LOG.info("Execute Remote for Query:" + getQueryId() + " calling executeDomainListQuery on index");
              // scan index for matching patterns ... spill into writer ...
              recordCounts[currentThreadIdx] += _slaveDatabaseIndex.queryDomainsGivenPattern(
                  getQueryData().getSearchPattern(), currentShardId, spillWriter);
              LOG.info("Execute Remote for Query:" + getQueryId() + " executeDomainListQuery returned:" + recordCounts[currentThreadIdx]);
            } finally {
              spillWriter.close();
            }
          } catch (IOException e) {
            LOG.error("Execute Remote for Query:" + getQueryId() + " executeDomainListQuery failed with error:" + CCStringUtils.stringifyException(e));
            exceptions[currentThreadIdx] = e;
          } finally {
            // always release the semaphore, even if spill writer creation fails,
            // so the dispatching thread cannot block forever in acquireUninterruptibly
            semaphore.release();
          }
        }
      });

      subQueryThread.start();
    }

    // ok block until all queries are complete
    LOG.info("Query:" + getQueryId() + " Waiting on Worker Threads");
    semaphore.acquireUninterruptibly();
    LOG.info("Query:" + getQueryId() + " All Threads Completed");

    for (IOException e : exceptions) {
      if (e != null) {
        LOG.error("Query:" + getQueryId() + " Failed with Exception:" + CCStringUtils.stringifyException(e));
        throw e;
      }
    }

    long cumulativeRecordCount = 0L;
    for (long recordCount : recordCounts)
      cumulativeRecordCount += recordCount;
    return cumulativeRecordCount;
  }

  @Override
  public boolean cachedResultsAvailable(FileSystem fileSystem, Configuration conf, QueryRequest theClientRequest) throws IOException {

    FileSystem localFileSystem = FileSystem.getLocal(conf);

    Path outputFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest)
        + getOutputFileNameBasedOnSortByField(theClientRequest.getClientQueryInfo().getSortByField()));

    //LOG.info("Cached Results Available called for Query:" + theClientRequest.getSourceQuery().getQueryId() + ". Checking Path:" + outputFileName);
    //Path indexFileName = new Path(outputFileName.toString() + ".IDX");

    boolean result = localFileSystem.exists(outputFileName);
    //LOG.info("Cached Results Available called for Query:" + theClientRequest.getSourceQuery().getQueryId() + ". returning:" + result);
    return result;
  }
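  /**
   * Returns true if any per-shard part file required by this query is not yet
   * present on HDFS, in which case the query must be dispatched to the remote
   * slaves; returns false once all required parts are available.
   */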
  @Override
  public boolean requiresRemoteDispatch(FileSystem fileSystem, Configuration conf,
      ShardMapper shardMapper,
      QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> theClientRequest,
      ArrayList<ShardIndexHostNameTuple> shardIdToHostNameMapping) throws IOException {

    // get shard mappings for index ...
    shardIdToHostNameMapping.addAll(
        shardMapper.mapShardIdsForIndex(DatabaseIndexV2.MasterDatabaseIndex.INDEX_NAME_DOMAIN_NAME_TO_METADATA));

    // create a set representing the collection of parts required to complete this query ...
    Set<String> requiredParts = new HashSet<String>();
    for (ShardIndexHostNameTuple tuple : shardIdToHostNameMapping) {
      requiredParts.add(getPartNameForSlave(tuple.getShardId()));
    }

    // now iterate parts available on hdfs ...
    Path remoteQueryPath = getHDFSQueryResultsPath();
    //LOG.info("Results Path is:" + remoteQueryPath);

    FileStatus availableParts[] = fileSystem.globStatus(new Path(remoteQueryPath, "part-*"));

    for (FileStatus part : availableParts) {
      //LOG.info("Found Path:" + part.getPath());
      requiredParts.remove(part.getPath().getName());
    }

    // now check to see if all parts are available
    if (requiredParts.size() != 0) {
      for (String part : requiredParts) {
        LOG.info("Required remote part:" + part + " NOT available yet.");
      }
      return true;
    } else {
      LOG.info("All parts required for query available.");
      return false;
    }
  }
}