InverseLinksByDomainQuery.java example

Explorer
commoncrawl-crawler-master
- src
  - com
    - dappit
      - Dapper
        parser
        CompressedDomBuilder.java
        DebugDocumentBuilder.java
        DocumentBuilder.java
        DomDocumentBuilder.java
        EnviromentController.java
        HTMLParser.java
        InstructionsPool.java
        LinkExtractionDocumentBuilder.java
        MozillaParser.java
        ParserException.java
        ParserInitializationException.java
        ParserInstruction.java
  - org
    - commoncrawl
/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.queryserver.query;

import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Map;
import java.util.TreeMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.file.tfile.TFile;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.pipelineV1.InverseLinksByDomainDBBuilder.ComplexKeyComparator;
import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter;
import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.service.queryserver.InlinksByDomainQueryInfo;
import org.commoncrawl.service.queryserver.ShardIndexHostNameTuple;
import org.commoncrawl.service.queryserver.index.PositionBasedSequenceFileIndex;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2.MasterDatabaseIndex;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2.SlaveDatabaseIndex;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.FileUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;

/**
 * 
 * @author rana
 *
 */
public class InverseLinksByDomainQuery extends Query<InlinksByDomainQueryInfo,FlexBuffer,URLFPV2>{

	private static final Log LOG = LogFactory.getLog(InverseLinksByDomainQuery.class);
	private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }		
	
  public InverseLinksByDomainQuery() { 
    
  }
  
  public InverseLinksByDomainQuery(InlinksByDomainQueryInfo queryInfo) { 
    setQueryData(queryInfo);
  }
  
  
  static Map<Integer,PositionBasedSequenceFileIndex> _shardToIndexMap = new TreeMap<Integer,PositionBasedSequenceFileIndex>();
	static void collectAllTopLevelDomainRecordsByDomain(FileSystem fs,Configuration conf,long databaseId,long targetRootDomainFP,FileSystem outputFileSystem,Path finalOutputPath) throws IOException {

		File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis());
		tempFile.mkdir();

		try { 
			// create the final output spill writer ...  
			SequenceFileSpillWriter<FlexBuffer,URLFPV2> spillwriter 
				= new SequenceFileSpillWriter<FlexBuffer,URLFPV2>(
						outputFileSystem,conf,finalOutputPath,FlexBuffer.class,URLFPV2.class,
						new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem,PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath))
						,true);
			
			
			try {
				
				MergeSortSpillWriter<FlexBuffer,URLFPV2> finalMerger 
				= new MergeSortSpillWriter<FlexBuffer,URLFPV2>(
						conf,
						spillwriter,
						FileSystem.getLocal(conf),						
						new Path(tempFile.getAbsolutePath()),
						null,
						new ComplexKeyComparator(),
						FlexBuffer.class,
						URLFPV2.class,true,null);
			
				try { 
	
					for (int targetShardId=0;targetShardId<CrawlEnvironment.NUM_DB_SHARDS;++targetShardId) {
						// 0. shard domain id to find index file location ... 
						int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS);
						// build path to index file 
						Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId + "/phase3Data/part-" + NUMBER_FORMAT.format(indexShardId));
						LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId + " Index Path:" + indexFilePath);
						// 1. scan domainFP to index file first
						// 2. given index, scan index->pos file to find scan start position
						// 3. given scan start position, scan forward until fp match is found.
						// 4. collect all matching entries and output to a file ? 
					
								FSDataInputStream indexDataInputStream = fs.open(indexFilePath);
								try { 
									TFile.Reader reader = new TFile.Reader(indexDataInputStream,fs.getFileStatus(indexFilePath).getLen(),conf);
									try { 
										TFile.Reader.Scanner scanner = reader.createScanner();
										
										try { 
											// generate key ... 
											DataOutputBuffer keyBuffer = new DataOutputBuffer();
											keyBuffer.writeLong(targetRootDomainFP);
											if (scanner.seekTo(keyBuffer.getData(),0,keyBuffer.getLength())) {
												// setup for value scan 
												DataInputStream valueStream =  scanner.entry().getValueStream();
												int dataOffsetOut = -1;
												while (valueStream.available() >0) { 
													// read entries looking for our specific entry
													int shardIdx = valueStream.readInt();
													int dataOffset = valueStream.readInt();
													if (shardIdx == targetShardId) { 
														dataOffsetOut = dataOffset;
														break;
													}
												}
												LOG.info("Index Search Yielded:"+ dataOffsetOut);
												if (dataOffsetOut != -1) { 
													// ok create a data path 
													Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId));
													Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId) + ".index");
													// check to see if index is already loaded ... 
													PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null;
													synchronized(_shardToIndexMap) { 
														index = _shardToIndexMap.get(targetShardId);
													}
													if (index == null) {
														LOG.info("Loading Index from Path:" + finalDataIndexPath);
														// load index
														index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>(fs, finalDataIndexPath, FlexBuffer.class, TextBytes.class);
														// put in cache
														synchronized (_shardToIndexMap) {
							                _shardToIndexMap.put(targetShardId, index);
						                }
													}
													
													LOG.info("Initializing Data Reader at Path:" + finalDataPath);
													// ok time to create a reader 
													SequenceFile.Reader dataReader = new SequenceFile.Reader(fs,finalDataPath,conf);
													
													try { 
														LOG.info("Seeking Reader to Index Position:" + dataOffsetOut);
														index.seekReaderToItemAtIndex(dataReader,dataOffsetOut);
														
														FlexBuffer keyBytes 	= new FlexBuffer();
														URLFPV2  	 sourceFP   = new URLFPV2();
														DataInputBuffer keyReader = new DataInputBuffer();
														TextBytes  urlTxt  = new TextBytes();
														
														// ok read to go ... 
														while (dataReader.next(keyBytes,sourceFP)) { 
															// initialize reader 
															keyReader.reset(keyBytes.get(),keyBytes.getOffset(),keyBytes.getCount());
															
															long targetFP = keyReader.readLong();
															
															
															if (targetRootDomainFP == targetFP) { 
																finalMerger.spillRecord(keyBytes, sourceFP);
															}
															else { 
																LOG.info("FP:"+ targetFP + " > TargetFP:" + targetRootDomainFP + " Exiting Iteration Loop");
																break;
															}
														}
													}
													finally { 
														LOG.info("Closing Reader");
														dataReader.close();
													}
												}
											}
										}
										finally {
											LOG.info("Closing Scanner");
											scanner.close();
										}
											
									}
									finally { 
										LOG.info("Closing TFile Reader");
										reader.close();
									}
								}
								finally { 
									LOG.info("Closing InputStream");
									indexDataInputStream.close();
								}
							}
					}
					finally { 
						finalMerger.close();
					}
			}
			finally { 
				spillwriter.close();
			}
		}
		catch (IOException e) { 
			LOG.error(CCStringUtils.stringifyException(e));
			FileUtils.recursivelyDeleteFile(tempFile);
		}
		
	}
	
	public static void main(String[] args) {
    // initialize ...
    Configuration conf = new Configuration();
    
    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("core-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("mapred-site.xml");
    
    LOG.info("URL:" + args[0] + " ShardId:" + args[1]);

    try { 
	    File tempFile = File.createTempFile("inverseLinksReportTest", "seq");
	    try { 
	    	FileSystem fs = FileSystem.get(conf);
	    	FileSystem localFileSystem = FileSystem.getLocal(conf);
	    	
	    	URLFPV2 fp = URLUtils.getURLFPV2FromURL(args[0]);
	    	if (fp != null) { 
	  			collectAllTopLevelDomainRecordsByDomain(fs,conf,1282844121161L,fp.getRootDomainHash(),localFileSystem,new Path(tempFile.getAbsolutePath()));
	  			
	  			SequenceFile.Reader reader = new SequenceFile.Reader(localFileSystem,new Path(tempFile.getAbsolutePath()) , conf);
	  			try { 
	  				FlexBuffer key = new FlexBuffer();
	  				URLFPV2 	 src = new URLFPV2();
	  				TextBytes  url = new TextBytes();
	  				
	  				DataInputBuffer inputBuffer = new DataInputBuffer();
	  				
	  				while (reader.next(key, src)) { 
	  					inputBuffer.reset(key.get(),key.getOffset(),key.getCount());
	  					long targetFP = inputBuffer.readLong();
	  					float pageRank = inputBuffer.readFloat();
	  					// ok initialize text bytes ... 
	  					int textLen = WritableUtils.readVInt(inputBuffer);
	  					url.set(key.get(), inputBuffer.getPosition(), textLen);
	  					LOG.info("PR:"+ pageRank + " URL:" + url.toString());
	  				}
	  			}
	  			finally { 
	  				reader.close();
	  			}
	    	}
	    }
	    catch (IOException e) { 
	    	LOG.error(CCStringUtils.stringifyException(e));
	    	// tempFile.delete();
	    }
    }
    catch (IOException e) { 
    	LOG.error(CCStringUtils.stringifyException(e));
    }
  }

	@Override
  public boolean cachedResultsAvailable(
      FileSystem fileSystem,
      Configuration conf,
      QueryRequest<InlinksByDomainQueryInfo, FlexBuffer,URLFPV2> theClientRequest)
      throws IOException {

		FileSystem localFileSystem = FileSystem.getLocal(conf);
    Path urlOutputFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest) + "DATA");
    
    LOG.info("Cached Results Available called for Query:" + theClientRequest.getSourceQuery().getQueryId() + ". Checking Path:" +  urlOutputFileName);
    return localFileSystem.exists(urlOutputFileName);
		
  }

	@Override
  protected long executeRemote(
      FileSystem fileSyste,
      Configuration conf,
      EventLoop eventLoop,
      SlaveDatabaseIndex instanceIndex,
      File tempFirDir,
      QueryProgressCallback<InlinksByDomainQueryInfo, FlexBuffer,URLFPV2> progressCallback)
      throws IOException {
	  // TODO Auto-generated method stub
	  return 0;
  }

	@Override
  public void getCachedResults(
      FileSystem fileSystem,
      Configuration conf,
      EventLoop eventLoop,
      MasterDatabaseIndex masterIndex,
      QueryRequest<InlinksByDomainQueryInfo, FlexBuffer,URLFPV2> theClientRequest,
      QueryCompletionCallback<InlinksByDomainQueryInfo, FlexBuffer,URLFPV2> callback)
      throws IOException {
		
    FileSystem localFileSystem = FileSystem.getLocal(conf);
		
    Path outputFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest)+"DATA");
    
    //LOG.info("Initializing index reader for outputFile:" + outputFileName);
    Path indexFileName = PositionBasedSequenceFileIndex.getIndexNameFromBaseName(outputFileName);
    //LOG.info("Index FileName is:" + indexFileName);
    
    PositionBasedSequenceFileIndex<FlexBuffer,URLFPV2> index = new PositionBasedSequenceFileIndex<FlexBuffer,URLFPV2>(localFileSystem,indexFileName,FlexBuffer.class,URLFPV2.class);
    
    QueryResult<FlexBuffer,URLFPV2> resultOut = new QueryResult<FlexBuffer,URLFPV2>(); 
    
    LOG.info("getCachedResults called for Query:" + getQueryId() +" Calling ReadPaginationResults");
    index.readPaginatedResults(localFileSystem, conf,
        theClientRequest.getClientQueryInfo().getSortOrder(),
        theClientRequest.getClientQueryInfo().getPaginationOffset(),
        theClientRequest.getClientQueryInfo().getPageSize(),
        resultOut);
    
    LOG.info("getCachedResults called for Query:" + getQueryId() +". Initiating getCachedResults Callback");
    callback.queryComplete(theClientRequest,resultOut);	  
  }
	
  
	@Override
  public String getCanonicalId() {
	  return encodePatternAsFilename("ILBD:" + getQueryData().getDomainName());
  }

	@Override
  public boolean requiresRemoteDispatch(
      FileSystem fileSystem,
      Configuration conf,
      ShardMapper shardMapper,
      QueryRequest<InlinksByDomainQueryInfo, FlexBuffer,URLFPV2> theClientRequest,
      ArrayList<ShardIndexHostNameTuple> shardIdToHostNameMapping)
      throws IOException {
	  return false;
  }
	
	@Override
	protected long executeLocal(
	    FileSystem fileSystem,
	    Configuration conf,
	    MasterDatabaseIndex index,
	    EventLoop eventLoop,
	    File tempFirDir,
	    QueryRequest<InlinksByDomainQueryInfo, FlexBuffer,URLFPV2> requestObject)
	    throws IOException {
		
		
		
  	LocalFileSystem localFS = FileSystem.getLocal(conf);
  	
    Path localURLListPath = new Path(getLocalQueryResultsPathPrefix(requestObject)+"DATA");
    Path localURLListIndexPath = new Path(getLocalQueryResultsPathPrefix(requestObject)+"DATA.index");
    
    LOG.info("executeLocal called. Domain:" + getQueryData().getDomainName() + " cacheFilename:" + localURLListPath);
    
    localFS.delete(localURLListPath,false);
    localFS.delete(localURLListIndexPath,false);		
    
    String queryDomain = getQueryData().getDomainName();
    
    if (queryDomain.length() != 0) { 
    
    	String url = "http://" + queryDomain + "/";
    	
    	URLFPV2 fp = URLUtils.getURLFPV2FromURL(url);
    	
    	if (fp != null) { 
    		return index.collectAllTopLevelDomainRecordsByDomain(fileSystem, conf, fp.getRootDomainHash(), localFS, localURLListPath);
    	}
    }
		throw new IOException("Invalid Domain Name:" + queryDomain);
	}
}