/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.service.queryserver.query; import java.io.DataInputStream; import java.io.File; import java.io.IOException; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Map; import java.util.TreeMap; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.io.file.tfile.TFile; import org.commoncrawl.async.EventLoop; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.mapred.pipelineV1.InverseLinksByDomainDBBuilder.ComplexKeyComparator; import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter; import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter; import org.commoncrawl.protocol.URLFPV2; import org.commoncrawl.service.queryserver.InlinksByDomainQueryInfo; import org.commoncrawl.service.queryserver.ShardIndexHostNameTuple; import org.commoncrawl.service.queryserver.index.PositionBasedSequenceFileIndex; import org.commoncrawl.service.queryserver.index.DatabaseIndexV2.MasterDatabaseIndex; import org.commoncrawl.service.queryserver.index.DatabaseIndexV2.SlaveDatabaseIndex; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.FileUtils; import org.commoncrawl.util.FlexBuffer; import org.commoncrawl.util.TextBytes; import org.commoncrawl.util.URLUtils; /** * * @author rana * */ public class InverseLinksByDomainQuery extends Query<InlinksByDomainQueryInfo,FlexBuffer,URLFPV2>{ private static final Log LOG = LogFactory.getLog(InverseLinksByDomainQuery.class); private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); static { NUMBER_FORMAT.setMinimumIntegerDigits(5); NUMBER_FORMAT.setGroupingUsed(false); } public InverseLinksByDomainQuery() { } public InverseLinksByDomainQuery(InlinksByDomainQueryInfo queryInfo) { setQueryData(queryInfo); } static Map<Integer,PositionBasedSequenceFileIndex> _shardToIndexMap = new TreeMap<Integer,PositionBasedSequenceFileIndex>(); static void collectAllTopLevelDomainRecordsByDomain(FileSystem fs,Configuration conf,long databaseId,long targetRootDomainFP,FileSystem outputFileSystem,Path finalOutputPath) throws IOException { File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis()); tempFile.mkdir(); try { // create the final output spill writer ... SequenceFileSpillWriter<FlexBuffer,URLFPV2> spillwriter = new SequenceFileSpillWriter<FlexBuffer,URLFPV2>( outputFileSystem,conf,finalOutputPath,FlexBuffer.class,URLFPV2.class, new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem,PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath)) ,true); try { MergeSortSpillWriter<FlexBuffer,URLFPV2> finalMerger = new MergeSortSpillWriter<FlexBuffer,URLFPV2>( conf, spillwriter, FileSystem.getLocal(conf), new Path(tempFile.getAbsolutePath()), null, new ComplexKeyComparator(), FlexBuffer.class, URLFPV2.class,true,null); try { for (int targetShardId=0;targetShardId<CrawlEnvironment.NUM_DB_SHARDS;++targetShardId) { // 0. shard domain id to find index file location ... int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS); // build path to index file Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId + "/phase3Data/part-" + NUMBER_FORMAT.format(indexShardId)); LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId + " Index Path:" + indexFilePath); // 1. scan domainFP to index file first // 2. given index, scan index->pos file to find scan start position // 3. given scan start position, scan forward until fp match is found. // 4. collect all matching entries and output to a file ? FSDataInputStream indexDataInputStream = fs.open(indexFilePath); try { TFile.Reader reader = new TFile.Reader(indexDataInputStream,fs.getFileStatus(indexFilePath).getLen(),conf); try { TFile.Reader.Scanner scanner = reader.createScanner(); try { // generate key ... DataOutputBuffer keyBuffer = new DataOutputBuffer(); keyBuffer.writeLong(targetRootDomainFP); if (scanner.seekTo(keyBuffer.getData(),0,keyBuffer.getLength())) { // setup for value scan DataInputStream valueStream = scanner.entry().getValueStream(); int dataOffsetOut = -1; while (valueStream.available() >0) { // read entries looking for our specific entry int shardIdx = valueStream.readInt(); int dataOffset = valueStream.readInt(); if (shardIdx == targetShardId) { dataOffsetOut = dataOffset; break; } } LOG.info("Index Search Yielded:"+ dataOffsetOut); if (dataOffsetOut != -1) { // ok create a data path Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId)); Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId) + ".index"); // check to see if index is already loaded ... PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null; synchronized(_shardToIndexMap) { index = _shardToIndexMap.get(targetShardId); } if (index == null) { LOG.info("Loading Index from Path:" + finalDataIndexPath); // load index index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>(fs, finalDataIndexPath, FlexBuffer.class, TextBytes.class); // put in cache synchronized (_shardToIndexMap) { _shardToIndexMap.put(targetShardId, index); } } LOG.info("Initializing Data Reader at Path:" + finalDataPath); // ok time to create a reader SequenceFile.Reader dataReader = new SequenceFile.Reader(fs,finalDataPath,conf); try { LOG.info("Seeking Reader to Index Position:" + dataOffsetOut); index.seekReaderToItemAtIndex(dataReader,dataOffsetOut); FlexBuffer keyBytes = new FlexBuffer(); URLFPV2 sourceFP = new URLFPV2(); DataInputBuffer keyReader = new DataInputBuffer(); TextBytes urlTxt = new TextBytes(); // ok read to go ... while (dataReader.next(keyBytes,sourceFP)) { // initialize reader keyReader.reset(keyBytes.get(),keyBytes.getOffset(),keyBytes.getCount()); long targetFP = keyReader.readLong(); if (targetRootDomainFP == targetFP) { finalMerger.spillRecord(keyBytes, sourceFP); } else { LOG.info("FP:"+ targetFP + " > TargetFP:" + targetRootDomainFP + " Exiting Iteration Loop"); break; } } } finally { LOG.info("Closing Reader"); dataReader.close(); } } } } finally { LOG.info("Closing Scanner"); scanner.close(); } } finally { LOG.info("Closing TFile Reader"); reader.close(); } } finally { LOG.info("Closing InputStream"); indexDataInputStream.close(); } } } finally { finalMerger.close(); } } finally { spillwriter.close(); } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); FileUtils.recursivelyDeleteFile(tempFile); } } public static void main(String[] args) { // initialize ... Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("core-site.xml"); conf.addResource("hdfs-site.xml"); conf.addResource("mapred-site.xml"); LOG.info("URL:" + args[0] + " ShardId:" + args[1]); try { File tempFile = File.createTempFile("inverseLinksReportTest", "seq"); try { FileSystem fs = FileSystem.get(conf); FileSystem localFileSystem = FileSystem.getLocal(conf); URLFPV2 fp = URLUtils.getURLFPV2FromURL(args[0]); if (fp != null) { collectAllTopLevelDomainRecordsByDomain(fs,conf,1282844121161L,fp.getRootDomainHash(),localFileSystem,new Path(tempFile.getAbsolutePath())); SequenceFile.Reader reader = new SequenceFile.Reader(localFileSystem,new Path(tempFile.getAbsolutePath()) , conf); try { FlexBuffer key = new FlexBuffer(); URLFPV2 src = new URLFPV2(); TextBytes url = new TextBytes(); DataInputBuffer inputBuffer = new DataInputBuffer(); while (reader.next(key, src)) { inputBuffer.reset(key.get(),key.getOffset(),key.getCount()); long targetFP = inputBuffer.readLong(); float pageRank = inputBuffer.readFloat(); // ok initialize text bytes ... int textLen = WritableUtils.readVInt(inputBuffer); url.set(key.get(), inputBuffer.getPosition(), textLen); LOG.info("PR:"+ pageRank + " URL:" + url.toString()); } } finally { reader.close(); } } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); // tempFile.delete(); } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } @Override public boolean cachedResultsAvailable( FileSystem fileSystem, Configuration conf, QueryRequest<InlinksByDomainQueryInfo, FlexBuffer,URLFPV2> theClientRequest) throws IOException { FileSystem localFileSystem = FileSystem.getLocal(conf); Path urlOutputFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest) + "DATA"); LOG.info("Cached Results Available called for Query:" + theClientRequest.getSourceQuery().getQueryId() + ". Checking Path:" + urlOutputFileName); return localFileSystem.exists(urlOutputFileName); } @Override protected long executeRemote( FileSystem fileSyste, Configuration conf, EventLoop eventLoop, SlaveDatabaseIndex instanceIndex, File tempFirDir, QueryProgressCallback<InlinksByDomainQueryInfo, FlexBuffer,URLFPV2> progressCallback) throws IOException { // TODO Auto-generated method stub return 0; } @Override public void getCachedResults( FileSystem fileSystem, Configuration conf, EventLoop eventLoop, MasterDatabaseIndex masterIndex, QueryRequest<InlinksByDomainQueryInfo, FlexBuffer,URLFPV2> theClientRequest, QueryCompletionCallback<InlinksByDomainQueryInfo, FlexBuffer,URLFPV2> callback) throws IOException { FileSystem localFileSystem = FileSystem.getLocal(conf); Path outputFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest)+"DATA"); //LOG.info("Initializing index reader for outputFile:" + outputFileName); Path indexFileName = PositionBasedSequenceFileIndex.getIndexNameFromBaseName(outputFileName); //LOG.info("Index FileName is:" + indexFileName); PositionBasedSequenceFileIndex<FlexBuffer,URLFPV2> index = new PositionBasedSequenceFileIndex<FlexBuffer,URLFPV2>(localFileSystem,indexFileName,FlexBuffer.class,URLFPV2.class); QueryResult<FlexBuffer,URLFPV2> resultOut = new QueryResult<FlexBuffer,URLFPV2>(); LOG.info("getCachedResults called for Query:" + getQueryId() +" Calling ReadPaginationResults"); index.readPaginatedResults(localFileSystem, conf, theClientRequest.getClientQueryInfo().getSortOrder(), theClientRequest.getClientQueryInfo().getPaginationOffset(), theClientRequest.getClientQueryInfo().getPageSize(), resultOut); LOG.info("getCachedResults called for Query:" + getQueryId() +". Initiating getCachedResults Callback"); callback.queryComplete(theClientRequest,resultOut); } @Override public String getCanonicalId() { return encodePatternAsFilename("ILBD:" + getQueryData().getDomainName()); } @Override public boolean requiresRemoteDispatch( FileSystem fileSystem, Configuration conf, ShardMapper shardMapper, QueryRequest<InlinksByDomainQueryInfo, FlexBuffer,URLFPV2> theClientRequest, ArrayList<ShardIndexHostNameTuple> shardIdToHostNameMapping) throws IOException { return false; } @Override protected long executeLocal( FileSystem fileSystem, Configuration conf, MasterDatabaseIndex index, EventLoop eventLoop, File tempFirDir, QueryRequest<InlinksByDomainQueryInfo, FlexBuffer,URLFPV2> requestObject) throws IOException { LocalFileSystem localFS = FileSystem.getLocal(conf); Path localURLListPath = new Path(getLocalQueryResultsPathPrefix(requestObject)+"DATA"); Path localURLListIndexPath = new Path(getLocalQueryResultsPathPrefix(requestObject)+"DATA.index"); LOG.info("executeLocal called. Domain:" + getQueryData().getDomainName() + " cacheFilename:" + localURLListPath); localFS.delete(localURLListPath,false); localFS.delete(localURLListIndexPath,false); String queryDomain = getQueryData().getDomainName(); if (queryDomain.length() != 0) { String url = "http://" + queryDomain + "/"; URLFPV2 fp = URLUtils.getURLFPV2FromURL(url); if (fp != null) { return index.collectAllTopLevelDomainRecordsByDomain(fileSystem, conf, fp.getRootDomainHash(), localFS, localURLListPath); } } throw new IOException("Invalid Domain Name:" + queryDomain); } }