/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 **/
package org.commoncrawl.service.queryserver.index;

import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.SequenceFile.ValueBytes;
import org.apache.hadoop.io.file.tfile.TFile;
import org.apache.log4j.BasicConfigurator;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.pipelineV1.MetadataIndexBuilderV2;
import org.commoncrawl.mapred.pipelineV1.InverseLinksByDomainDBBuilder.ComplexKeyComparator;
import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter;
import org.commoncrawl.hadoop.mergeutils.RawKeyValueComparator;
import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter;
import org.commoncrawl.protocol.CrawlDatumAndMetadata;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.protocol.SubDomainMetadata;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.service.queryserver.ShardIndexHostNameTuple;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2.MasterDatabaseIndex.MetadataOut;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.CompressURLListV2;
import org.commoncrawl.util.CompressedURLFPListV2;
import org.commoncrawl.util.CrawlDatum;
import org.commoncrawl.util.FileUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.NodeAffinityMaskBuilder;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.Tuples.TriTextBytesTuple;
import org.commoncrawl.util.URLUtils.URLFPV2RawComparator;

import com.google.common.collect.TreeMultimap;

public class DatabaseIndexV2 {

  private static final Log LOG = LogFactory.getLog(DatabaseIndexV2.class);

  public static class MasterDatabaseIndex {

    Configuration _conf;
    FileSystem _remoteFS;
    int _driveCount;
    long _databaseTimestamp;

    ArrayList<ShardIndexHostNameTuple> _urlfpIndex_ShardMap = null;
    ArrayList<ShardIndexHostNameTuple> _stringToSubDomainIndex_ShardMap = null;

    public static String INDEX_NAME_URLFPV2 = "URLFP_INDEX";
    public static String INDEX_NAME_DOMAIN_NAME_TO_METADATA = "DOMAIN_NAME_TO_METADATA";
    public static String INDEX_NAME_DOMAIN_ID_TO_METADATA = "DOMAIN_ID_TO_METADATA";
    public static String INDEX_NAME_DOMAIN_ID_TO_URLLIST_SORTED_BY_NAME = "DOMAIN_ID_TO_URLLIST_BY_NAME";
    public static String INDEX_NAME_DOMAIN_ID_TO_URLLIST_SORTED_BY_PR = "DOMAIN_ID_TO_URLLIST_BY_PR";
    public static String INDEX_NAME_OUTLINK_DATA = "OUTLINK_DATA";
    public static String INDEX_NAME_INLINK_DATA = "INLINK_DATA";

    private Map<String, ArrayList<ShardIndexHostNameTuple>> _indexToShardMapping = new TreeMap<String, ArrayList<ShardIndexHostNameTuple>>();

    public MasterDatabaseIndex(Configuration conf, FileSystem remoteFS, int driveCount,
        long databaseTimestamp, Set<String> slavesList) throws IOException {

      _conf = conf;
      _remoteFS = remoteFS;
      _driveCount = driveCount;
      _databaseTimestamp = databaseTimestamp;

      // ok populate affinity map indexes
      _indexToShardMapping.put(INDEX_NAME_URLFPV2,
          buildShardMapFromAffinityMapGivenRootPath(new Path(
              "crawl/linkdb/merged" + _databaseTimestamp + "/linkMetadata"), slavesList));
      _indexToShardMapping.put(INDEX_NAME_OUTLINK_DATA,
          buildShardMapFromAffinityMapGivenRootPath(new Path(
              "crawl/linkdb/merged" + _databaseTimestamp + "/linkData"), slavesList));
      _indexToShardMapping.put(INDEX_NAME_INLINK_DATA,
          buildShardMapFromAffinityMapGivenRootPath(new Path(
              "crawl/inverse_linkdb/merged" + _databaseTimestamp + "/linkData"), slavesList));
      _indexToShardMapping.put(INDEX_NAME_DOMAIN_NAME_TO_METADATA,
          buildShardMapFromAffinityMapGivenRootPath(new Path(
              "crawl/metadatadb/" + _databaseTimestamp + "/subDomainMetadata/"
                  + MetadataIndexBuilderV2.SUBDOMAIN_INDEX_NAME_TO_METADATA), slavesList));
      _indexToShardMapping.put(INDEX_NAME_DOMAIN_ID_TO_METADATA,
          buildShardMapFromAffinityMapGivenRootPath(new Path(
              "crawl/metadatadb/" + _databaseTimestamp + "/subDomainMetadata/"
                  + MetadataIndexBuilderV2.SUBDOMAIN_INDEX_ID_TO_METADATA), slavesList));
      _indexToShardMapping.put(INDEX_NAME_DOMAIN_ID_TO_URLLIST_SORTED_BY_NAME,
          buildShardMapFromAffinityMapGivenRootPath(new Path(
              "crawl/querydb/db/" + _databaseTimestamp + "/indexedByURL"), slavesList));
      _indexToShardMapping.put(INDEX_NAME_DOMAIN_ID_TO_URLLIST_SORTED_BY_PR,
          buildShardMapFromAffinityMapGivenRootPath(new Path(
              "crawl/querydb/db/" + _databaseTimestamp + "/indexedByPR"), slavesList));
    }

    static ThreadLocal<NumberFormat> _numberFormat = new ThreadLocal<NumberFormat>() {
      protected NumberFormat initialValue() {
        NumberFormat formatOut = NumberFormat.getInstance();
        formatOut.setMinimumIntegerDigits(5);
        formatOut.setGroupingUsed(false);
        return formatOut;
      };
    };

    /**
     * return a shardId-to-host mapping given an index name
     *
     * @param indexName
     * @return
     * @throws IOException
     */
    public final ArrayList<ShardIndexHostNameTuple> mapShardIdsForIndex(String indexName)
        throws IOException {
      return _indexToShardMapping.get(indexName);
    }

    public final TextBytes queryURLGivenURLFP(URLFPV2 fingerprint) throws IOException {
      // establish partition id
      int paritionId = (fingerprint.hashCode() & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS;
      // establish drive index ...
      int driveIndex = paritionId % _driveCount;
      // establish path ..
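      // NOTE: shard index files are expected on local disk; the fingerprint's
      // shard id selects the part file, and shard id modulo the drive count
      // selects which /data/<driveIndex> volume the file lives on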
Path indexPath = new Path("/data/" + driveIndex + "/urldb/" + _databaseTimestamp + "/part-" + _numberFormat.get().format(paritionId) + ".index"); CompressURLListV2.Index.IndexFile index = new CompressURLListV2.Index.IndexFile( new File(indexPath.toString())); return index.mapURLFPToURL(fingerprint, null); } public static class MetadataOut { // the url component public TextBytes url = new TextBytes(); // the optimized data components // page rank public float pageRank = 0.0f; // fetch status public byte fetchStatus = -1; // protocol status public byte protocolStatus = -1; // fetch time public long lastFetchTime = -1; // and finally, optionally, the datum and metadata structure public TextBytes datumAndMetadataBytes = new TextBytes(); } public final MetadataOut queryMetadataAndURLGivenFP(URLFPV2 fingerprint) throws IOException { FileSystem localFileSystem = FileSystem.getLocal(_conf); DataOutputBuffer keyData = new DataOutputBuffer(); keyData.writeLong(fingerprint.getDomainHash()); keyData.writeLong(fingerprint.getUrlHash()); // establish parition id int paritionId = (fingerprint.hashCode() & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS; // estalish drive index ... int driveIndex = paritionId % _driveCount; // establish path .. Path indexPath = new Path("/data/" + driveIndex + "/metadata/" + _databaseTimestamp + "/part-" + _numberFormat.get().format(paritionId) + ".index"); CompressURLListV2.Index.IndexFile index = new CompressURLListV2.Index.IndexFile( new File(indexPath.toString())); TextBytes dataOut = index.mapURLFPToURL(fingerprint, null); if (dataOut != null) { DataInputBuffer readerStream = new DataInputBuffer(); readerStream.reset(dataOut.getBytes(), dataOut.getOffset(), dataOut .getLength()); MetadataOut metadataOut = new MetadataOut(); // LOG.info("**Data Length:" + dataOut.getLength()); int urlLength = WritableUtils.readVInt(readerStream); // LOG.info("**URL BYTES Length:" + urlLength + " ReaderPos:" + // readerStream.getPosition()); // set text bytes metadataOut.url.set(dataOut.getBytes(), readerStream.getPosition(), urlLength); // advance past url bytes. readerStream.skip(urlLength); int otherBytes = WritableUtils.readVInt(readerStream); // LOG.info("**OTHER BYTES Length:" + otherBytes); // ok see if other bytes is valid if (otherBytes != 0) { // ok read optimized data metadataOut.pageRank = readerStream.readFloat(); metadataOut.fetchStatus = readerStream.readByte(); metadataOut.protocolStatus = readerStream.readByte(); metadataOut.lastFetchTime = readerStream.readLong(); } // ok read in metadata length int metadataBytes = WritableUtils.readVInt(readerStream); // LOG.info("**METADATA BYTES Length:" + metadataBytes); // IFF metadata is presnet and a full read is requested ... if (metadataBytes != 0) { // read metadata metadataOut.datumAndMetadataBytes.set(dataOut.getBytes(), readerStream.getPosition(), metadataBytes); } return metadataOut; } return null; } public void bulkQueryURLAndMetadataGivenInputStream(FileSystem remoteFS, Configuration conf, File tempFileDir, FlexBuffer linkDataBuffer, MergeSortSpillWriter<TextBytes, TriTextBytesTuple> merger) throws IOException { FileSystem localFS = FileSystem.getLocal(conf); // delete contents for temp dir localFS.delete(new Path(tempFileDir.getAbsolutePath()), true); // make it again tempFileDir.mkdir(); // read link data input stream and populate map // create a multimap ... sort by shard id ... TreeMultimap<Integer, URLFPV2> shardedFingerprintList = TreeMultimap .create(); // initialize stream ... 
      DataInputBuffer linkDataInputStream = new DataInputBuffer();
      linkDataInputStream.reset(linkDataBuffer.get(), linkDataBuffer.getOffset(),
          linkDataBuffer.getCount());

      // initialize fingerprint list reader
      CompressedURLFPListV2.Reader fplistReader = new CompressedURLFPListV2.Reader(
          linkDataInputStream);

      try {
        // walk fingerprints
        while (fplistReader.hasNext()) {
          // read next fingerprint
          URLFPV2 nextFP = fplistReader.next();
          // ok compute shard index
          int shardId = ((nextFP.hashCode() & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS);
          // ok add it to multimap based on shard id
          shardedFingerprintList.put(shardId, nextFP);
        }
      } finally {
        // close reader
        fplistReader.close();
      }

      // ok now walk fingerprints sorted by shard id
      LOG.info("Walking fingerprints based on shard id ");

      // walk shard entries one at a time
      for (int shardId : shardedFingerprintList.keySet()) {
        // get fingerprints specific to this shard ...
        SortedSet<URLFPV2> fingerprintsForShard = shardedFingerprintList.get(shardId);
        // read url count for shard
        int urlCount = fingerprintsForShard.size();

        LOG.info("Shard Id:" + shardId + " URLCount:" + urlCount);

        // open up stream
        // establish drive index ...
        int driveIndex = shardId % _driveCount;
        // establish path ..
        Path indexPath = new Path("/data/" + driveIndex + "/metadata/" + _databaseTimestamp
            + "/part-" + _numberFormat.get().format(shardId) + ".index");

        DataInputBuffer inputStream = new DataInputBuffer();

        CompressURLListV2.Index.IndexFile index = new CompressURLListV2.Index.IndexFile(
            new File(indexPath.toString()));
        CompressURLListV2.IndexCursor cursor = new CompressURLListV2.IndexCursor();

        TextBytes urlValueOut = new TextBytes();

        for (URLFPV2 fingerprint : fingerprintsForShard) {
          TextBytes dataOut = index.mapURLFPToURL(fingerprint, cursor);
          if (dataOut != null) {
            inputStream.reset(dataOut.getBytes(), dataOut.getOffset(), dataOut.getLength());
            // data is a tri-text byte tuple object ...
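            // (first value: URL bytes, second value: packed metadata beginning with
            // the page rank float, third value: serialized CrawlDatumAndMetadata;
            // see the comparator in spillLinkDataIntoTempFileIndex and the reader
            // in main() below)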
            TriTextBytesTuple tuple = new TriTextBytesTuple();
            // read tuple
            tuple.readFields(inputStream);
            // transfer url to key value
            urlValueOut.set(tuple.getFirstValue().getBytes(), 0, tuple.getFirstValue().getLength());
            // reset tuple's url value
            tuple.getFirstValue().clear();
            // and spill it to merger
            merger.spillRecord(urlValueOut, tuple);
          } else {
            LOG.error("Failed to retrieve Metadata for Shard:" + shardId + " FP:"
                + fingerprint.getUrlHash());
          }
        }
      }
    }

    /**
     * query subdomain metadata given domain id
     *
     * @param domainId
     * @return
     * @throws IOException
     */
    public SubDomainMetadata queryDomainMetadataGivenDomainId(long domainId) throws IOException {
      // figure out shard id based on key
      int shardId = (((int) domainId) & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS;
      // write key to buffer
      DataOutputBuffer keyBuffer = new DataOutputBuffer();
      keyBuffer.writeLong(domainId);

      FlexBuffer key = queryDomainMetadataKeyAndIndex(keyBuffer,
          MetadataIndexBuilderV2.SUBDOMAIN_INDEX_ID_TO_METADATA, shardId);

      if (key != null) {
        DataInputBuffer inputStream = new DataInputBuffer();
        inputStream.reset(key.get(), 0, key.getCount());

        SubDomainMetadata metadataOut = new SubDomainMetadata();
        metadataOut.readFields(inputStream);

        return metadataOut;
      }
      return null;
    }

    /**
     * query subdomain metadata given domain name
     *
     * @param domainName
     * @return
     * @throws IOException
     */
    public SubDomainMetadata queryDomainMetadataGivenDomainName(String domainName)
        throws IOException {
      long domainID = queryDomainIdGivenDomain(domainName);
      return queryDomainMetadataGivenDomainId(domainID);
    }

    private FlexBuffer queryDomainMetadataKeyAndIndex(DataOutputBuffer keyData, String indexName,
        int shardId) throws IOException {

      FileSystem localFS = FileSystem.getLocal(_conf);

      // figure out shard id ...
      // int shardId = (((int)domainId) & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS;
      // figure out drive index ...
      int driveIndex = shardId % _driveCount;

      // construct index path ..
Path filePath = new Path("/data/" + driveIndex + "/subDomain_" + indexName + "/" + _databaseTimestamp + "/part-" + _numberFormat.get().format(shardId)); // open file FSDataInputStream inputStream = localFS.open(filePath); try { TFile.Reader reader = new TFile.Reader(inputStream, localFS .getFileStatus(filePath).getLen(), _conf); try { // scanner TFile.Reader.Scanner scanner = reader.createScanner(); try { // seek to key if (scanner.seekTo(keyData.getData(), 0, keyData.getLength())) { BytesWritable dataOut = new BytesWritable(); // ok return raw data scanner.entry().getValue(dataOut); // and return it return new FlexBuffer(dataOut.getBytes(), 0, dataOut.getLength()); } } finally { scanner.close(); } } finally { reader.close(); } } finally { inputStream.close(); } return null; } public final URLFPV2 queryFPGivenURL(String url) throws IOException { URLFPV2 fp = URLUtils.getURLFPV2FromURL(url); if (fp == null) { throw new IOException("Malformed URL Exception"); } return fp; } public final int queryShardIdGivenFP(URLFPV2 fingerprint) { return (fingerprint.hashCode() & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS; } public final long queryDomainIdGivenDomain(String domain) throws IOException { String domainHack = "http://" + domain; URLFPV2 hackFP = URLUtils.getURLFPV2FromURL(domainHack); if (hackFP == null) { throw new IOException("Malformed Domain Name Exception"); } else { return hackFP.getDomainHash(); } } public final long queryDomainShardIdGivenDomain(long domain) throws IOException { return (((int) domain) & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS; } private ArrayList<ShardIndexHostNameTuple> buildShardMapFromAffinityMapGivenRootPath( Path rootPath,Set<String> optionalSlavesList) throws IOException { // Path linkDBPath = new Path("crawl/linkdb/merged" + _databaseTimestamp + // "/linkMetadata"); String affinityMapStr = NodeAffinityMaskBuilder.buildNodeAffinityMask( _remoteFS, rootPath, null); if (affinityMapStr == null) { throw new IOException("Unable to create node affinity mask for path:" + rootPath); } else { Map<Integer, String> affinityMap = NodeAffinityMaskBuilder .parseAffinityMask(affinityMapStr); // ok if a slaves list is supplied ... if (optionalSlavesList != null) { HashSet<String> excludedNodes = new HashSet<String>(); for (Map.Entry<Integer, String> entry : affinityMap.entrySet()) { if (!optionalSlavesList.contains(entry.getValue())) { LOG.warn("Slave:" + entry.getValue() + " for parition:" + entry.getKey() + " not available!"); excludedNodes.add(entry.getValue()); } } // now if exclusion set is not empty, this means the affinity map // contains nodes that are not available ... 
          // rebuild it again with an exclusion list
          if (excludedNodes.size() != 0) {
            LOG.warn("Affinity map will be rebuilt with excluded nodes:" + excludedNodes.toString());
            affinityMapStr = NodeAffinityMaskBuilder.buildNodeAffinityMask(_remoteFS, rootPath,
                null, excludedNodes);
            if (affinityMapStr == null) {
              throw new IOException("Unable to create node affinity mask for path:" + rootPath);
            } else {
              affinityMap = NodeAffinityMaskBuilder.parseAffinityMask(affinityMapStr);
            }
          }
        }

        // ok build host name to shard id tuple
        ArrayList<ShardIndexHostNameTuple> tupleListOut = new ArrayList<ShardIndexHostNameTuple>();

        // ok now build an actual affinity map
        for (Map.Entry<Integer, String> entry : affinityMap.entrySet()) {
          // create a tuple
          ShardIndexHostNameTuple tuple = new ShardIndexHostNameTuple();

          String hostName = entry.getValue();
          // strip everything except leading qualifier
          int indexOfDot = hostName.indexOf('.');
          if (indexOfDot != -1) {
            hostName = hostName.substring(0, indexOfDot);
          }
          tuple.setHostName(hostName);
          tuple.setShardId(entry.getKey());

          tupleListOut.add(tuple);
        }
        return tupleListOut;
      }
    }

    private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
    static {
      NUMBER_FORMAT.setMinimumIntegerDigits(5);
      NUMBER_FORMAT.setGroupingUsed(false);
    }

    static Map<Integer, PositionBasedSequenceFileIndex> _shardToInverseDomainQueryIndexMap = new TreeMap<Integer, PositionBasedSequenceFileIndex>();

    public long collectAllTopLevelDomainRecordsByDomain(FileSystem fs, Configuration conf,
        long targetRootDomainFP, FileSystem outputFileSystem, Path finalOutputPath)
        throws IOException {

      File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis());
      tempFile.mkdir();

      long recordCount = 0;

      try {
        LOG.info("Opening SpillWriter @:" + finalOutputPath + " using FileSystem:"
            + outputFileSystem.toString());

        // create the final output spill writer ...
        SequenceFileSpillWriter<FlexBuffer, URLFPV2> spillwriter = new SequenceFileSpillWriter<FlexBuffer, URLFPV2>(
            outputFileSystem, conf, finalOutputPath, FlexBuffer.class, URLFPV2.class,
            new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem,
                PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath)), true);

        try {
          MergeSortSpillWriter<FlexBuffer, URLFPV2> finalMerger = new MergeSortSpillWriter<FlexBuffer, URLFPV2>(
              conf, spillwriter, FileSystem.getLocal(conf),
              new Path(tempFile.getAbsolutePath()), null, new ComplexKeyComparator(),
              FlexBuffer.class, URLFPV2.class, true, null);

          try {
            for (int targetShardId = 0; targetShardId < CrawlEnvironment.NUM_DB_SHARDS; ++targetShardId) {
              // 0. shard domain id to find index file location ...
              int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS);
              // build path to index file
              Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + _databaseTimestamp
                  + "/phase3Data/part-" + NUMBER_FORMAT.format(indexShardId));

              LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId
                  + " Index Path:" + indexFilePath);

              // 1. scan domainFP to index file first
              // 2. given index, scan index->pos file to find scan start position
              // 3. given scan start position, scan forward until fp match is found.
              // 4. collect all matching entries and output to a file ?
              FSDataInputStream indexDataInputStream = fs.open(indexFilePath);
              try {
                TFile.Reader reader = new TFile.Reader(indexDataInputStream,
                    fs.getFileStatus(indexFilePath).getLen(), conf);
                try {
                  TFile.Reader.Scanner scanner = reader.createScanner();
                  try {
                    // generate key ...
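                    // the phase3 TFile is keyed by the 8-byte root domain fingerprint;
                    // its value is a stream of (shard id, data offset) int pairs that
                    // locate the domain's records within each phase2 data shard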
                    DataOutputBuffer keyBuffer = new DataOutputBuffer();
                    keyBuffer.writeLong(targetRootDomainFP);

                    if (scanner.seekTo(keyBuffer.getData(), 0, keyBuffer.getLength())) {
                      // setup for value scan
                      DataInputStream valueStream = scanner.entry().getValueStream();

                      int dataOffsetOut = -1;
                      while (valueStream.available() > 0) {
                        // read entries looking for our specific entry
                        int shardIdx = valueStream.readInt();
                        int dataOffset = valueStream.readInt();
                        if (shardIdx == targetShardId) {
                          dataOffsetOut = dataOffset;
                          break;
                        }
                      }

                      LOG.info("Index Search Yielded:" + dataOffsetOut);

                      if (dataOffsetOut != -1) {
                        // ok create a data path
                        Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/"
                            + _databaseTimestamp + "/phase2Data/data-"
                            + NUMBER_FORMAT.format(targetShardId));
                        Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/"
                            + _databaseTimestamp + "/phase2Data/data-"
                            + NUMBER_FORMAT.format(targetShardId) + ".index");

                        // check to see if index is already loaded ...
                        PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null;
                        synchronized (_shardToInverseDomainQueryIndexMap) {
                          index = _shardToInverseDomainQueryIndexMap.get(targetShardId);
                        }

                        if (index == null) {
                          LOG.info("Loading Index from Path:" + finalDataIndexPath);
                          // load index
                          index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>(fs,
                              finalDataIndexPath, FlexBuffer.class, TextBytes.class);
                          // put in cache
                          synchronized (_shardToInverseDomainQueryIndexMap) {
                            _shardToInverseDomainQueryIndexMap.put(targetShardId, index);
                          }
                        }

                        LOG.info("Initializing Data Reader at Path:" + finalDataPath);
                        // ok time to create a reader
                        SequenceFile.Reader dataReader = new SequenceFile.Reader(fs, finalDataPath, conf);

                        try {
                          LOG.info("Seeking Reader to Index Position:" + dataOffsetOut);
                          index.seekReaderToItemAtIndex(dataReader, dataOffsetOut);

                          FlexBuffer keyBytes = new FlexBuffer();
                          URLFPV2 sourceFP = new URLFPV2();
                          DataInputBuffer keyReader = new DataInputBuffer();

                          // ok ready to go ...
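                          // phase2 records appear to be keyed and ordered by the target
                          // domain fingerprint (the leading long of each key), so the
                          // loop below stops at the first non-matching fingerprint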
                          while (dataReader.next(keyBytes, sourceFP)) {
                            // initialize reader
                            keyReader.reset(keyBytes.get(), keyBytes.getOffset(), keyBytes.getCount());
                            long targetFP = keyReader.readLong();

                            if (targetRootDomainFP == targetFP) {
                              finalMerger.spillRecord(keyBytes, sourceFP);
                              ++recordCount;
                            } else {
                              LOG.info("FP:" + targetFP + " > TargetFP:" + targetRootDomainFP
                                  + " Exiting Iteration Loop");
                              break;
                            }
                          }
                        } finally {
                          LOG.info("Closing Reader");
                          dataReader.close();
                        }
                      }
                    }
                  } finally {
                    LOG.info("Closing Scanner");
                    scanner.close();
                  }
                } finally {
                  LOG.info("Closing TFile Reader");
                  reader.close();
                }
              } finally {
                LOG.info("Closing InputStream");
                indexDataInputStream.close();
              }
            }
          } finally {
            LOG.info("Closing Final Merger");
            finalMerger.close();
          }
        } finally {
          LOG.info("Closing Final SpillWriter");
          spillwriter.close();
        }
      } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        FileUtils.recursivelyDeleteFile(tempFile);
        throw e;
      }
      return recordCount;
    }
  }

  public static class SlaveDatabaseIndex {

    Configuration _conf;
    FileSystem _fs;
    long _databaseTimestamp;
    int _shardIds[];

    static ThreadLocal<NumberFormat> _numberFormat = new ThreadLocal<NumberFormat>() {
      protected NumberFormat initialValue() {
        NumberFormat formatOut = NumberFormat.getInstance();
        formatOut.setMinimumIntegerDigits(5);
        formatOut.setGroupingUsed(false);
        return formatOut;
      };
    };

    public SlaveDatabaseIndex(Configuration conf, FileSystem remoteFS, long databaseTimestamp)
        throws IOException {
      _conf = conf;
      _fs = remoteFS;
      _databaseTimestamp = databaseTimestamp;
    }

    public FlexBuffer queryURLListSortedByName(long domainFP) throws IOException {
      return queryURLList(domainFP, "indexedByURL");
    }

    public FlexBuffer queryURLListSortedByPR(long domainFP) throws IOException {
      return queryURLList(domainFP, "indexedByPR");
    }

    private static final long MAX_SPILLBUFFER_ITEM_COUNT = (1 << 27) / 8;

    private FlexBuffer queryURLList(long domainFP, String indexName) throws IOException {
      // figure out shard index ...
      int shardIndex = (((int) domainFP) & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS;

      // calculate paths ...
      Path indexPath = new Path("crawl/querydb/db/" + _databaseTimestamp + "/" + indexName
          + "/part-" + _numberFormat.get().format(shardIndex));
      Path indexDataPath = new Path("crawl/querydb/db/" + _databaseTimestamp + "/" + indexName
          + "/IndexData" + _numberFormat.get().format(shardIndex));

      FSDataInputStream indexInputStream = _fs.open(indexPath);
      try {
        FSDataInputStream indexDataInputStream = _fs.open(indexDataPath);
        try {
          TFile.Reader reader = new TFile.Reader(indexInputStream,
              _fs.getFileStatus(indexPath).getLen(), _conf);
          try {
            TFile.Reader.Scanner scanner = reader.createScanner();
            try {
              DataOutputBuffer keyBuffer = new DataOutputBuffer();
              keyBuffer.writeLong(domainFP);
              if (scanner.seekTo(keyBuffer.getData(), 0, keyBuffer.getLength())) {
                // establish data start ..
                long dataPosStart = scanner.entry().getValueStream().readLong();
                // now establish default end pos
                long dataPosEnd = _fs.getFileStatus(indexDataPath).getLen();
                // and if not last index item .. use next item as stop point
                if (scanner.advance()) {
                  dataPosEnd = scanner.entry().getValueStream().readLong();
                }
                // calculate size
                long dataSize = dataPosEnd - dataPosStart;
                long itemCount = dataSize / 8;

                if (itemCount > MAX_SPILLBUFFER_ITEM_COUNT) {
                  LOG.error("itemCount:" + itemCount + " exceeds MAX_SPILLBUFFER_ITEM_COUNT:"
                      + MAX_SPILLBUFFER_ITEM_COUNT + " truncating.");
                  dataSize = MAX_SPILLBUFFER_ITEM_COUNT * 8;
                }
                // all right .. we are ready to spill out the fingerprints ..
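                // the region between dataPosStart and dataPosEnd is a packed array of
                // 8-byte URL hashes, hence the divide-by-8 item count above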
                FlexBuffer bufferOut = new FlexBuffer(new byte[(int) dataSize]);
                // seek to proper location ...
                indexDataInputStream.seek(dataPosStart);
                // and read entire contents ..
                indexDataInputStream.read(bufferOut.get());

                return bufferOut;
              }
            } finally {
              scanner.close();
            }
          } finally {
            reader.close();
          }
        } finally {
          indexDataInputStream.close();
        }
      } finally {
        indexInputStream.close();
      }
      return null;
    }

    public FlexBuffer queryOutlinksByFP(URLFPV2 fingerprint, int shardId, long dataPos)
        throws IOException {
      return queryLinkDataByFP("linkdb", fingerprint, shardId, dataPos);
    }

    public FlexBuffer queryInlinksByFP(URLFPV2 fingerprint, int shardId, long dataPos)
        throws IOException {
      return queryLinkDataByFP("inverse_linkdb", fingerprint, shardId, dataPos);
    }

    private FlexBuffer queryLinkDataByFP(String indexName, URLFPV2 fingerprint, int shardId,
        long dataPos) throws IOException {

      // write out incoming fingerprint in a buffer
      DataOutputBuffer queryBuffer = new DataOutputBuffer();
      fingerprint.write(queryBuffer);

      URLFPV2RawComparator comparator = new URLFPV2RawComparator();

      // establish path ...
      Path linkDataPath = new Path("crawl/" + indexName + "/merged" + _databaseTimestamp
          + "/linkData/part-" + _numberFormat.get().format(shardId));

      LOG.info("linkDataPath:" + linkDataPath);

      for (int pass = 0; pass < 2; ++pass) {
        // open file
        SequenceFile.Reader reader = new SequenceFile.Reader(_fs, linkDataPath, _conf);
        try {
          if (pass == 1) {
            reader.sync(dataPos);
            LOG.info("New Position is:" + reader.getPosition());
          } else {
            reader.seek(dataPos);
          }
          LOG.info("Seek DataPos:" + dataPos + " NewPos:" + reader.getPosition());

          boolean eos = false;
          DataOutputBuffer keyBuffer = new DataOutputBuffer();
          ValueBytes valueBytes = reader.createValueBytes();
          int itemsRead = 0;
          boolean doSecondPass = false;

          while (!eos) {
            keyBuffer.reset();
            if (reader.nextRaw(keyBuffer, valueBytes) == -1) {
              LOG.error("Next Raw Failed");
              eos = true;
            } else {
              itemsRead++;

              // URLFPV2 currentKeyDbg = new URLFPV2();
              // DataInputBuffer inputTemp = new DataInputBuffer();
              // inputTemp.reset(keyBuffer.getData(), 0, keyBuffer.getLength());
              // currentKeyDbg.readFields(inputTemp);

              // ok compare fingerprints ...
              int result = comparator.compare(queryBuffer.getData(), 0, queryBuffer.getLength(),
                  keyBuffer.getData(), 0, keyBuffer.getLength());

              // LOG.info("***SEARCHING FOR DomainFP:" + fingerprint.getDomainHash() + " URLFP:"
              //     + fingerprint.getUrlHash() + " Got: DomainFP:" + currentKeyDbg.getDomainHash()
              //     + " URLFP:" + currentKeyDbg.getUrlHash() + " CompareResult:" + result);

              if (result == 0) {
                // ok match found !!!
                DataOutputBuffer valueDataOut = new DataOutputBuffer();
                valueBytes.writeUncompressedBytes(valueDataOut);
                // skip the first four bytes length field of the container byteswritable
                return new FlexBuffer(valueDataOut.getData(), 4, valueDataOut.getLength() - 4);
              } else if (result == -1) {
                DataInputBuffer inputStream = new DataInputBuffer();
                inputStream.reset(keyBuffer.getData(), 0, keyBuffer.getLength());
                URLFPV2 otherFP = new URLFPV2();
                otherFP.readFields(inputStream);
ItemsRead:" + itemsRead + " QueryDH:" + fingerprint.getDomainHash() + " FP:" + fingerprint.getUrlHash() + " lastDH:" + otherFP.getDomainHash() + " FP:" + otherFP.getUrlHash()); if (itemsRead == 1) { if (dataPos != 0) { dataPos = Math.max(0, dataPos - _fs.getFileStatus(linkDataPath).getBlockSize()); LOG.info("Retrying with new data pos of:" + dataPos); doSecondPass = true; } } eos = true; } } } if (!doSecondPass) { break; } else { LOG.info("*** Doing Second Pass with BlockPos:" + dataPos); } } finally { reader.close(); } } return null; } @SuppressWarnings("unchecked") public long queryDomainsGivenPattern(String searchPattern, int shardId, SequenceFileSpillWriter<Text, SubDomainMetadata> spillWriter) throws IOException { Path metadataDBPath = new Path("crawl/metadatadb/" + _databaseTimestamp + "/subDomainMetadata/" + MetadataIndexBuilderV2.SUBDOMAIN_INDEX_NAME_TO_METADATA + "/part-" + _numberFormat.get().format(shardId)); Pattern patternObj = Pattern.compile(searchPattern); FSDataInputStream inputStream = _fs.open(metadataDBPath); long indexLength = _fs.getFileStatus(metadataDBPath).getLen(); long recordCount = 0; try { TFile.Reader reader = new TFile.Reader(inputStream, indexLength, _conf); try { TFile.Reader.Scanner scanner = reader.createScanner(); try { BytesWritable keyBytes = new BytesWritable(); DataInputBuffer keyStream = new DataInputBuffer(); TextBytes textBytes = new TextBytes(); while (!scanner.atEnd()) { // get key bytes ... scanner.entry().getKey(keyBytes); // reset stream keyStream.reset(keyBytes.getBytes(), keyBytes.getLength()); // read text bytes length int textBytesLength = WritableUtils.readVInt(keyStream); // initialize text bytes to remaining bytes textBytes.set(keyBytes.getBytes(), keyStream.getPosition(), keyStream .getLength() - keyStream.getPosition()); // decode ... String domainName = textBytes.toString(); // match if (patternObj.matcher(domainName).matches()) { // IFF MATCH, GET VALUE BYTES BytesWritable valueBytes = new BytesWritable(); scanner.entry().getValue(valueBytes); // SPILL spillWriter.spillRawRecord(keyBytes.getBytes(), 0, keyBytes .getLength(), valueBytes.getBytes(), 0, valueBytes.getLength()); // increment record count recordCount++; } scanner.advance(); } } finally { scanner.close(); } } finally { reader.close(); } } finally { inputStream.close(); } return recordCount; } } private static void spillLinkDataIntoTempFileIndex( FileSystem remoteFileSystem, FileSystem localFileSystem, Configuration conf, DatabaseIndexV2.MasterDatabaseIndex index, File tempFilePath, Path outputFilePath, FlexBuffer linkData) throws IOException { SequenceFileSpillWriter<TextBytes, TriTextBytesTuple> outputWriter = new SequenceFileSpillWriter<TextBytes, TriTextBytesTuple>( localFileSystem, conf, outputFilePath, TextBytes.class, TriTextBytesTuple.class, new PositionBasedSequenceFileIndex.PositionBasedIndexWriter( localFileSystem, PositionBasedSequenceFileIndex .getIndexNameFromBaseName(outputFilePath)), true); try { // ok create merge sort spill writer ... 
      MergeSortSpillWriter<TextBytes, TriTextBytesTuple> merger = new MergeSortSpillWriter<TextBytes, TriTextBytesTuple>(
          conf, outputWriter, localFileSystem, new Path(tempFilePath.getAbsolutePath()), null,
          new RawKeyValueComparator<TextBytes, TriTextBytesTuple>() {

            DataInputBuffer stream1 = new DataInputBuffer();
            DataInputBuffer stream2 = new DataInputBuffer();
            TriTextBytesTuple tuple1 = new TriTextBytesTuple();
            TriTextBytesTuple tuple2 = new TriTextBytesTuple();

            @Override
            public int compareRaw(byte[] key1Data, int key1Offset, int key1Length,
                byte[] key2Data, int key2Offset, int key2Length, byte[] value1Data,
                int value1Offset, int value1Length, byte[] value2Data, int value2Offset,
                int value2Length) throws IOException {

              stream1.reset(value1Data, value1Offset, value1Length);
              stream2.reset(value2Data, value2Offset, value2Length);

              // ok skip url
              int url1Length = WritableUtils.readVInt(stream1);
              stream1.skip(url1Length);
              int url2Length = WritableUtils.readVInt(stream2);
              stream2.skip(url2Length);

              // ok now read optimized page rank stuffed in second tuple
              WritableUtils.readVInt(stream1);
              WritableUtils.readVInt(stream2);

              // now read page rank
              float pageRank1 = stream1.readFloat();
              float pageRank2 = stream2.readFloat();

              return (pageRank1 == pageRank2) ? 0 : (pageRank1 < pageRank2) ? -1 : 1;
            }

            @Override
            public int compare(TextBytes key1, TriTextBytesTuple value1, TextBytes key2,
                TriTextBytesTuple value2) {
              stream1.reset(value1.getSecondValue().getBytes(), value1.getSecondValue().getLength());
              stream2.reset(value2.getSecondValue().getBytes(), value2.getSecondValue().getLength());
              try {
                float pr1 = stream1.readFloat();
                float pr2 = stream2.readFloat();
                return (pr1 == pr2) ? 0 : pr1 < pr2 ? -1 : 1;
              } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
                throw new RuntimeException();
              }
            }
          }, TextBytes.class, TriTextBytesTuple.class, false, null);

      try {
        long timeStart = System.currentTimeMillis();
        System.out.println("Running Merger to resolve tuple set");
        index.bulkQueryURLAndMetadataGivenInputStream(remoteFileSystem, conf, tempFilePath,
            linkData, merger);
        long timeEnd = System.currentTimeMillis();
        LOG.info("Merged Successfully in:" + (timeEnd - timeStart));
      } finally {
        LOG.info("Closing Merger");
        merger.close();
      }
    } finally {
      LOG.info("Closing Writer");
      outputWriter.close();
    }
  }

  public static void main(String[] args) {

    if (args.length != 3) {
      LOG.error("args: [candidate Timestamp] [drive count] [query string]");
      return;
    }

    // initialize ...
    final Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("core-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("mapred-site.xml");

    BasicConfigurator.configure();
    CrawlEnvironment.setHadoopConfig(conf);

    long candidateTS = Long.parseLong(args[0]);
    int driveCount = Integer.parseInt(args[1]);
    String queryString = args[2];

    try {
      FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

      MasterDatabaseIndex masterIndex = new MasterDatabaseIndex(conf, fs, driveCount, candidateTS, null);
      SlaveDatabaseIndex slaveIndex = new SlaveDatabaseIndex(conf, fs, candidateTS);

      // ok hit the domain against the master index first ...
LOG.info("Querying master index for DomainId Given DomainName:" + queryString); long domainId = masterIndex.queryDomainIdGivenDomain(queryString); LOG.info("Querying master index for DomainMetadata Given DomainId:" + domainId); SubDomainMetadata subDomainMeta = masterIndex .queryDomainMetadataGivenDomainId(domainId); if (subDomainMeta != null) { LOG.info("Metadata is present. Deserializing"); // dump some fields ... LOG.info("Domain:" + subDomainMeta.getDomainText() + " URLCount:" + subDomainMeta.getUrlCount() + " FetchedCount:" + subDomainMeta.getFetchedCount() + " PageRankCount:" + subDomainMeta.getHasPageRankCount()); // ok time to dive into a url list ... // query for a list of urls sorted by name LOG.info("Querying for URLList for Domain BY PR"); FlexBuffer urlListBufferByPR = slaveIndex .queryURLListSortedByPR(domainId); if (urlListBufferByPR != null) { // read the list ... DataInputBuffer readerStream = new DataInputBuffer(); readerStream.reset(urlListBufferByPR.get(), urlListBufferByPR .getCount()); int totalItemCount = urlListBufferByPR.getCount() / 8; System.out.println("List BY PR totalCount:" + totalItemCount); // initialize a fingerprint object to use for queries ... URLFPV2 queryFP = new URLFPV2(); queryFP.setDomainHash(domainId); DataInputBuffer metadataReaderStream = new DataInputBuffer(); // iterate the first N items ranked by page rank for (int i = 0; i < Math.min(10, totalItemCount); ++i) { queryFP.setUrlHash(readerStream.readLong()); // and for metadata MetadataOut urlMetadata = masterIndex .queryMetadataAndURLGivenFP(queryFP); if (urlMetadata != null) { // decode the url String url = urlMetadata.url.toString(); System.out.println("URL for FP:" + queryFP.getUrlHash() + " is:" + url); if (urlMetadata.datumAndMetadataBytes.getLength() == 0) { System.out.println("URL for FP:" + queryFP.getUrlHash() + " had no METADATA!!"); } else { // explode metadata CrawlDatumAndMetadata metadataObject = new CrawlDatumAndMetadata(); metadataReaderStream.reset(urlMetadata.datumAndMetadataBytes .getBytes(), urlMetadata.datumAndMetadataBytes.getOffset(), urlMetadata.datumAndMetadataBytes.getLength()); metadataObject.readFields(metadataReaderStream); // ok at this point spit out stuff for this url StringBuilder urlInfo = new StringBuilder(); urlInfo.append(" FetchStatus:" + CrawlDatum.getStatusName(metadataObject.getStatus()) + "\n"); urlInfo.append(" PageRank:" + metadataObject.getMetadata().getPageRank() + "\n"); urlInfo.append(" ContentType:" + metadataObject.getMetadata().getContentType() + "\n"); urlInfo.append(" ArcFileInfoCount:" + metadataObject.getMetadata().getArchiveInfo().size()); if (metadataObject.getMetadata().isFieldDirty( CrawlURLMetadata.Field_LINKDBFILENO)) { urlInfo.append(" HasLinkDataInfo:" + metadataObject.getMetadata().getLinkDBFileNo() + ":" + metadataObject.getMetadata().getLinkDBOffset()); } if (metadataObject.getMetadata().isFieldDirty( CrawlURLMetadata.Field_INVERSEDBFILENO)) { urlInfo.append(" HasINVLinkDataInfo:" + metadataObject.getMetadata().getInverseDBFileNo() + ":" + metadataObject.getMetadata().getInverseDBOffset()); } System.out.println(urlInfo.toString()); // now if inverse link data is present .. if (metadataObject.getMetadata().isFieldDirty( CrawlURLMetadata.Field_INVERSEDBFILENO)) { // get it ... 
System.out.println("Querying for Inlinks for FP:" + queryFP.getUrlHash()); FlexBuffer inlinks = slaveIndex.queryInlinksByFP(queryFP, metadataObject.getMetadata().getInverseDBFileNo(), metadataObject.getMetadata().getInverseDBOffset()); if (inlinks != null) { System.out.println("Found Inlink Buffer of Size:" + inlinks.getCount()); FileSystem localFS = FileSystem.getLocal(conf); File testDir = new File("/tmp/dbIndexTest"); File testFile = new File("/tmp/dbIndexTestFile"); localFS.delete(new Path(testDir.getAbsolutePath()), true); localFS.delete(new Path(testFile.getAbsolutePath()), false); localFS.mkdirs(new Path(testDir.getAbsolutePath())); LOG.info("Creating Spill File of Inlinks"); spillLinkDataIntoTempFileIndex(fs, localFS, conf, masterIndex, testDir, new Path(testFile .getAbsolutePath()), inlinks); LOG.info("Created Spill File of Inlinks"); LOG.info("Reading Inlinks"); // ok now open it up and dump the first few inlinks from the // spill file SequenceFile.Reader reader = new SequenceFile.Reader( localFS, new Path(testFile.getAbsolutePath()), conf); TextBytes key = new TextBytes(); TriTextBytesTuple value = new TriTextBytesTuple(); CrawlDatumAndMetadata metadata = new CrawlDatumAndMetadata(); DataInputBuffer inputBuffer = new DataInputBuffer(); try { int itemCount = 0; while (reader.next(key, value)) { if (value.getThirdValue().getLength() != 0) { inputBuffer.reset(value.getThirdValue().getBytes(), 0, value.getThirdValue().getLength()); metadata.readFields(inputBuffer); System.out.println("INLINK:" + key.toString() + " METADATA STATUS:" + CrawlDatum.getStatusName(metadata.getStatus())); } else { System.out.println("INLINK:" + key.toString() + " NOMETADATA"); } if (++itemCount == 500) { break; } } } finally { reader.close(); } LOG.info("Done Reding Inlinks"); } } } } else { LOG.error("Query for FP:" + queryFP.getUrlHash() + " returned NULL URL"); } } } } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } }