package org.commoncrawl.mapred.pipelineV1; import java.io.DataInputStream; import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; import java.text.NumberFormat; import java.util.Collections; import java.util.Iterator; import java.util.Vector; import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.RawComparator; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.file.tfile.TFile; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.lib.IdentityMapper; import org.apache.hadoop.mapred.lib.IdentityReducer; import org.apache.hadoop.mapred.lib.NullOutputFormat; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.protocol.ArchiveInfo; import org.commoncrawl.protocol.CrawlDatumAndMetadata; import org.commoncrawl.protocol.CrawlURLMetadata; import org.commoncrawl.protocol.SubDomainMetadata; import org.commoncrawl.protocol.URLFPV2; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.CompressURLListV2; import org.commoncrawl.util.CrawlDatum; import org.commoncrawl.util.FSByteBufferInputStream; import org.commoncrawl.util.FileUtils; import org.commoncrawl.util.GoogleURL; import org.commoncrawl.util.NodeAffinityMaskBuilder; import org.commoncrawl.util.TextBytes; import org.commoncrawl.util.URLFingerprint; import org.commoncrawl.util.URLUtils; import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader; import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergeInputFormat; import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergePartitioner; import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData; import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue; import org.commoncrawl.util.Tuples.LongTextBytesTuple; import org.commoncrawl.util.Tuples.TriTextBytesTuple; import org.commoncrawl.util.URLUtils.URLFPV2RawComparator; public class MetadataIndexBuilderV2 extends CrawlDBCustomJob { private static final Log LOG = LogFactory.getLog(MetadataIndexBuilderV2.class); @Override public String getJobDescription() { return "Metadata Index Builder"; } public static final String SUBDOMAIN_INDEX_ID_TO_METADATA = "idToMetadata"; public static final String SUBDOMAIN_INDEX_ID_TO_NAME = "idToString"; public static final String SUBDOMAIN_INDEX_NAME_TO_METADATA = "nameToMetadata"; @Override public void runJob() throws IOException { Vector<Long> timestamps = gatherDatabaseTimestamps(new 
Path("crawl/crawldb_new")); if (timestamps.size() >= 2) { long crawlDBTimestamp = timestamps.lastElement(); long candidateTimestamp = timestamps.get(timestamps.size() - 2); LOG.info("Using CrawlDB Candidate:" + crawlDBTimestamp); FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); // build relevant paths ... Path crawlDBPath = new Path("crawl/crawldb_new/" + crawlDBTimestamp); Path crawlDBMetadataTemp = new Path(CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + "/fptoMetadataIndexBuilder-crawlDBMetadata-" + candidateTimestamp); Path phase2OutputDir = new Path(CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + "/fptoMetadataIndexBuilder-consolidatedMetadata-" + candidateTimestamp); Path s3MetadataTemp = new Path(CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + "/s3Metadata-Temp"); fs.mkdirs(new Path("crawl/metadatadb/" + candidateTimestamp + "/urlMetadata")); fs.mkdirs(new Path("crawl/metadatadb/" + candidateTimestamp + "/subDomainMetadata")); Path seedOutputDir = new Path("crawl/metadatadb/" + candidateTimestamp + "/urlMetadata/seed"); Path indexOutputDir = new Path("crawl/metadatadb/" + candidateTimestamp + "/urlMetadata/index"); Path subDomainStatsRaw = new Path("crawl/metadatadb/" + candidateTimestamp + "/subDomainMetadata/raw"); Path subDomainIdToMetadataIndex = new Path("crawl/metadatadb/" + candidateTimestamp + "/subDomainMetadata/" + SUBDOMAIN_INDEX_ID_TO_METADATA); Path subDomainIdToNameIndex = new Path("crawl/metadatadb/" + candidateTimestamp + "/subDomainMetadata/" + SUBDOMAIN_INDEX_ID_TO_NAME); Path subDomainNameToMetadata = new Path("crawl/metadatadb/" + candidateTimestamp + "/subDomainMetadata/" + SUBDOMAIN_INDEX_NAME_TO_METADATA); Path s3Metadata = new Path("crawl/metadatadb/s3Metadata"); // Phase 1: Pull in S3 Info if (!fs.exists(s3Metadata)) { LOG.info("Collecting S3Metadata"); if (buildOldS3ArchiveInfo(candidateTimestamp, crawlDBPath, s3MetadataTemp)) { fs.rename(s3MetadataTemp, s3Metadata); } } // Phase 2: Consolidate Metadata from (PR Database,Link Databaase,Inverse // Link DB, and CrawlDB) if (!fs.exists(indexOutputDir)) { LOG.info("Running Metadata Index Builder"); if (buildConsolidatedMetadataIndex(candidateTimestamp, crawlDBPath, s3Metadata, phase2OutputDir)) { fs.rename(phase2OutputDir, indexOutputDir); // delete subdomain stats dir fs.delete(subDomainStatsRaw, true); // recreate it fs.mkdirs(subDomainStatsRaw); // now move sub domain stats files FileStatus subDomainFiles[] = fs.globStatus(new Path(indexOutputDir, "*.domainMetadata")); for (FileStatus subDomainFile : subDomainFiles) { Path originalLoc = subDomainFile.getPath(); Path newLocation = new Path(subDomainStatsRaw, originalLoc.getName().split("\\.")[0]); LOG.info("Moving: " + originalLoc + " to New Loc:" + newLocation); fs.rename(originalLoc, newLocation); } } } if (!fs.exists(subDomainIdToMetadataIndex)) { LOG.info("Generating Subdomain TFile"); Path subDomainMetadataTemp = new Path(CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + "/subDomainMetadataTFILEGEN-" + candidateTimestamp); if (buildSubDomainIdToMetadataIndex(candidateTimestamp, subDomainStatsRaw, subDomainMetadataTemp)) { fs.delete(subDomainIdToNameIndex, true); LOG.info("Creating subDomainStringIndex file at:" + subDomainIdToNameIndex); fs.mkdirs(subDomainIdToNameIndex); FileStatus stringIndexFiles[] = fs.globStatus(new Path(subDomainMetadataTemp, "strings-part-*")); for (FileStatus stringIndexFile : stringIndexFiles) { LOG.info("Moving:" + stringIndexFile.getPath() + " to:" + 
            fs.rename(stringIndexFile.getPath(),
                new Path(subDomainIdToNameIndex, stringIndexFile.getPath().getName().substring("strings-".length())));
          }
          LOG.info("Moving :" + subDomainMetadataTemp + " to:" + subDomainIdToMetadataIndex);
          fs.rename(subDomainMetadataTemp, subDomainIdToMetadataIndex);
        }
      }

      if (!fs.exists(subDomainNameToMetadata)) {
        LOG.info("Generating SubDomain Name to Metadata Index");
        Path subDomainMetadataTemp = new Path(CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".")
            + "/subDomainMetadataNAMETOMETADATA-" + candidateTimestamp);
        if (buildSubDomainNameToMetadataIndex(candidateTimestamp, subDomainStatsRaw, subDomainMetadataTemp)) {
          fs.delete(subDomainNameToMetadata, true);
          LOG.info("Creating subDomainStringIndex file at:" + subDomainNameToMetadata);
          fs.rename(subDomainMetadataTemp, subDomainNameToMetadata);
        }
      }

      /*
       * // Phase 3 .. build a list of domains sorted by subdomain
       * Path phase3OutputDir = new Path(CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".")
       *     + "/subDomains-Intermediate" + candidateTimestamp);
       * if (!fs.exists(phase3OutputDir)) {
       *   LOG.info("Running Phase 3 - Generate Intermediate Sub Domain List");
       *   buildIntermediateDomainList(urlSeedPath, phase3OutputDir);
       * }
       */

      Path phase4OutputDir = new Path(CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".")
          + "/subDomains-Consolidated" + candidateTimestamp);
    }
  }

  public static class TFileBugWorkaroundDomainHashAndURLHashComparator implements RawComparator<Object> {

    DataInputBuffer stream1 = new DataInputBuffer();
    DataInputBuffer stream2 = new DataInputBuffer();

    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
      if (l1 == 0 && l2 != 0)
        return -1;
      else if (l1 != 0 && l2 == 0)
        return 1;
      else {
        try {
          stream1.reset(b1, s1, l1);
          stream2.reset(b2, s2, l2);

          long domainHash1 = stream1.readLong();
          long domainHash2 = stream2.readLong();

          int result = (domainHash1 == domainHash2) ? 0 : (domainHash1 < domainHash2) ? -1 : 1;

          if (result == 0) {
            long urlHash1 = stream1.readLong();
            long urlHash2 = stream2.readLong();

            result = (urlHash1 == urlHash2) ? 0 : (urlHash1 < urlHash2) ? -1 : 1;
          }
          return result;
        } catch (IOException e) {
          LOG.error(CCStringUtils.stringifyException(e));
          throw new RuntimeException("Unexpected IOException in RawComparator!!");
        }
      }
    }

    @Override
    public int compare(Object o1, Object o2) {
      throw new RuntimeException("Not Supported!");
    }
  }

  public static class TFileBugWorkaroundLongWritableComparator extends LongWritable.Comparator {
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
      if (l1 == 0 && l2 != 0)
        return -1;
      else if (l1 != 0 && l2 == 0)
        return 1;
      else
        return super.compare(b1, s1, l1, b2, s2, l2);
    }
  }

  private boolean buildOldS3ArchiveInfo(long candidateTimestamp, Path crawlDBPath, Path outputPath) {
    try {
      FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
      Configuration conf = CrawlEnvironment.getHadoopConfig();

      fs.delete(outputPath, true);

      Path searchPattern = new Path("crawl/s3metadata/*/part-[0-9]*");

      JobConf job = new JobConf(conf);

      job.setJobName(getJobDescription() + " - Generate ArchiveInfo for S3Data");

      // set node affinity ...
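      // The affinity mask maps each reduce partition onto the node that already hosts the
      // corresponding crawl db shard, presumably so the per-shard output lands next to the data
      // it will later be merged with (the same NodeAffinityMaskBuilder setup is used by the
      // consolidation job below).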
      String affinityMask = NodeAffinityMaskBuilder.buildNodeAffinityMask(FileSystem.get(job), crawlDBPath, null);
      NodeAffinityMaskBuilder.setNodeAffinityMask(job, affinityMask);

      // add parts
      FileStatus candidates[] = fs.globStatus(searchPattern);
      for (FileStatus candidate : candidates) {
        if (!candidate.getPath().toString().endsWith(".log")) {
          LOG.info("Adding Path:" + candidate.getPath());
          FileInputFormat.addInputPath(job, candidate.getPath());
        }
      }

      job.setInputFormat(SequenceFileInputFormat.class);
      job.setMapOutputKeyClass(URLFPV2.class);
      job.setMapOutputValueClass(ArchiveInfo.class);
      job.setMapperClass(S3MetadataMapper.class);
      job.setOutputKeyClass(URLFPV2.class);
      job.setOutputValueClass(ArchiveInfo.class);
      job.setOutputFormat(SequenceFileOutputFormat.class);
      job.setReducerClass(IdentityReducer.class);
      job.setNumTasksToExecutePerJvm(1000);

      FileOutputFormat.setOutputPath(job, outputPath);
      job.setNumReduceTasks(CrawlEnvironment.NUM_DB_SHARDS);

      LOG.info("Running " + getJobDescription() + " OutputDir:" + outputPath);
      JobClient.runJob(job);
      LOG.info("Finished Running " + getJobDescription() + " OutputDir:" + outputPath);

      return true;
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      return false;
    }
  }

  public static class S3MetadataMapper implements Mapper<Text, CrawlURLMetadata, URLFPV2, ArchiveInfo> {

    enum Counters {
      BAD_URL, NO_FILENAME_IN_ARCHIVE_INFO
    }

    @Override
    public void map(Text key, CrawlURLMetadata metadata, OutputCollector<URLFPV2, ArchiveInfo> output,
        Reporter reporter) throws IOException {
      // map key to canonical url fp
      URLFPV2 fp = URLUtils.getURLFPV2FromURL(key.toString());
      if (fp != null) {
        ArchiveInfo archiveInfo = new ArchiveInfo();

        archiveInfo.setArcfileOffset(metadata.getArcFileOffset());
        // grab date from arc file name
        if (metadata.isFieldDirty(CrawlURLMetadata.Field_ARCFILENAME)) {
          int indexOfForwardSlash = metadata.getArcFileName().lastIndexOf('/');
          int indexOfUnderscore = metadata.getArcFileName().indexOf('_');
          int indexOfDot = metadata.getArcFileName().indexOf('.');

          String timestampComponent = metadata.getArcFileName().substring(indexOfForwardSlash + 1, indexOfUnderscore);
          String idComponent = metadata.getArcFileName().substring(indexOfUnderscore + 1, indexOfDot);

          archiveInfo.setArcfileDate(Long.parseLong(timestampComponent));
          archiveInfo.setArcfileIndex(Integer.parseInt(idComponent));
          archiveInfo.setCrawlNumber(1);

          output.collect(fp, archiveInfo);
        } else {
          reporter.incrCounter(Counters.NO_FILENAME_IN_ARCHIVE_INFO, 1);
        }
      } else {
        reporter.incrCounter(Counters.BAD_URL, 1);
      }
    }

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void close() throws IOException {
    }
  }

  private boolean buildSubDomainIdToMetadataIndex(long candidateTimestamp, Path subDomainStatsRaw,
      Path subDomainFinalTemp) {
    try {
      FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
      Configuration conf = CrawlEnvironment.getHadoopConfig();

      fs.delete(subDomainFinalTemp, true);

      JobConf job = new JobConf(conf);

      job.setJobName(getJobDescription() + " - Generate SubDomain TFile");

      // add seed db as input
      FileInputFormat.addInputPath(job, subDomainStatsRaw);

      job.setInputFormat(SequenceFileInputFormat.class);
      job.setMapOutputKeyClass(LongWritable.class);
      job.setMapOutputValueClass(SubDomainMetadata.class);
      job.setMapperClass(IdentityMapper.class);
      job.setOutputKeyClass(NullWritable.class);
      job.setOutputValueClass(NullWritable.class);
      job.setOutputFormat(NullOutputFormat.class);
      job.setReducerClass(SubDomainMetadataDomainIdToIndexWriter.class);
      job.setNumTasksToExecutePerJvm(1000);
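      // Note: the output format is NullOutputFormat because SubDomainMetadataDomainIdToIndexWriter
      // writes its TFiles directly under the task work directory; the setOutputPath call below only
      // establishes the directory that FileOutputFormat.getWorkOutputPath() resolves against.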
FileOutputFormat.setOutputPath(job,subDomainFinalTemp); job.setNumReduceTasks(CrawlEnvironment.NUM_DB_SHARDS); LOG.info("Running " + getJobDescription() + " OutputDir:" + subDomainFinalTemp); JobClient.runJob(job); LOG.info("Finished Running " + getJobDescription() + " OutputDir:" + subDomainFinalTemp); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); return false; } return true; } private boolean buildSubDomainNameToMetadataIndex(long candidateTimestamp, Path subDomainStatsRaw, Path subDomainFinalTemp) { try { FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); Configuration conf = CrawlEnvironment.getHadoopConfig(); fs.delete(subDomainFinalTemp, true); JobConf job = new JobConf(conf); job.setJobName(getJobDescription() + " - Generate SubDomain Name to Metadata Index"); // add seed db as input FileInputFormat.addInputPath(job,subDomainStatsRaw); job.setInputFormat(SequenceFileInputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(SubDomainMetadata.class); job.setMapperClass(SubDomainStatsToNameMapper.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); job.setOutputFormat(NullOutputFormat.class); job.setReducerClass(SubDomainMetadataNameToIndexWriter.class); job.setNumTasksToExecutePerJvm(1000); FileOutputFormat.setOutputPath(job,subDomainFinalTemp); job.setNumReduceTasks(CrawlEnvironment.NUM_DB_SHARDS); LOG.info("Running " + getJobDescription() + " OutputDir:" + subDomainFinalTemp); JobClient.runJob(job); LOG.info("Finished Running " + getJobDescription() + " OutputDir:" + subDomainFinalTemp); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); return false; } return true; } public static class SubDomainStatsToNameMapper implements Mapper<LongWritable, SubDomainMetadata, Text, SubDomainMetadata> { @Override public void map(LongWritable key, SubDomainMetadata metadata, OutputCollector<Text, SubDomainMetadata> output, Reporter reporter) throws IOException { String domainName = metadata.getDomainText(); String normalized = URLUtils.normalizeHostName(domainName, true); if (normalized != null) { domainName = normalized; } output.collect(new Text(domainName), metadata); } @Override public void configure(JobConf job) { } @Override public void close() throws IOException { // TODO Auto-generated method stub } } public static class SubDomainMetadataNameToIndexWriter implements Reducer<Text, SubDomainMetadata, NullWritable, NullWritable> { JobConf _conf; FileSystem _fs; TFile.Writer _writer = null; FSDataOutputStream _outputStream = null; DataOutputBuffer _keyStream = new DataOutputBuffer(); DataOutputBuffer _valueStream = new DataOutputBuffer(); private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); static { NUMBER_FORMAT.setMinimumIntegerDigits(5); NUMBER_FORMAT.setGroupingUsed(false); } @Override public void reduce(Text key, Iterator<SubDomainMetadata> values, OutputCollector<NullWritable, NullWritable> output, Reporter reporter) throws IOException { if (_outputStream == null || _writer == null) { throw new IOException("Streams not initialized!"); } SubDomainMetadata firstValue = null; try { firstValue = (SubDomainMetadata) values.next().clone(); } catch (CloneNotSupportedException e) { } while (values.hasNext()) { SubDomainMetadata nextValue = values.next(); if (firstValue.getDomainText().length() == 0 && nextValue.getDomainText().length() != 0) { firstValue.setDomainText(nextValue.getDomainText()); } firstValue.setUrlCount(firstValue.getUrlCount() 
+ nextValue.getUrlCount()); firstValue.setUnfetchedCount(firstValue.getUnfetchedCount() + nextValue.getUnfetchedCount()); firstValue.setFetchedCount(firstValue.getFetchedCount() + nextValue.getFetchedCount()); firstValue.setGoneCount(firstValue.getGoneCount() + nextValue.getGoneCount()); firstValue.setRedirectTemporaryCount(firstValue.getRedirectTemporaryCount() + nextValue.getRedirectTemporaryCount()); firstValue.setRedirectPermCount(firstValue.getRedirectPermCount() + nextValue.getRedirectPermCount()); firstValue.setUnmodifiedCount(firstValue.getUnmodifiedCount() + nextValue.getUnmodifiedCount()); firstValue.setHasPageRankCount(firstValue.getHasPageRankCount() + nextValue.getHasPageRankCount()); firstValue.setHasLinkListCount(firstValue.getHasLinkListCount() + nextValue.getHasLinkListCount()); firstValue.setHasInverseLinkListCount(firstValue.getHasInverseLinkListCount() + nextValue.getHasInverseLinkListCount()); firstValue.setHasArcFileInfoCount(firstValue.getHasArcFileInfoCount() + nextValue.getHasArcFileInfoCount()); firstValue.setHasParseSegmentInfoCount(firstValue.getHasParseSegmentInfoCount() + nextValue.getHasParseSegmentInfoCount()); firstValue.setHasSignatureCount(firstValue.getHasSignatureCount() + nextValue.getHasSignatureCount()); firstValue.setLatestFetchTime(Math.max(firstValue.getLatestFetchTime(), nextValue.getLatestFetchTime())); } _keyStream.reset(); _valueStream.reset(); key.write(_keyStream); firstValue.write(_valueStream); _writer.append(_keyStream.getData(), 0, _keyStream.getLength(), _valueStream.getData(), 0, _valueStream .getLength()); } @Override public void configure(JobConf job) { _conf = job; try { _fs = FileSystem.get(_conf); int partitionNumber = job.getInt("mapred.task.partition", -1); Path outputPath = new Path(FileOutputFormat.getWorkOutputPath(_conf), "part-" + NUMBER_FORMAT.format(partitionNumber)); _outputStream = _fs.create(outputPath); _writer = new TFile.Writer(_outputStream, 64 * 1024, TFile.COMPRESSION_LZO, TFile.COMPARATOR_JCLASS + Text.Comparator.class.getName(), _conf); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } @Override public void close() throws IOException { _writer.close(); _outputStream.close(); } } public static class SubDomainMetadataDomainIdToIndexWriter implements Reducer<LongWritable, SubDomainMetadata, NullWritable, NullWritable> { JobConf _conf; FileSystem _fs; TFile.Writer _writer = null; TFile.Writer _stringIndexWriter = null; FSDataOutputStream _outputStream = null; FSDataOutputStream _stringIndexOutputStream = null; DataOutputBuffer _keyStream = new DataOutputBuffer(); DataOutputBuffer _valueStream = new DataOutputBuffer(); DataOutputBuffer _stringsValueStream = new DataOutputBuffer(); private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); static { NUMBER_FORMAT.setMinimumIntegerDigits(5); NUMBER_FORMAT.setGroupingUsed(false); } @Override public void reduce(LongWritable key, Iterator<SubDomainMetadata> values, OutputCollector<NullWritable, NullWritable> output, Reporter reporter) throws IOException { if (_outputStream == null || _writer == null) { throw new IOException("Streams not initialized!"); } SubDomainMetadata firstValue = null; try { firstValue = (SubDomainMetadata) values.next().clone(); } catch (CloneNotSupportedException e) { } while (values.hasNext()) { SubDomainMetadata nextValue = values.next(); if (firstValue.getDomainText().length() == 0 && nextValue.getDomainText().length() != 0) { firstValue.setDomainText(nextValue.getDomainText()); } 
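        // Fold duplicate partial records for the same domain id into a single record by summing each
        // per-status / per-feature count field and keeping the most recent fetch time.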
        firstValue.setUrlCount(firstValue.getUrlCount() + nextValue.getUrlCount());
        firstValue.setUnfetchedCount(firstValue.getUnfetchedCount() + nextValue.getUnfetchedCount());
        firstValue.setFetchedCount(firstValue.getFetchedCount() + nextValue.getFetchedCount());
        firstValue.setGoneCount(firstValue.getGoneCount() + nextValue.getGoneCount());
        firstValue.setRedirectTemporaryCount(firstValue.getRedirectTemporaryCount() + nextValue.getRedirectTemporaryCount());
        firstValue.setRedirectPermCount(firstValue.getRedirectPermCount() + nextValue.getRedirectPermCount());
        firstValue.setUnmodifiedCount(firstValue.getUnmodifiedCount() + nextValue.getUnmodifiedCount());
        firstValue.setHasPageRankCount(firstValue.getHasPageRankCount() + nextValue.getHasPageRankCount());
        firstValue.setHasLinkListCount(firstValue.getHasLinkListCount() + nextValue.getHasLinkListCount());
        firstValue.setHasInverseLinkListCount(firstValue.getHasInverseLinkListCount() + nextValue.getHasInverseLinkListCount());
        firstValue.setHasArcFileInfoCount(firstValue.getHasArcFileInfoCount() + nextValue.getHasArcFileInfoCount());
        firstValue.setHasParseSegmentInfoCount(firstValue.getHasParseSegmentInfoCount() + nextValue.getHasParseSegmentInfoCount());
        firstValue.setHasSignatureCount(firstValue.getHasSignatureCount() + nextValue.getHasSignatureCount());
        firstValue.setLatestFetchTime(Math.max(firstValue.getLatestFetchTime(), nextValue.getLatestFetchTime()));
      }

      _keyStream.reset();
      _valueStream.reset();
      _stringsValueStream.reset();

      key.write(_keyStream);
      firstValue.write(_valueStream);

      String domainNameOut = firstValue.getDomainText();
      // now normalize it
      String normalizedName = URLUtils.normalizeHostName(domainNameOut, true);
      if (normalizedName != null) {
        domainNameOut = normalizedName;
      }
      _stringsValueStream.writeUTF(domainNameOut);

      LOG.info("Key:" + key.get());

      _writer.append(_keyStream.getData(), 0, _keyStream.getLength(), _valueStream.getData(), 0,
          _valueStream.getLength());
      _stringIndexWriter.append(_keyStream.getData(), 0, _keyStream.getLength(), _stringsValueStream.getData(), 0,
          _stringsValueStream.getLength());
    }

    @Override
    public void configure(JobConf job) {
      _conf = job;
      try {
        _fs = FileSystem.get(_conf);

        int partitionNumber = job.getInt("mapred.task.partition", -1);

        Path outputPath = new Path(FileOutputFormat.getWorkOutputPath(_conf), "part-"
            + NUMBER_FORMAT.format(partitionNumber));
        _outputStream = _fs.create(outputPath);
        _writer = new TFile.Writer(_outputStream, 64 * 1024, TFile.COMPRESSION_LZO, TFile.COMPARATOR_JCLASS
            + TFileBugWorkaroundLongWritableComparator.class.getName(), _conf);

        Path stringsOutputPath = new Path(FileOutputFormat.getWorkOutputPath(_conf), "strings-part-"
            + NUMBER_FORMAT.format(partitionNumber));
        _stringIndexOutputStream = _fs.create(stringsOutputPath);
        _stringIndexWriter = new TFile.Writer(_stringIndexOutputStream, 64 * 1024, TFile.COMPRESSION_LZO,
            TFile.COMPARATOR_JCLASS + TFileBugWorkaroundLongWritableComparator.class.getName(), _conf);
      } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
      }
    }

    @Override
    public void close() throws IOException {
      _writer.close();
      _outputStream.close();
      _stringIndexWriter.close();
      _stringIndexOutputStream.close();
    }
  }

  private void buildIntermediateDomainList(Path urlSeedPath, Path outputPath) throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
    Configuration conf = CrawlEnvironment.getHadoopConfig();

    // delete existing output directory if it exists ...
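    // NOTE: this helper appears to be referenced only from the commented-out Phase 3 block in
    // runJob(); it is kept here for reference but is not exercised by the current pipeline.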
    fs.delete(outputPath, true);

    JobConf job = new JobConf(conf);

    job.setJobName(getJobDescription() + " - Generate Domain List Intermediate");

    // add seed db as input
    FileInputFormat.addInputPath(job, urlSeedPath);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setMapperClass(SeedDatabaseToHostNameMapper.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    job.setCombinerClass(SeedDatabaseToHostNameCombiner.class);
    job.setReducerClass(SeedDatabaseToHostNameReducer.class);
    job.setNumTasksToExecutePerJvm(1000);

    FileOutputFormat.setOutputPath(job, outputPath);

    LOG.info("Running " + getJobDescription() + " OutputDir:" + outputPath);
    JobClient.runJob(job);
    LOG.info("Finished Running " + getJobDescription() + " OutputDir:" + outputPath);
  }

  public static class SeedDatabaseToHostNameCombiner implements Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
        Reporter reporter) throws IOException {
      output.collect(key, values.next());
    }

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void close() throws IOException {
    }
  }

  public static class SeedDatabaseToHostNameReducer implements Reducer<Text, IntWritable, LongWritable, Text> {

    @Override
    public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<LongWritable, Text> output,
        Reporter reporter) throws IOException {
      long fingerprint = URLFingerprint.generate64BitURLFPrint(key.toString());
      output.collect(new LongWritable(fingerprint), key);
    }

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void close() throws IOException {
    }
  }

  public static class SeedDatabaseToHostNameMapper implements Mapper<URLFPV2, LongTextBytesTuple, Text, IntWritable> {

    IntWritable valueOut = new IntWritable(1);

    @Override
    public void map(URLFPV2 key, LongTextBytesTuple value, OutputCollector<Text, IntWritable> output,
        Reporter reporter) throws IOException {
      // ok get the url from value
      String url = value.getTextValueBytes().toString();
      // now extract components
      GoogleURL urlObject = new GoogleURL(url);
      // get host IF valid
      if (urlObject.isValid() && urlObject.getHost().length() != 0) {
        String host = urlObject.getHost();
        output.collect(new Text(host), valueOut);
      }
    }

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void close() throws IOException {
    }
  }

  /**
   * Build a consolidated Metadata Index
   *
   * @param rootTimestamp timestamp of the candidate metadata db generation
   * @param crawlDBPath   path to the crawl db shards to merge
   * @param s3Metadata    path to the collected S3 ArchiveInfo records
   * @param outputPath    destination for the consolidated index
   * @return true if the merge job completed successfully
   */
  private boolean buildConsolidatedMetadataIndex(long rootTimestamp, Path crawlDBPath, Path s3Metadata,
      Path outputPath) {
    try {
      FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
      Configuration conf = CrawlEnvironment.getHadoopConfig();

      // delete existing output directory if it exists ...
      fs.delete(outputPath, true);

      JobConf job = new JobConf(conf);

      job.setJobName(getJobDescription() + " - Merge Metadata");

      // add prior job outputs
      // FileInputFormat.addInputPath(job,urldbSeedPath);
      FileInputFormat.addInputPath(job, crawlDBPath);
      FileInputFormat.addInputPath(job, s3Metadata);

      // set node affinity ...
      String affinityMask = NodeAffinityMaskBuilder.buildNodeAffinityMask(FileSystem.get(job), crawlDBPath, null);
      NodeAffinityMaskBuilder.setNodeAffinityMask(job, affinityMask);

      // set root timestamp in job ...
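      // the reducer reads this back in ConsolidatedMetadataIndexWriter.configure() via
      // job.getLong("root.timestamp", -1)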
job.setLong("root.timestamp", rootTimestamp); // multi file merger job.setInputFormat(MultiFileMergeInputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(Text.class); job.setMapperClass(IdentityMapper.class); job.setReducerClass(ConsolidatedMetadataIndexWriter.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); job.setOutputFormat(NullOutputFormat.class); job.setPartitionerClass(MultiFileMergePartitioner.class); FileOutputFormat.setOutputPath(job,outputPath); job.setNumReduceTasks(92); job.setNumTasksToExecutePerJvm(1000); LOG.info("Running " + getJobDescription() + " OutputDir:" + outputPath); JobClient.runJob(job); LOG.info("Finished Running " + getJobDescription() + " OutputDir:" + outputPath); return true; } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); return false; } } static Text HACK_URL_KEY = new Text("HACK_URL_KEY"); private void extractMetadataFromCrawlDB(Path affinityPath, Path outputPath) throws IOException { FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); Configuration conf = CrawlEnvironment.getHadoopConfig(); JobConf job = new JobConf(conf); job.setJobName(getJobDescription() + " - Phase 1"); Path crawlDBPath = new Path("crawl/crawldb/current"); // add merged link db as input FileInputFormat.addInputPath(job,crawlDBPath); // set node affinity String nodeAffinityMask = NodeAffinityMaskBuilder.buildNodeAffinityMask(fs, affinityPath, null); LOG.info("Using NodeAffinityMask:" + nodeAffinityMask); NodeAffinityMaskBuilder.setNodeAffinityMask(job, nodeAffinityMask); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(FingerprintToDatumMapper.class); job.setMapOutputKeyClass(URLFPV2.class); job.setMapOutputValueClass(CrawlDatum.class); job.setReducerClass(IdentityReducer.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(URLFPV2.class); job.setOutputValueClass(CrawlDatum.class); FileOutputFormat.setOutputPath(job,outputPath); job.setNumReduceTasks(92); job.setNumTasksToExecutePerJvm(1000); job.setCompressMapOutput(false); LOG.info("Running " + job.getJobName()); JobClient.runJob(job); LOG.info("Finished Running" + job.getJobName()); } public static class FingerprintToDatumMapper implements Mapper<Text, CrawlDatum, URLFPV2, CrawlDatum> { @Override public void map(Text key, CrawlDatum value, OutputCollector<URLFPV2, CrawlDatum> output, Reporter reporter) throws IOException { URLFPV2 fingerprint = URLUtils.getURLFPV2FromURL(key.toString()); if (fingerprint != null) { value.getMetaData().put(HACK_URL_KEY, key); output.collect(fingerprint, value); } } @Override public void configure(JobConf job) { // TODO Auto-generated method stub } @Override public void close() throws IOException { // TODO Auto-generated method stub } } private Vector<Long> gatherDatabaseTimestamps(Path rootPath) throws IOException { Vector<Long> timestampsOut = new Vector<Long>(); FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); FileStatus candidates[] = fs.globStatus(new Path(rootPath, "*")); for (FileStatus candidate : candidates) { LOG.info("Found Seed Candidate:" + candidate.getPath()); long timestamp = Long.parseLong(candidate.getPath().getName()); timestampsOut.add(timestamp); } Collections.sort(timestampsOut); return timestampsOut; } /** * */ public static class ConsolidatedMetadataIndexWriter implements Reducer<IntWritable, Text, NullWritable, NullWritable> { JobConf _conf; FileSystem _fs; long _rootTimestamp; private static final 
NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); static { NUMBER_FORMAT.setMinimumIntegerDigits(5); NUMBER_FORMAT.setGroupingUsed(false); } enum Counters { HAD_ONLY_METADATA, HAD_ONLY_PR, HAD_ONLY_LINKDATA, HAD_ONLY_INVERSELINKDATA, HAD_METADATA_AND_PR, HAD_METADATA_AND_LINKDATA, HAD_METADATA_AND_INVERSELINKDATA, HAD_METADATA_AND_ALL_LINKDATA, HAD_ALL_DATA, HAD_METADATA_PR_AND_LINKDATA, HAD_METADATA_PR_AND_INVERSE_LINKDATA, HAD_LINKDATA_AND_INVERSE_LINKDATA, HAD_PR_LINKDATA_AND_INVERSE_LINKDATA, HAD_ONLY_URL, HAD_REDIRECT_LOCATION_IN_METADATA, FOUND_S3_ARCHIVEINFO, FOUND_CRAWLONE_ARCHIVE_INFO, FOUND_CURRENTCRAWL_ARCHIVE_INFO, FOUND_UNKNOWNCRAWL_ARCHIVE_INFO, URLDB_RECORD_WITHOUT_URL, HAD_URL, WROTE_SINGLE_RECORD, HAD_INV_LINKDB_DATA, HAD_LINKDB_DATA, HAD_PAGERANK } static final String FP_TO_PR_PATTERN = "fpToPR"; static final String LINKDB_PATTERN = "linkdb/merged"; static final String INVERSE_LINKDB_PATTERN = "inverse_linkdb/merged"; static final String CRAWLDB_METADATA_PATTERN = "/urlMetadata/seed"; static final String S3_METADATA_PATTERN = "/s3Metadata/"; static final String URLDB_PATTERN = "/crawl/crawldb_new"; static final int HAD_METADATA = 1 << 0; static final int HAD_PR = 1 << 1; static final int HAD_LINKDATA = 1 << 2; static final int HAD_INVERSE_LINKDATA = 1 << 3; static final int HAD_URL = 1 << 4; static final int MASK_HAD_METADATA_AND_PR = HAD_URL | HAD_METADATA | HAD_PR; static final int MASK_HAD_METADATA_AND_LINKDATA = HAD_URL | HAD_METADATA | HAD_LINKDATA; static final int MASK_HAD_METADATA_AND_INVERSE_LINKDATA = HAD_URL | HAD_METADATA | HAD_INVERSE_LINKDATA; static final int MASK_HAD_METADATA_AND_ALL_LINKDATA = HAD_URL | HAD_METADATA | HAD_LINKDATA | HAD_INVERSE_LINKDATA; static final int MASK_HAD_METADATA_PR_AND_LINKDATA = HAD_URL | HAD_METADATA | HAD_LINKDATA | HAD_PR; static final int MASK_HAD_METADATA_PR_AND_INVERSE_LINKDATA = HAD_URL | HAD_METADATA | HAD_INVERSE_LINKDATA | HAD_PR; static final int MASK_HAD_ALL_DATA = HAD_URL | HAD_METADATA | HAD_LINKDATA | HAD_INVERSE_LINKDATA | HAD_PR; static final int MASK_HAD_LINKDATA_AND_INVERSE_LINKDATA = HAD_LINKDATA | HAD_INVERSE_LINKDATA; static final int MASK_HAD_PR_AND_LINKDATA_AND_INVERSE_LINKDATA = HAD_PR | HAD_LINKDATA | HAD_INVERSE_LINKDATA; @Override public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<NullWritable, NullWritable> output, Reporter reporter) throws IOException { // collect all incoming paths first Vector<Path> incomingPaths = new Vector<Path>(); while (values.hasNext()) { String path = values.next().toString(); LOG.info("Found Incoming Path:" + path); incomingPaths.add(new Path(path)); } // set up merge attributes JobConf localMergeConfig = new JobConf(_conf); localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, URLFPV2RawComparator.class, RawComparator.class); localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, URLFPV2.class, WritableComparable.class); // ok now spawn merger MultiFileInputReader<URLFPV2> multiFileInputReader = new MultiFileInputReader<URLFPV2>(_fs, incomingPaths, localMergeConfig); // now read one set of values at a time and output result KeyAndValueData<URLFPV2> keyValueData = null; DataOutputBuffer builderOutputBuffer = new DataOutputBuffer(); // create output paths ... 
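      // each reduce partition emits three files: a packed url metadata record file (.data), a lookup
      // index over it (.index), both written via CompressURLListV2.Builder, and a SequenceFile of
      // per-domain SubDomainMetadata rollups (.domainMetadata)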
// Path outputDataFile = new // Path(FileOutputFormat.getWorkOutputPath(_conf),"part-" + // NUMBER_FORMAT.format(key.get())); Path outputDataFile = new Path(FileOutputFormat.getWorkOutputPath(_conf), "part-" + NUMBER_FORMAT.format(key.get()) + ".data"); Path outputIndexFile = new Path(FileOutputFormat.getWorkOutputPath(_conf), "part-" + NUMBER_FORMAT.format(key.get()) + ".index"); Path domainMetadataIndexFile = new Path(FileOutputFormat.getWorkOutputPath(_conf), "part-" + NUMBER_FORMAT.format(key.get()) + ".domainMetadata"); LOG.info("Creating TFile Index at:" + outputDataFile); LOG.info("Creating DomainMetadata File at:" + domainMetadataIndexFile); // create output streams. ... // FSDataOutputStream dataStream = _fs.create(outputDataFile); FSDataOutputStream dataStream = _fs.create(outputDataFile); FSDataOutputStream indexStream = _fs.create(outputIndexFile); try { // and create tfile writer // TFile.Writer indexWriter = new TFile.Writer(dataStream,64 * // 1024,TFile.COMPRESSION_LZO,TFile.COMPARATOR_JCLASS + // TFileBugWorkaroundDomainHashAndURLHashComparator.class.getName(), // _conf); CompressURLListV2.Builder builder = new CompressURLListV2.Builder(indexStream, dataStream); try { // sub domain metadata writer SequenceFile.Writer domainMetadataWriter = SequenceFile.createWriter(_fs, _conf, domainMetadataIndexFile, LongWritable.class, SubDomainMetadata.class); try { DataOutputBuffer finalOutputBuffer = new DataOutputBuffer(); DataInputBuffer inputBuffer = new DataInputBuffer(); TriTextBytesTuple tupleOut = new TriTextBytesTuple(); DataOutputBuffer fastLookupBuffer = new DataOutputBuffer(); SubDomainMetadata metadata = null; // LongTextBytesTuple urlTuple = new LongTextBytesTuple(); DataOutputBuffer datumStream = new DataOutputBuffer(); DataOutputBuffer keyBuffer = new DataOutputBuffer(); TextBytes textValueBytes = new TextBytes(); // start reading merged values ... while ((keyValueData = multiFileInputReader.readNextItem()) != null) { // ok metadata we are going to write into CrawlDatumAndMetadata datumAndMetadata = new CrawlDatumAndMetadata(); TextBytes urlBytes = null; Vector<ArchiveInfo> s3Items = new Vector<ArchiveInfo>(); boolean dirty = false; int mask = 0; metadata = createOrFlushSubDomainMetadata(keyValueData._keyObject, metadata, domainMetadataWriter); // increment url count metadata.setUrlCount(metadata.getUrlCount() + 1); // walk values ... 
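            // each raw value is tagged with its source path; the path is used to decide whether the
            // record is an S3 ArchiveInfo or a crawldb CrawlDatumAndMetadata before deserializing it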
for (RawRecordValue value : keyValueData._values) { String path = value.source.toString(); inputBuffer.reset(value.data.getData(), value.data.getLength()); if (path.contains(S3_METADATA_PATTERN)) { ArchiveInfo s3ArchiveInfo = new ArchiveInfo(); s3ArchiveInfo.readFields(inputBuffer); s3Items.add(s3ArchiveInfo); reporter.incrCounter(Counters.FOUND_S3_ARCHIVEINFO, 1); } else if (path.contains(URLDB_PATTERN)) { mask |= HAD_METADATA; datumAndMetadata.readFields(inputBuffer); urlBytes = datumAndMetadata.getUrlAsTextBytes(); if (urlBytes != null && urlBytes.getLength() != 0) { mask |= HAD_URL; if (!metadata.isFieldDirty(SubDomainMetadata.Field_DOMAINTEXT)) { String url = urlBytes.toString(); String domain = URLUtils.fastGetHostFromURL(url); if (domain != null && domain.length() != 0) { metadata.setDomainText(domain); } } // update subdomain metadata switch (datumAndMetadata.getStatus()) { case CrawlDatum.STATUS_DB_UNFETCHED: metadata.setUnfetchedCount(metadata.getUnfetchedCount() + 1); break; case CrawlDatum.STATUS_DB_FETCHED: metadata.setFetchedCount(metadata.getFetchedCount() + 1); break; case CrawlDatum.STATUS_DB_GONE: metadata.setGoneCount(metadata.getGoneCount() + 1); break; case CrawlDatum.STATUS_DB_REDIR_TEMP: metadata.setRedirectTemporaryCount(metadata.getRedirectTemporaryCount() + 1); break; case CrawlDatum.STATUS_DB_REDIR_PERM: metadata.setRedirectPermCount(metadata.getRedirectPermCount() + 1); break; case CrawlDatum.STATUS_DB_NOTMODIFIED: metadata.setUnmodifiedCount(metadata.getUnmodifiedCount() + 1); break; } // update fetch time stats metadata.setLatestFetchTime(Math .max(metadata.getLatestFetchTime(), datumAndMetadata.getFetchTime())); CrawlURLMetadata metadataObj = (CrawlURLMetadata) datumAndMetadata.getMetadata(); // clear some invalid fields ... metadataObj.setFieldClean(CrawlURLMetadata.Field_SIGNATURE); metadataObj.setFieldClean(CrawlURLMetadata.Field_HOSTFP); metadataObj.setFieldClean(CrawlURLMetadata.Field_URLFP); metadataObj.setFieldClean(CrawlURLMetadata.Field_CONTENTFILESEGNO); metadataObj.setFieldClean(CrawlURLMetadata.Field_CONTENTFILENAMEANDPOS); metadataObj.setFieldClean(CrawlURLMetadata.Field_PARSEDATASEGNO); metadataObj.setFieldClean(CrawlURLMetadata.Field_CRAWLNUMBER); metadataObj.setFieldClean(CrawlURLMetadata.Field_PARSENUMBER); metadataObj.setFieldClean(CrawlURLMetadata.Field_UPLOADNUMBER); metadataObj.setFieldClean(CrawlURLMetadata.Field_ARCFILEDATE); metadataObj.setFieldClean(CrawlURLMetadata.Field_ARCFILEINDEX); metadataObj.setFieldClean(CrawlURLMetadata.Field_ARCFILENAME); metadataObj.setFieldClean(CrawlURLMetadata.Field_ARCFILEOFFSET); metadataObj.setFieldClean(CrawlURLMetadata.Field_ARCFILESIZE); metadataObj.setFieldClean(CrawlURLMetadata.Field_LINKDBTIMESTAMP); metadataObj.setFieldClean(CrawlURLMetadata.Field_INVERSEDBTIMESTAMP); metadataObj.setFieldClean(CrawlURLMetadata.Field_INVERSEDBEXTRADOMAININLINKCOUNT); metadataObj.setFieldClean(CrawlURLMetadata.Field_INVERSEDBINTRADOMAININLINKCOUNT); metadataObj.setFieldClean(CrawlURLMetadata.Field_PAGERANKTIMESTAMP); metadataObj.setFieldClean(CrawlURLMetadata.Field_PAGERANKVALUEOLD); metadataObj.getParseSegmentInfo().clear(); // clear url field as we store it in a separate location ... metadataObj.setFieldClean(CrawlDatumAndMetadata.Field_URL); // update sub domain stats ... 
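                  // bump the per-domain "has X" counters so the SubDomainMetadata rollup reflects how
                  // many urls in this domain carry arc file info, parse segment info, page rank, and
                  // link / inverse link data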
if (metadataObj.getArchiveInfo().size() != 0) { metadata.setHasArcFileInfoCount(metadata.getHasArcFileInfoCount() + 1); } if (metadataObj.getParseSegmentInfo().size() != 0) { metadata.setHasParseSegmentInfoCount(metadata.getHasParseSegmentInfoCount() + 1); } if (metadataObj.isFieldDirty(CrawlURLMetadata.Field_PAGERANK)) { reporter.incrCounter(Counters.HAD_PAGERANK, 1); metadata.setHasPageRankCount(metadata.getHasPageRankCount() + 1); } if (metadataObj.isFieldDirty(CrawlURLMetadata.Field_LINKDBFILENO)) { reporter.incrCounter(Counters.HAD_LINKDB_DATA, 1); metadata.setHasLinkListCount(metadata.getHasLinkListCount() + 1); } if (metadataObj.isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) { reporter.incrCounter(Counters.HAD_INV_LINKDB_DATA, 1); metadata.setHasInverseLinkListCount(metadata.getHasInverseLinkListCount() + 1); } dirty = true; } else { reporter.incrCounter(Counters.URLDB_RECORD_WITHOUT_URL, 1); } } } // URL IS VALID if (((mask & HAD_URL) != 0)) { reporter.incrCounter(Counters.HAD_URL, 1); // add any s3 archive information datumAndMetadata.getMetadata().getArchiveInfo().addAll(s3Items); // ok only keep last value archive info ArchiveInfo lastValidArchiveInfo = null; int archiveInfoCount = 0; for (ArchiveInfo archiveInfo : datumAndMetadata.getMetadata().getArchiveInfo()) { if (lastValidArchiveInfo == null || lastValidArchiveInfo.getArcfileDate() < archiveInfo.getArcfileDate()) { lastValidArchiveInfo = archiveInfo; } ++archiveInfoCount; } if (lastValidArchiveInfo != null) { // clear archive info datumAndMetadata.getMetadata().getArchiveInfo().clear(); datumAndMetadata.getMetadata().getArchiveInfo().add(lastValidArchiveInfo); if (lastValidArchiveInfo.getCrawlNumber() == 1) { reporter.incrCounter(Counters.FOUND_CRAWLONE_ARCHIVE_INFO, 1); } else if (lastValidArchiveInfo.getCrawlNumber() == CrawlEnvironment.getCurrentCrawlNumber()) { reporter.incrCounter(Counters.FOUND_CURRENTCRAWL_ARCHIVE_INFO, 1); } else { reporter.incrCounter(Counters.FOUND_UNKNOWNCRAWL_ARCHIVE_INFO, 1); } } // ok initialize tuple .. first value is url ... tupleOut.setFirstValue(new TextBytes(urlBytes)); if (dirty) { datumAndMetadata.setIsValid((byte) 1); // second value is special fastLookupBuffer.reset(); // write page rank value fastLookupBuffer.writeFloat(datumAndMetadata.getMetadata().getPageRank()); // write fetch status fastLookupBuffer.writeByte(datumAndMetadata.getStatus()); // protocol status fastLookupBuffer.writeByte(datumAndMetadata.getProtocolStatus()); // write fetch time fastLookupBuffer.writeLong(datumAndMetadata.getFetchTime()); // ok write this buffer into second tuple value tupleOut.getSecondValue().set(fastLookupBuffer.getData(), 0, fastLookupBuffer.getLength()); // ok write out datum and metadata to stream datumStream.reset(); datumAndMetadata.write(datumStream); // set third value in output tuple tupleOut.getThirdValue().set(datumStream.getData(), 0, datumStream.getLength()); } else { tupleOut.getSecondValue().clear(); tupleOut.getThirdValue().clear(); } // reset composite buffer finalOutputBuffer.reset(); // write tuple into it -- TODO: DOUBLE BUFFER COPIES SUCK!!! tupleOut.write(finalOutputBuffer); // write out key value keyBuffer.reset(); keyBuffer.writeLong(keyValueData._keyObject.getDomainHash()); keyBuffer.writeLong(keyValueData._keyObject.getUrlHash()); textValueBytes.set(finalOutputBuffer.getData(), 0, finalOutputBuffer.getLength()); // output final value to index builder ... 
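                // record layout: the key is the 16-byte (domainHash, urlHash) pair, the value is a
                // TriTextBytesTuple of (url, fixed-size fast-lookup fields, full CrawlDatumAndMetadata);
                // the TFile.Writer append below appears to have been superseded by
                // CompressURLListV2.Builder.addItem() and is left commented out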
/* * indexWriter.append( keyBuffer.getData(), 0, * keyBuffer.getLength(), finalOutputBuffer.getData(), 0, * finalOutputBuffer.getLength()); */ reporter.incrCounter(Counters.WROTE_SINGLE_RECORD, 1); builder.addItem(keyValueData._keyObject, textValueBytes); // update stats if (mask == HAD_URL) { reporter.incrCounter(Counters.HAD_ONLY_URL, 1); } else if (mask == HAD_METADATA) { reporter.incrCounter(Counters.HAD_ONLY_METADATA, 1); } else if (mask == HAD_PR) { reporter.incrCounter(Counters.HAD_ONLY_PR, 1); } else if (mask == HAD_LINKDATA) { reporter.incrCounter(Counters.HAD_ONLY_LINKDATA, 1); } else if (mask == HAD_INVERSE_LINKDATA) { reporter.incrCounter(Counters.HAD_ONLY_INVERSELINKDATA, 1); } else if (mask == MASK_HAD_METADATA_AND_PR) { reporter.incrCounter(Counters.HAD_METADATA_AND_PR, 1); } else if (mask == MASK_HAD_METADATA_AND_LINKDATA) { reporter.incrCounter(Counters.HAD_METADATA_AND_LINKDATA, 1); } else if (mask == MASK_HAD_METADATA_AND_INVERSE_LINKDATA) { reporter.incrCounter(Counters.HAD_METADATA_AND_INVERSELINKDATA, 1); } else if (mask == MASK_HAD_METADATA_AND_ALL_LINKDATA) { reporter.incrCounter(Counters.HAD_METADATA_AND_ALL_LINKDATA, 1); } else if (mask == MASK_HAD_METADATA_PR_AND_LINKDATA) { reporter.incrCounter(Counters.HAD_METADATA_PR_AND_LINKDATA, 1); } else if (mask == MASK_HAD_METADATA_PR_AND_INVERSE_LINKDATA) { reporter.incrCounter(Counters.HAD_METADATA_PR_AND_INVERSE_LINKDATA, 1); } else if (mask == MASK_HAD_ALL_DATA) { reporter.incrCounter(Counters.HAD_ALL_DATA, 1); } else if (mask == MASK_HAD_LINKDATA_AND_INVERSE_LINKDATA) { reporter.incrCounter(Counters.HAD_LINKDATA_AND_INVERSE_LINKDATA, 1); } else if (mask == MASK_HAD_PR_AND_LINKDATA_AND_INVERSE_LINKDATA) { reporter.incrCounter(Counters.HAD_PR_LINKDATA_AND_INVERSE_LINKDATA, 1); } } // report progress to keep reducer alive reporter.progress(); } // flush trailing domain metadata entry ... if (metadata != null) { domainMetadataWriter.append(new LongWritable(metadata.getDomainHash()), metadata); } } finally { domainMetadataWriter.close(); } } finally { // indexWriter.close(); builder.close(); } } finally { dataStream.close(); indexStream.close(); } } @Override public void configure(JobConf job) { _conf = job; try { _fs = FileSystem.get(_conf); _rootTimestamp = job.getLong("root.timestamp", -1); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } @Override public void close() throws IOException { // TODO Auto-generated method stub } private static SubDomainMetadata createOrFlushSubDomainMetadata(URLFPV2 currentFP, SubDomainMetadata metadata, SequenceFile.Writer writer) throws IOException { if (metadata != null) { // if current fingerprint is for a different subdomain .. if (currentFP.getDomainHash() != metadata.getDomainHash()) { // flush the existing record writer.append(new LongWritable(metadata.getDomainHash()), metadata); // null out metadata metadata = null; } } // if metadata is null ... 
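    // a null metadata (first record, or a record arriving after a domain change flushed the previous
    // entry) means a new per-domain rollup is starting; this relies on incoming keys arriving grouped
    // by domain hash, which should hold because the merge comparator orders records by domain hash first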
if (metadata == null) { // allocate a fresh new record for this new subdomain metadata = new SubDomainMetadata(); metadata.setDomainHash(currentFP.getDomainHash()); metadata.setRootDomainHash(currentFP.getRootDomainHash()); } return metadata; } } private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); static { NUMBER_FORMAT.setMinimumIntegerDigits(5); NUMBER_FORMAT.setGroupingUsed(false); } public static class TFileHolder { public FSDataInputStream _inputStream; public TFile.Reader _reader; public TFile.Reader.Scanner _scanner; } public static final int OUTER_ITERATION_COUNT = 2; private static void doRegExTest(int driveCount, long candidateTS, Configuration conf, FileSystem fs, String queryString) throws IOException { FileSystem remoteFS = FileSystem.get(conf); Pattern pattern = Pattern.compile(queryString); SubDomainMetadata metadata = new SubDomainMetadata(); for (int i = 0; i < 2; ++i) { int disk = i % driveCount; Path hdfsPath = new Path("crawl/metadatadb/" + candidateTS + "/subDomainMetadata/tfile/part-" + NUMBER_FORMAT.format(i)); File filePath = new File("/data/" + disk + "/subDomainMetadata/" + candidateTS + "/part-" + NUMBER_FORMAT.format(i)); for (int attempt = 0; attempt < 2; ++attempt) { FSDataInputStream inputStream = null; long length = 0; if (attempt == 0) { LOG.info("Scanning:" + filePath); byte[] dataBuffer = FileUtils.readFully(filePath); ByteBuffer wrapped = ByteBuffer.wrap(dataBuffer); inputStream = new FSDataInputStream(new FSByteBufferInputStream(wrapped)); length = filePath.length(); LOG.info("Using ByteBufferInputStream"); } else { LOG.info("Scanning:" + hdfsPath); inputStream = remoteFS.open(hdfsPath); FileStatus status = remoteFS.getFileStatus(hdfsPath); if (status != null) length = status.getLen(); LOG.info("Using HDFS Stream"); } long timeStart = System.currentTimeMillis(); // FSDataInputStream inputStream = fs.open(new // Path(filePath.getAbsolutePath())); try { TFile.Reader reader = new TFile.Reader(inputStream, length, conf); try { TFile.Reader.Scanner scanner = reader.createScanner(); try { while (!scanner.atEnd()) { DataInputStream stream = scanner.entry().getValueStream(); metadata.readFields(stream); String domainName = metadata.getDomainText(); if (pattern.matcher(domainName).matches()) { LOG.info("Matched:" + domainName); } scanner.advance(); } } finally { scanner.close(); } } finally { reader.close(); } } finally { inputStream.close(); } long timeEnd = System.currentTimeMillis(); LOG.info("Scan of File:" + filePath + " took:" + (timeEnd - timeStart) + " MS"); } } } }