package org.commoncrawl.mapred.pipelineV3.crawllistgen;
import java.io.IOException;
import java.net.URI;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.SegmentGeneratorBundleKey;
import org.commoncrawl.mapred.SegmentGeneratorItem;
import org.commoncrawl.mapred.SegmentGeneratorItemBundle;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBCommon;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergeInputFormat;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergePartitioner;
import org.commoncrawl.util.S3NFileSystem;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.Tuples.Pair;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
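/**
 * Pipeline step that merges the partitioned crawl list output (NewPartitionUrlsStep) with the
 * partitioned wikipedia url output (PartitionWikipediaUrlsStep) and rolls the merged urls into
 * per-root-domain SegmentGeneratorItemBundle records for segment generation.
 *
 * A run can be limited to a single list partition by setting bundlegen.singlePartition on the
 * root task's configuration before the step executes, for example (illustrative only):
 *
 *   conf.setInt(NewGenBundlesStep.SINGLE_PARTITION_PROPERTY, 5);
 *
 * in which case the job runs with a single reduce task.
 **/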
public class NewGenBundlesStep extends CrawlPipelineStep {
private static final Log LOG = LogFactory.getLog(NewGenBundlesStep.class);
public static final String OUTPUT_DIR_NAME = "bundlesGenerator";
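// sizing constants for the url dedupe bloom filter plus a flush threshold; these values are
// also defined inside BundleGenerator, which is where the filter is actually constructed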
static final int NUM_BITS = 11;
static final int NUM_ELEMENTS = 1 << 28;
static final int FLUSH_THRESHOLD = 1 << 23;
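/** maximum number of items buffered per root domain before they are spilled into a bundle **/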
public static final int SPILL_THRESHOLD = 250;
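/** counters emitted by the bundle generator reducer **/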
enum Counters {
SPILLED_1_MILLION_SKIPPED_REST,
DOMAIN_WITH_GT_10MILLION_URLS,
DOMAIN_WITH_GT_1MILLION_URLS,
DOMAIN_WITH_GT_100K_URLS,
DOMAIN_WITH_GT_50K_URLS,
DOMAIN_WITH_GT_10K_URLS,
DOMAIN_WITH_GT_1K_URLS,
DOMAIN_WITH_GT_100_URLS,
DOMAIN_WITH_GT_10_URLS,
DOMAIN_WITH_LT_10_URLS,
DOMAIN_WITH_1_URL,
INVALID_SCHEME,
INVALID_URL_OBJECT,
SKIPPING_ALREADY_EMITTED_URL,
NULL_FP_FOR_URL,
NO_SOURCE_URL_IN_JSON,
GENERATING_HOME_PAGE_URL,
EMITTING_URL_OBJECT,
GOT_RAW_RECORD_ITERATOR,
GET_NEXT_RECORD_FROM_MERGER,
GOT_RAW_RECORD_FROM_ITERATOR
}
public NewGenBundlesStep(CrawlPipelineTask task) {
super(task, "Generate Bundles", OUTPUT_DIR_NAME);
}
@Override
public Log getLogger() {
return LOG;
}
private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
static {
NUMBER_FORMAT.setMinimumIntegerDigits(5);
NUMBER_FORMAT.setGroupingUsed(false);
}
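/** configuration property that restricts bundle generation to a single list partition; -1 (the default) processes all partitions **/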
static final String SINGLE_PARTITION_PROPERTY = "bundlegen.singlePartition";
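/** collect the per-partition crawl list part files produced by NewPartitionUrlsStep; when a restricted partition id is supplied only that part file is added from each partition directory **/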
public void addCrawlListPaths(Configuration conf,int restrictedPartitionId,ArrayList<Path> pathsOut) throws IOException {
// get partitioned list path
Path partitionedListPath = getOutputDirForStep(NewPartitionUrlsStep.class);
FileSystem fs = FileSystem.get(partitionedListPath.toUri(),conf);
Path filterPath = new Path(partitionedListPath,"[0-9]*");
for (FileStatus partitionPath : fs.globStatus(filterPath)) {
if (restrictedPartitionId != -1) {
pathsOut.add(new Path(partitionPath.getPath(),"part-" + NUMBER_FORMAT.format(restrictedPartitionId)));
}
else {
for (FileStatus part : fs.globStatus(new Path(partitionPath.getPath(),"part-*"))) {
pathsOut.add(part.getPath());
}
}
}
}
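/** collect the partitioned wikipedia url files produced by PartitionWikipediaUrlsStep, optionally restricted to a single partition **/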
public void addWikipediaPaths(Configuration conf,int restrictedPartitionId,ArrayList<Path> pathsOut) throws IOException {
// get partitioned list path
Path wikipediaURLSPath = getOutputDirForStep(PartitionWikipediaUrlsStep.class);
FileSystem fs = FileSystem.get(wikipediaURLSPath.toUri(),conf);
if (restrictedPartitionId != -1) {
pathsOut.add(new Path(wikipediaURLSPath,"part-" + NUMBER_FORMAT.format(restrictedPartitionId)));
}
else {
for (FileStatus part : fs.globStatus(new Path(wikipediaURLSPath,"part-*"))) {
pathsOut.add(part.getPath());
}
}
}
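/** build and run the bundle generation job: the crawl list and wikipedia part files are fed through the multi-file merge input format and reduced by BundleGenerator into segment bundles **/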
@Override
public void runStep(Path outputPathLocation) throws IOException {
LOG.info("Task Identity Path is:" + getTaskIdentityPath());
LOG.info("Temp Path is:" + outputPathLocation);
DomainMetadataTask rootTask = (DomainMetadataTask) getRootTask();
Configuration conf = new Configuration(rootTask.getConf());
// check for restricted partition id ...
int restrictedPartitionId = rootTask.getConf().getInt(SINGLE_PARTITION_PROPERTY, -1);
// collect paths ...
ArrayList<Path> paths = new ArrayList<Path>();
addCrawlListPaths(conf,restrictedPartitionId,paths);
addWikipediaPaths(conf,restrictedPartitionId,paths);
JobConf jobConf = new JobBuilder("Generate Bundles", getConf())
.inputs(paths)
.inputFormat(MultiFileMergeInputFormat.class)
.mapperKeyValue(IntWritable.class, Text.class)
.outputKeyValue(SegmentGeneratorBundleKey.class, SegmentGeneratorItemBundle.class)
.outputIsSeqFile()
.reducer(BundleGenerator.class, false)
.partition(MultiFileMergePartitioner.class)
.speculativeExecution(false)
.output(outputPathLocation)
.compressMapOutput(false).compressor(CompressionType.BLOCK, SnappyCodec.class)
.build();
jobConf.setBoolean(MultiFileMergeInputFormat.PARTS_ARE_FILES_PROPERTY,true);
if (restrictedPartitionId != -1) {
jobConf.setNumReduceTasks(1);
}
else {
jobConf.setNumReduceTasks(CrawlListGenCommon.NUM_LIST_PARTITIONS);
}
LOG.info("Starting JOB");
JobClient.runJob(jobConf);
LOG.info("Finsihed JOB");
}
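/**
 * reducer that receives, per list partition, the set of input part files to merge, walks the
 * merged records in CrawlListKey order, and emits SegmentGeneratorItemBundle records grouped
 * by root domain and keyed by SegmentGeneratorBundleKey
 **/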
public static class BundleGenerator implements Reducer<IntWritable,Text,SegmentGeneratorBundleKey,SegmentGeneratorItemBundle> {
Configuration _conf;
boolean _skipDomain = false;
boolean _currentRootDomainIdValid = false;
long _currentRootDomainId = -1;
boolean _currentSubDomainIdValid = false;
boolean _genHomePageURLForSubDomain = false;
long _currentSubDomainId = -1;
int _currentRootDomainURLCount = 0;
int _currentRootDomainSpilledItemCount = 0;
// spill state ...
ArrayList<SegmentGeneratorItem> items = new ArrayList<SegmentGeneratorItem>();
int currentDomainCrawlIdx = -1;
SegmentGeneratorItemBundle currentBundle = null;
double accumulatedRank = 0.0;
int currentBundleId = 0;
OutputCollector<SegmentGeneratorBundleKey, SegmentGeneratorItemBundle> _collector = null;
int crawlerCount = CrawlEnvironment.NUM_CRAWLERS;
static final int NUM_HASH_FUNCTIONS = 10;
static final int NUM_BITS = 11;
static final int NUM_ELEMENTS = 1 << 28;
static final int FLUSH_THRESHOLD = 1 << 23;
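// bloom filter tracking url fingerprints already emitted by this reducer, used to skip duplicates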
URLFPBloomFilter emittedTuplesFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
long urlsInFilter = 0;
@Override
public void configure(JobConf job) {
_conf = job;
}
@Override
public void close() throws IOException {
}
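/** resolve the filesystem for a merge path, substituting the custom S3NFileSystem for s3n uris **/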
private static FileSystem getFileSystemForMergePath(Path path,Configuration conf)throws IOException {
// override S3N (null-safe in case the path has no scheme)
if ("s3n".equalsIgnoreCase(path.toUri().getScheme())) {
FileSystem fs = new S3NFileSystem();
fs.initialize(path.toUri(), conf);
return fs;
}
// conf.setClass("fs.s3n.impl", S3NFileSystem.class,FileSystem.class);
return FileSystem.get(path.toUri(),conf);
}
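/** collect the incoming part file paths from the reducer's values iterator and resolve the filesystem to read them from; all paths must share a single scheme **/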
@SuppressWarnings("resource")
Pair<FileSystem,List<Path>> buildInputPathList(Configuration conf,Iterator<Text> values)throws IOException {
// collect all incoming paths first
ArrayList<Path> incomingPaths = Lists.newArrayList();
Set<String> fsType = new HashSet<String>();
while(values.hasNext()){
String path = values.next().toString();
LOG.info("Found Incoming Path:" + path);
incomingPaths.add(new Path(path));
// convert to uri ...
URI uri = new Path(path).toUri();
// get scheme if present ...
String scheme = uri.getScheme();
if (scheme == null || scheme.length() == 0) {
fsType.add("default");
}
else {
fsType.add(scheme);
}
}
if (fsType.size() != 1) {
throw new IOException("Only One Input Scheme at a time supported!");
}
// determine filesystem
FileSystem fs = null;
if (fsType.contains("s3n")) {
fs = new S3NFileSystem();
fs.initialize(incomingPaths.get(0).toUri(), conf);
}
else {
fs = FileSystem.get(incomingPaths.get(0).toUri(), conf);
}
return new Pair<FileSystem, List<Path>>(fs,incomingPaths);
}
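/**
 * iterator over the raw merged records for a single key: deserializes each record's
 * CrawlListKey and TextBytes json value, and tracks the source path of the current record
 **/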
static class RawValueIterator implements Iterator<TextBytes> {
CrawlListKey key = new CrawlListKey();
TextBytes valueBytes = new TextBytes();
DataInputBuffer keyInputBuffer = new DataInputBuffer();
DataInputBuffer inputBuffer = new DataInputBuffer();
Path currentSource = null;
Iterator<RawRecordValue> rawIterator;
void reset(Iterable<RawRecordValue> rawIterable) {
this.rawIterator = rawIterable.iterator();
}
@Override
public boolean hasNext() {
return rawIterator.hasNext();
}
CrawlListKey currentKey() {
return key;
}
Path currentSource() {
return currentSource;
}
@Override
public TextBytes next(){
try {
RawRecordValue nextRawValue = rawIterator.next();
// read in text bytes key ...
keyInputBuffer.reset(nextRawValue.key.getData(),0,nextRawValue.key.getLength());
inputBuffer.reset(nextRawValue.data.getData(),0,nextRawValue.data.getLength());
int valueTextLen = WritableUtils.readVInt(inputBuffer);
valueBytes.set(nextRawValue.data.getData(),inputBuffer.getPosition(),valueTextLen);
key.readFields(keyInputBuffer);
currentSource = nextRawValue.source;
return valueBytes;
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
throw new RuntimeException(e);
}
}
@Override
public void remove() {
throw new UnsupportedOperationException("remove");
}
}
/** allocate a new, empty bundle for the given host fingerprint and make it the current bundle **/
private SegmentGeneratorItemBundle getBundleForDomain(long domainFP) throws IOException {
currentBundle = new SegmentGeneratorItemBundle();
currentBundle.setHostFP(domainFP);
return currentBundle;
}
/** generate a bundle from the given list of items and simultaneously flush it **/
private void generateABundle(long domainFP, List<SegmentGeneratorItem> items, Reporter reporter) throws IOException {
SegmentGeneratorItemBundle bundle = getBundleForDomain(domainFP);
// LOG.info("Generating Bundle:" + currentBundleId + " for DH:" + domainFP);
float maxPageRank = 0.0f;
for (SegmentGeneratorItem item : items) {
// LOG.info("URL:" + item.getUrl() + " Status:" +
// CrawlDatum.getStatusName(item.getStatus()) +" PR:" +
// item.getMetadata().getPageRank());
bundle.getUrls().add(item);
_currentRootDomainURLCount++;
maxPageRank = Math.max(maxPageRank, item.getPageRank());
// accumulate rank so flushCurrentBundle can compute the bundle's average page rank
accumulatedRank += item.getPageRank();
}
// LOG.info("Done Generating Bunlde - PR is:" + maxPageRank);
// set page rank for bundle
bundle.setMaxPageRank(maxPageRank);
flushCurrentBundle(reporter);
}
/** flush the currently active bundle **/
private void flushCurrentBundle(Reporter reporter) throws IOException {
if (currentBundle != null && currentBundle.getUrls().size() != 0) {
int crawlerIndex = (((Long)currentBundle.getHostFP()).hashCode() & Integer.MAX_VALUE) % crawlerCount;
// generate a bundle key
SegmentGeneratorBundleKey bundleKey = new SegmentGeneratorBundleKey();
bundleKey.setRecordType(0);
bundleKey.setCrawlerId(crawlerIndex);
bundleKey.setDomainFP(_currentRootDomainId);
// and increment bundle id ...
bundleKey.setBundleId(currentBundleId++);
bundleKey.setAvgPageRank((float) accumulatedRank / (float)currentBundle.getUrls().size());
if (reporter != null) {
reporter.incrCounter("CRAWLER_", Long.toString(crawlerIndex) + "_BUNDLE_COUNT", 1);
}
// ok spill bundle ...
_collector.collect(bundleKey, currentBundle);
}
// current bundle is now null
currentBundle = null;
accumulatedRank = 0.0;
}
/** spill cached items **/
private void spillItems(Reporter reporter) throws IOException {
// if item count exceeds spill threshold .. or we ran out of data ...
if (items.size() != 0) {
// LOG.info("Spilling Bundle:" + currentBundleId + " for DH:" +
// currentDomain + " ItemCount:" + subList.size());
// flush items
generateABundle(_currentRootDomainId, items, reporter);
if (reporter != null) {
reporter.progress();
}
// ok, increment counts ...
_currentRootDomainSpilledItemCount += items.size();
// NOTE: the one-million-item guard is commented out, so this counter fires on every spill;
// guard against a null reporter as the surrounding code does
//if (_currentRootDomainSpilledItemCount >= 1000000) {
if (reporter != null) {
reporter.incrCounter(Counters.SPILLED_1_MILLION_SKIPPED_REST, 1);
}
//_skipDomain = true;
//}
}
// reset list ...
items.clear();
}
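/** flush any buffered items for the current root domain, emit per-domain size counters, and reset the per-domain state **/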
private void flushRootDomain(Reporter reporter) throws IOException {
if (items.size() != 0) {
spillItems(reporter);
}
if (reporter != null) {
if (_currentRootDomainSpilledItemCount >= 10000000) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_10MILLION_URLS, 1);
} else if (_currentRootDomainSpilledItemCount >= 1000000) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_1MILLION_URLS, 1);
} else if (_currentRootDomainSpilledItemCount >= 100000) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_100K_URLS, 1);
} else if (_currentRootDomainSpilledItemCount >= 50000) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_50K_URLS, 1);
} else if (_currentRootDomainSpilledItemCount >= 10000) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_10K_URLS, 1);
} else if (_currentRootDomainSpilledItemCount >= 1000) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_1K_URLS, 1);
} else if (_currentRootDomainSpilledItemCount >= 100) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_100_URLS, 1);
} else if (_currentRootDomainSpilledItemCount >= 10) {
reporter.incrCounter(Counters.DOMAIN_WITH_GT_10_URLS, 1);
} else if (_currentRootDomainSpilledItemCount > 1) {
reporter.incrCounter(Counters.DOMAIN_WITH_LT_10_URLS, 1);
} else if (_currentRootDomainSpilledItemCount == 1) {
reporter.incrCounter(Counters.DOMAIN_WITH_1_URL, 1);
}
}
int crawlerIndex = (((Long)_currentRootDomainId).hashCode() & Integer.MAX_VALUE) % crawlerCount;
if (reporter != null) {
reporter.incrCounter("CRAWLER_", Long.toString(crawlerIndex), 1);
}
_currentRootDomainIdValid = false;
_currentRootDomainId = -1;
_currentSubDomainIdValid = false;
_genHomePageURLForSubDomain = true;
_currentSubDomainId = -1;
currentDomainCrawlIdx = -1;
_currentRootDomainSpilledItemCount = 0;
_currentRootDomainURLCount = 0;
}
/** potentially reset state based on domain id transition **/
private void rootDomainTransition(long newDomainFP,Reporter reporter) throws IOException {
if (_currentRootDomainIdValid) {
flushRootDomain(reporter);
}
_skipDomain = false;
// zero out item count ...
items.clear();
// reset domain id
_currentRootDomainId = newDomainFP;
_currentRootDomainIdValid = true;
currentDomainCrawlIdx = (((int) _currentRootDomainId & Integer.MAX_VALUE) % crawlerCount);
// reset current domain url count
_currentRootDomainURLCount = 0;
// and reset last bundle id
currentBundleId = 0;
// reset spill count for domain
_currentRootDomainSpilledItemCount = 0;
}
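/** only http and https urls are eligible for bundling **/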
Set<String> validSchemes = new ImmutableSet.Builder<String>()
.add("http")
.add("https")
.build();
//static Set<String> = ImmutableSet.Builder<String>
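/** build the homepage url (scheme://[user[:password]@]host/) for the host of the given url **/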
static String makeHomePageURLFromUrlObject(GoogleURL urlObject) {
String urlOut = urlObject.getScheme();
urlOut += ("://");
if (urlObject.getUserName() != GoogleURL.emptyString) {
urlOut += (urlObject.getUserName());
if (urlObject.getPassword() != GoogleURL.emptyString) {
urlOut += (":");
urlOut += (urlObject.getPassword());
}
urlOut += ("@");
}
String host = urlObject.getHost();
if (host.endsWith(".")) {
host = host.substring(0, host.length() - 1);
}
urlOut += (host);
urlOut += "/";
return urlOut;
}
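/**
 * fingerprint the url, skip it if it was already emitted (bloom filter), otherwise queue a
 * SegmentGeneratorItem carrying the rank and any etag / last-modified metadata from the source
 * json, spilling the queued items into a bundle once SPILL_THRESHOLD is reached
 **/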
void emitURLObject(GoogleURL urlObject,JsonObject originalJSON,float rank,Reporter reporter)throws IOException {
URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);
if (fp != null) {
if (emittedTuplesFilter.isPresent(fp)) {
reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1);
} else {
reporter.incrCounter(Counters.EMITTING_URL_OBJECT, 1);
emittedTuplesFilter.add(fp);
urlsInFilter++;
SegmentGeneratorItem itemValue = new SegmentGeneratorItem();
itemValue.setDomainFP(fp.getDomainHash());
itemValue.setRootDomainFP(fp.getRootDomainHash());
itemValue.setUrlFP(fp.getUrlHash());
itemValue.setUrl(urlObject.getCanonicalURL());
itemValue.setPageRank(rank);
itemValue.setModifiedStatus((byte) 0);
if (originalJSON != null) {
if (originalJSON.has(CrawlListGenCommon.CRAWLLIST_METADATA_ETAG)) {
itemValue.setEtag(originalJSON.get(CrawlListGenCommon.CRAWLLIST_METADATA_ETAG).getAsString());
}
if (originalJSON.has(CrawlListGenCommon.CRAWLLIST_METADATA_LAST_MODIFIED_TIME)){
itemValue.setLastModifiedTime(originalJSON.get(CrawlListGenCommon.CRAWLLIST_METADATA_LAST_MODIFIED_TIME).getAsLong());
}
}
items.add(itemValue);
if (items.size() >= SPILL_THRESHOLD)
spillItems(reporter);
}
} else {
reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1);
}
}
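/** validate and scheme-check the url; on the first url seen for a subdomain also synthesize and emit a high-rank homepage url, then emit the url itself **/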
void emitURL(String url,float rank, JsonObject originalJSON, Reporter reporter)throws IOException {
GoogleURL urlObject = new GoogleURL(url);
if (urlObject.isValid()) {
String scheme = urlObject.getScheme().toLowerCase();
if (!validSchemes.contains(scheme)) {
reporter.incrCounter(Counters.INVALID_SCHEME, 1);
}
else {
if (_genHomePageURLForSubDomain) {
reporter.incrCounter(Counters.GENERATING_HOME_PAGE_URL, 1);
_genHomePageURLForSubDomain = false;
// generate homepage url ...
String homePageURL = makeHomePageURLFromUrlObject(urlObject);
if (homePageURL != null) {
GoogleURL homePageURLObj = new GoogleURL(homePageURL);
if (homePageURLObj.isValid()) {
emitURLObject(homePageURLObj, null, 10000.00f, reporter);
}
}
}
emitURLObject(urlObject, originalJSON, rank, reporter);
}
}
else {
reporter.incrCounter(Counters.INVALID_URL_OBJECT, 1);
}
}
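/**
 * each reduce key identifies one list partition and the values are the part file paths to merge;
 * the merger walks records in CrawlListKey order, root-domain and subdomain transitions reset the
 * per-domain state, and each record's source url is extracted from its json payload and emitted
 **/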
@Override
public void reduce(IntWritable key,Iterator<Text> values,OutputCollector<SegmentGeneratorBundleKey, SegmentGeneratorItemBundle> output,Reporter reporter) throws IOException {
// set up merge attributes
Configuration localMergeConfig = new Configuration(_conf);
localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, CrawlListKey.CrawListKeyComparator.class,
RawComparator.class);
localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, CrawlListKey.class, WritableComparable.class);
// ingest input paths .
Pair<FileSystem,List<Path>> fileSystemPathTuple = buildInputPathList(localMergeConfig, values);
RawValueIterator rawValueIterator = new RawValueIterator();
JsonParser parser = new JsonParser();
_collector = output;
// startup merger ...
LOG.info("FileSystem is:" + fileSystemPathTuple.e0);
LOG.info("Merger Input Paths are:" + fileSystemPathTuple.e1);
MultiFileInputReader<CrawlListKey> multiFileInputReader = new MultiFileInputReader<CrawlListKey>(fileSystemPathTuple.e0, fileSystemPathTuple.e1, localMergeConfig);
try {
Pair<KeyAndValueData<CrawlListKey>,Iterable<RawRecordValue>> nextItem = null;
// walk tuples and feed them to the actual reducer ...
while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
reporter.incrCounter(Counters.GET_NEXT_RECORD_FROM_MERGER, 1);
// check the current domain id to see if need to do a domain transition
long newRootDomainId = nextItem.e0._keyObject.partitionDomainKey;
if (!_currentRootDomainIdValid || newRootDomainId != _currentRootDomainId) {
// domain transition detected ...
rootDomainTransition(newRootDomainId, reporter);
}
long newSubDomainId = nextItem.e0._keyObject.comparisonDomainKey;
// now check for subdomain transition ...
if (!_currentSubDomainIdValid || newSubDomainId != _currentSubDomainId) {
_currentSubDomainId = newSubDomainId;
_genHomePageURLForSubDomain = true;
}
// reset values iterator ...
rawValueIterator.reset(nextItem.e1);
while (rawValueIterator.hasNext()) {
reporter.incrCounter(Counters.GOT_RAW_RECORD_FROM_ITERATOR, 1);
//LOG.info("Got Record From Source:" + rawValueIterator.currentSource);
String json = rawValueIterator.next().toString();
JsonObject jsonObj = parser.parse(json).getAsJsonObject();
if (jsonObj.has(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY)) {
emitURL(
jsonObj.get(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY).getAsString(),
(float)rawValueIterator.currentKey().rank0,
jsonObj,
reporter);
}
else {
reporter.incrCounter(Counters.NO_SOURCE_URL_IN_JSON, 1);
}
}
reporter.progress();
}
// flush trailing domain
rootDomainTransition(Long.MAX_VALUE, reporter);
}
finally {
multiFileInputReader.close();
}
}
}
}