package org.commoncrawl.mapred.segmenter;
import java.io.IOException;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
public class SegmentMover {
private static final Log LOG = LogFactory.getLog(SegmentMover.class);
/**
 * Scans the immediate children of rootPath for entries whose names are numeric
 * timestamps and returns the largest one.
 *
 * @param rootPath directory whose children are timestamp-named generations
 * @return the largest numeric child name found, or -1L if there are none
 * @throws IOException on filesystem access failure
 */
private static long findLatestDatabaseTimestamp(Path rootPath) throws IOException {
  FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
  FileStatus candidates[] = fs.globStatus(new Path(rootPath, "*"));
  long candidateTimestamp = -1L;
  // globStatus returns null (not an empty array) when nothing matches.
  if (candidates != null) {
    for (FileStatus candidate : candidates) {
      LOG.info("Found Seed Candidate:" + candidate.getPath());
      long timestamp;
      try {
        timestamp = Long.parseLong(candidate.getPath().getName());
      } catch (NumberFormatException e) {
        // Skip entries whose names are not timestamps (e.g. temp files)
        // instead of aborting the whole scan with an unchecked exception.
        LOG.warn("Skipping non-numeric candidate:" + candidate.getPath());
        continue;
      }
      if (candidateTimestamp == -1L || timestamp > candidateTimestamp) {
        candidateTimestamp = timestamp;
      }
    }
  }
  LOG.info("Selected Candidate is:" + candidateTimestamp);
  return candidateTimestamp;
}
/**
 * Identifies one crawl segment file by its partition and segment ids.
 * Natural ordering is by segment id, then partition id, so a sorted set
 * enumerates files in segment-generation order.
 *
 * NOTE(review): ordering is not consistent with equals (equals/hashCode are
 * not overridden) — acceptable here because TreeSet uses compareTo only.
 */
public static class CrawlSegmentFile implements Comparable<CrawlSegmentFile> {

  public CrawlSegmentFile(Path location, long partitionId, long segmentId) {
    this.partitionId = partitionId;
    this.segmentId = segmentId;
    this.location = location;
  }

  // Physical location of the segment file in HDFS.
  public Path location = null;
  // Partition (directory) the segment was generated under.
  public long partitionId = -1;
  // Segment id within the partition.
  public long segmentId = -1;

  @Override
  public int compareTo(CrawlSegmentFile o) {
    // Long.compare avoids the per-call boxing of ((Long)x).compareTo(y).
    int result = Long.compare(segmentId, o.segmentId);
    if (result == 0) {
      result = Long.compare(partitionId, o.partitionId);
    }
    return result;
  }
}
/**
 * Entry point: locates the newest crawl-list generation under
 * crawl/reports/crawllistgen/segments/, then renumbers each crawler's
 * segment files (in (segmentId, partitionId) order) and moves them into
 * crawl/crawl_segments/&lt;crawlNumber&gt;/&lt;newId&gt;/&lt;crawlerName&gt;.
 *
 * Usage: SegmentMover &lt;crawlNumber&gt;
 *
 * @param args args[0] is the crawl number used in the destination path
 * @throws IOException on filesystem access failure
 */
public static void main(String[] args) throws IOException {
  // Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
  if (args.length < 1) {
    System.err.println("Usage: SegmentMover <crawlNumber>");
    System.exit(1);
  }
  Configuration conf = new Configuration();
  int crawlNumber = Integer.parseInt(args[0]);
  CrawlEnvironment.setHadoopConfig(conf);
  FileSystem fs = FileSystem.get(conf);
  Path basePath = new Path("crawl/reports/crawllistgen/segments/");
  long databaseId = findLatestDatabaseTimestamp(basePath);
  if (databaseId == -1) {
    LOG.error("No database candidates found under:" + basePath);
    return;
  }
  LOG.info("Found Database Id:" + databaseId);
  Path latestPath = new Path(basePath, Long.toString(databaseId));
  // Each crawler's output lives under a "ccc*"-named directory.
  // globStatus returns null (not an empty array) when nothing matches.
  FileStatus crawlers[] = fs.globStatus(new Path(latestPath, "ccc*"));
  if (crawlers == null) {
    LOG.warn("No crawler directories found under:" + latestPath);
    return;
  }
  for (FileStatus crawlerCandidate : crawlers) {
    String crawlerName = crawlerCandidate.getPath().getName();
    LOG.info("Found Crawler:" + crawlerName);
    // TreeSet orders files via CrawlSegmentFile.compareTo, so destination
    // ids are assigned in a stable, deterministic order.
    TreeSet<CrawlSegmentFile> fileSet = new TreeSet<CrawlSegmentFile>();
    FileStatus partitions[] = fs.globStatus(new Path(crawlerCandidate.getPath(), "*"));
    LOG.info("Doing Segment Scan");
    if (partitions != null) {
      for (FileStatus partition : partitions) {
        FileStatus segments[] = fs.globStatus(new Path(partition.getPath(), "*"));
        if (segments == null) {
          continue;
        }
        for (FileStatus segment : segments) {
          CrawlSegmentFile fileCandidate
            = new CrawlSegmentFile(
                segment.getPath(),
                Long.parseLong(partition.getPath().getName()),
                Long.parseLong(segment.getPath().getName()));
          fileSet.add(fileCandidate);
        }
      }
    }
    int segmentId = 0;
    for (CrawlSegmentFile file : fileSet) {
      Path sourcePath = file.location;
      Path destPath
        = new Path(
            "crawl/crawl_segments/"
            + crawlNumber
            + "/"
            + (segmentId++)
            + "/"
            + crawlerName);
      LOG.info("Moving:" + sourcePath + " to:" + destPath);
      fs.mkdirs(destPath.getParent());
      // FileSystem.rename reports failure via its return value, not an
      // exception — ignoring it would silently drop segments.
      if (!fs.rename(sourcePath, destPath)) {
        LOG.error("Failed to move:" + sourcePath + " to:" + destPath);
      }
    }
  }
}
}