/**
 * Copyright 2012 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.mapred.pipelineV3.crawllistgen;

import java.io.IOException;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.segmenter.SegmentMover.CrawlSegmentFile;

/**
 * Pipeline step that moves freshly generated crawl segments into the live
 * crawl/crawl_segments tree, renumbering them under the next available
 * crawl id.
 *
 * @author rana
 */
public class MoveSegmentsStep extends CrawlPipelineStep {

  public static final String OUTPUT_DIR_NAME = "moverStub";

  private static final Log LOG = LogFactory.getLog(MoveSegmentsStep.class);

  public MoveSegmentsStep(CrawlPipelineTask task) {
    super(task, "Segment Mover", OUTPUT_DIR_NAME);
  }

  /**
   * Scan crawl/crawl_segments for numerically named directories and return
   * one past the highest segment id found (101 at a minimum).
   */
  public int findNextSegmentId() throws IOException {
    int maxSegmentId = 100;
    FileStatus candidates[] = getFileSystem().globStatus(new Path("crawl/crawl_segments/[0-9]*"));
    // globStatus can return null when the parent path does not exist yet
    if (candidates != null) {
      for (FileStatus candidate : candidates) {
        try {
          int segmentId = Integer.parseInt(candidate.getPath().getName());
          maxSegmentId = Math.max(maxSegmentId, segmentId);
        } catch (NumberFormatException e) {
          // skip directories whose names are not plain integers
        }
      }
    }
    return maxSegmentId + 1;
  }

  @Override
  public Log getLogger() {
    return LOG;
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {
    int crawlNumber = findNextSegmentId();

    Path latestPath = getOutputDirForStep(GenSegmentsStep.class);

    // collect per-crawler directories (ccc*) emitted by the segment generator
    FileStatus crawlers[] = getFileSystem().globStatus(new Path(latestPath, "ccc*"));

    for (FileStatus crawlerCandidate : crawlers) {
      String crawlerName = crawlerCandidate.getPath().getName();
      LOG.info("Found Crawler:" + crawlerName);

      // TreeSet orders the files via CrawlSegmentFile's natural ordering
      TreeSet<CrawlSegmentFile> fileSet = new TreeSet<CrawlSegmentFile>();

      FileStatus partitions[] = getFileSystem().globStatus(new Path(crawlerCandidate.getPath(), "*"));
      LOG.info("Doing Segment Scan");
      for (FileStatus partition : partitions) {
        FileStatus segments[] = getFileSystem().globStatus(new Path(partition.getPath(), "*"));
        for (FileStatus segment : segments) {
          CrawlSegmentFile fileCandidate = new CrawlSegmentFile(segment.getPath(),
              Long.parseLong(partition.getPath().getName()),
              Long.parseLong(segment.getPath().getName()));
          fileSet.add(fileCandidate);
        }
      }

      // renumber the sorted segments sequentially and move them into place
      int segmentId = 0;
      for (CrawlSegmentFile file : fileSet) {
        Path sourcePath = file.location;
        Path destPath = new Path("crawl/crawl_segments/" + crawlNumber + "/" + (segmentId++) + "/" + crawlerName);
        LOG.info("Moving:" + sourcePath + " to:" + destPath);
        getFileSystem().mkdirs(destPath.getParent());
        getFileSystem().rename(sourcePath, destPath);
      }
    }
    // fake out the output path so the pipeline records this step as complete
    getFileSystem().mkdirs(outputPathLocation);
  }
}
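
/*
 * A minimal, hypothetical sketch (not part of the original step) of the
 * "next numeric directory id" scan that findNextSegmentId() performs, run
 * standalone against the local filesystem. The class name and the /tmp
 * path are illustrative assumptions, not part of the CommonCrawl codebase;
 * fully qualified names are used so no extra imports are needed above.
 */
class NextSegmentIdSketch {
  public static void main(String[] args) throws Exception {
    org.apache.hadoop.fs.FileSystem fs =
        org.apache.hadoop.fs.FileSystem.getLocal(new org.apache.hadoop.conf.Configuration());
    int maxSegmentId = 100; // same floor MoveSegmentsStep uses
    // match only numerically named child directories, as the step does
    FileStatus[] candidates = fs.globStatus(new Path("/tmp/crawl/crawl_segments/[0-9]*"));
    if (candidates != null) {
      for (FileStatus candidate : candidates) {
        try {
          maxSegmentId = Math.max(maxSegmentId, Integer.parseInt(candidate.getPath().getName()));
        } catch (NumberFormatException e) {
          // ignore non-numeric directory names
        }
      }
    }
    System.out.println("next segment id: " + (maxSegmentId + 1));
  }
}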