/**
 * Copyright 2012 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.mapred.pipelineV3.crawllistgen;

import java.io.IOException;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.segmenter.SegmentMover.CrawlSegmentFile;

/**
 * Pipeline step that moves freshly generated crawl segments into the live
 * crawl/crawl_segments tree, renumbering them under the next available
 * crawl id.
 *
 * @author rana
 */
public class MoveSegmentsStep extends CrawlPipelineStep {

  public static final String OUTPUT_DIR_NAME = "moverStub";

  private static final Log LOG = LogFactory.getLog(MoveSegmentsStep.class);

  public MoveSegmentsStep(CrawlPipelineTask task) {
    super(task, "Segment Mover", OUTPUT_DIR_NAME);
  }

  /**
   * Scan crawl/crawl_segments for numerically named directories and return
   * one past the highest segment id found (101 at a minimum).
   */
  public int findNextSegmentId() throws IOException {
    int maxSegmentId = 100;
    FileStatus candidates[] = getFileSystem().globStatus(new Path("crawl/crawl_segments/[0-9]*"));
    // globStatus can return null when the parent path does not exist yet
    if (candidates != null) {
      for (FileStatus candidate : candidates) {
        try {
          int segmentId = Integer.parseInt(candidate.getPath().getName());
          maxSegmentId = Math.max(maxSegmentId, segmentId);
        } catch (NumberFormatException e) {
          // skip directories whose names are not plain integers
        }
      }
    }
    return maxSegmentId + 1;
  }

  @Override
  public Log getLogger() {
    return LOG;
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {
    int crawlNumber = findNextSegmentId();

    Path latestPath = getOutputDirForStep(GenSegmentsStep.class);

    // collect per-crawler directories (ccc*) emitted by the segment generator
    FileStatus crawlers[] = getFileSystem().globStatus(new Path(latestPath, "ccc*"));

    for (FileStatus crawlerCandidate : crawlers) {
      String crawlerName = crawlerCandidate.getPath().getName();
      LOG.info("Found Crawler:" + crawlerName);

      // TreeSet orders the files via CrawlSegmentFile's natural ordering
      TreeSet<CrawlSegmentFile> fileSet = new TreeSet<CrawlSegmentFile>();

      FileStatus partitions[] = getFileSystem().globStatus(new Path(crawlerCandidate.getPath(), "*"));
      LOG.info("Doing Segment Scan");
      for (FileStatus partition : partitions) {
        FileStatus segments[] = getFileSystem().globStatus(new Path(partition.getPath(), "*"));
        for (FileStatus segment : segments) {
          CrawlSegmentFile fileCandidate = new CrawlSegmentFile(segment.getPath(),
              Long.parseLong(partition.getPath().getName()),
              Long.parseLong(segment.getPath().getName()));
          fileSet.add(fileCandidate);
        }
      }

      // renumber the sorted segments sequentially and move them into place
      int segmentId = 0;
      for (CrawlSegmentFile file : fileSet) {
        Path sourcePath = file.location;
        Path destPath = new Path("crawl/crawl_segments/" + crawlNumber + "/" + (segmentId++) + "/" + crawlerName);
        LOG.info("Moving:" + sourcePath + " to:" + destPath);
        getFileSystem().mkdirs(destPath.getParent());
        getFileSystem().rename(sourcePath, destPath);
      }
    }
    // fake out the output path so the pipeline records this step as complete
    getFileSystem().mkdirs(outputPathLocation);
  }
}
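
/*
 * A minimal, hypothetical sketch (not part of the original step) of the
 * "next numeric directory id" scan that findNextSegmentId() performs, run
 * standalone against the local filesystem. The class name and the /tmp
 * path are illustrative assumptions, not part of the CommonCrawl codebase;
 * fully qualified names are used so no extra imports are needed above.
 */
class NextSegmentIdSketch {
  public static void main(String[] args) throws Exception {
    org.apache.hadoop.fs.FileSystem fs =
        org.apache.hadoop.fs.FileSystem.getLocal(new org.apache.hadoop.conf.Configuration());
    int maxSegmentId = 100; // same floor MoveSegmentsStep uses
    // match only numerically named child directories, as the step does
    FileStatus[] candidates = fs.globStatus(new Path("/tmp/crawl/crawl_segments/[0-9]*"));
    if (candidates != null) {
      for (FileStatus candidate : candidates) {
        try {
          maxSegmentId = Math.max(maxSegmentId, Integer.parseInt(candidate.getPath().getName()));
        } catch (NumberFormatException e) {
          // ignore non-numeric directory names
        }
      }
    }
    System.out.println("next segment id: " + (maxSegmentId + 1));
  }
}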