/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.SequenceFile.ValueBytes;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.commoncrawl.protocol.CrawlURL;

/**
 * Splits oversized CrawlLog sequence files into multiple smaller files,
 * copying raw records so no deserialization is required, then moves the
 * splits into place and archives the original.
 *
 * @author rana
 */
public class CrawlLogSplitter {

  public static final Log LOG = LogFactory.getLog(CrawlLogSplitter.class);

  // matches CrawlLog file names of the form CrawlLog_cccNN-NN_<timestamp>
  final static Pattern crawlLogRegExp = Pattern.compile("CrawlLog_ccc[0-9]{2}-[0-9]{2}_([0-9]*)");

  // candidates ordered by the trailing numeric (timestamp) component of the name
  final static TreeSet<Path> candidateList = new TreeSet<Path>(new Comparator<Path>() {

    @Override
    public int compare(Path p1, Path p2) {
      Matcher m1 = crawlLogRegExp.matcher(p1.getName());
      Matcher m2 = crawlLogRegExp.matcher(p2.getName());
      // candidate names are assumed to match the pattern; group() would
      // throw IllegalStateException for a non-matching name
      m1.matches();
      m2.matches();
      Long v1 = Long.parseLong(m1.group(1));
      Long v2 = Long.parseLong(m2.group(1));
      return v1.compareTo(v2);
    }
  });

  static Pattern crawlLogRegExp2 = Pattern.compile("CrawlLog_ccc([0-9]{2})-([0-9]{2})_([0-9]*)");

  /** derive the name of split file (index + 1) from the source file's name */
  static Path buildIncrementalPathGivenPathAndIndex(Path tempDir, String baseName, int index) throws IOException {
    Matcher m = crawlLogRegExp2.matcher(baseName);
    if (m.matches()) {
      return new Path(tempDir, "CrawlLog_ccc" + m.group(1) + "-" + m.group(2) + "_"
          + (Long.parseLong(m.group(3)) + (index + 1)));
    }
    throw new IOException("Invalid Base Name:" + baseName);
  }

  // files larger than 5GB get split; each split targets half that size
  static final long SPLIT_SIZE = 5368709120L;
  static final long IDEAL_SIZE = SPLIT_SIZE / 2;

  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // collect all crawl logs that exceed the split threshold
    FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
    for (FileStatus candidate : arcFiles) {
      if (candidate.getLen() > SPLIT_SIZE) {
        candidateList.add(candidate.getPath());
      }
    }

    LOG.info("Found:" + candidateList.size() + " oversized candidates");

    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

    while (candidateList.size() != 0) {
      Path candidateName = candidateList.first();
      candidateList.remove(candidateName);

      LOG.info("Processing Candidate:" + candidateName);

      long fileSize = fs.getFileStatus(candidateName).getLen();

      // get crawl log filename components
      ArrayList<Path> splitItems = new ArrayList<Path>();
      int index = 0;
      Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);
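
      // remove any stale copy of the first split (e.g. from a prior failed
      // run), then stream the source file, rolling to a new split whenever
      // the writer crosses IDEAL_SIZE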
LOG.info("Initial Output Path is:"+ outputPart); fs.delete(outputPart,false); // create reader SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf); ValueBytes sourceVB = reader.createValueBytes(); DataOutputBuffer sourceKeyData = new DataOutputBuffer(); try { // ok create temp file SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf,outputPart, Text.class, CrawlURL.class,CompressionType.BLOCK,new SnappyCodec()); // add to split items array splitItems.add(outputPart); try { long recordsWritten = 0; while (reader.nextRawKey(sourceKeyData) != -1) { reader.nextRawValue(sourceVB); long lengthPreWrite = activeWriter.getLength(); activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB); if (++recordsWritten % 10000 == 0) { LOG.info("Write 10000 records"); } long lengthPostWrite = activeWriter.getLength(); if (lengthPostWrite != lengthPreWrite) { if (lengthPostWrite >= IDEAL_SIZE) { LOG.info("Hit Split Point. Flushing File:" + outputPart); activeWriter.close(); outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir,candidateName.getName(),++index); LOG.info("Creating New File:" + outputPart); activeWriter = SequenceFile.createWriter(fs, conf,outputPart, Text.class, CrawlURL.class,CompressionType.BLOCK,new SnappyCodec()); splitItems.add(outputPart); } } sourceKeyData.reset(); } } finally { activeWriter.close(); } } finally { reader.close(); } LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files"); for (Path splitItem : splitItems) { Path destPath = new Path("crawl/checkpoint_data",splitItem.getName()); LOG.info("Moving:" + splitItem + " to:" + destPath); fs.rename(splitItem,destPath); } Path sourceMoveLocation = new Path("crawl/checkpoint_data_split",candidateName.getName()); LOG.info("Moving SOURCE:" + candidateName + " to:"+ sourceMoveLocation); fs.rename(candidateName,sourceMoveLocation); } } }