/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.service.pagerank; import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.EOFException; import java.io.IOException; import java.text.NumberFormat; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.VIntWritable; import org.apache.hadoop.io.WritableUtils; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.protocol.URLFP; import org.commoncrawl.util.CCStringUtils; public class PageRankValueReWriter { private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); static { NUMBER_FORMAT.setMinimumIntegerDigits(5); NUMBER_FORMAT.setGroupingUsed(false); } public static final Log LOG = LogFactory.getLog(PageRankValueReWriter.class); public static void main(String[] args) { int nodeIndex = Integer.parseInt(args[0]); LOG.info("Node Index:" + args[0]); int nodeCount = Integer.parseInt(args[1]); LOG.info("Node Count:" + args[1]); String idsDirectory = args[2]; LOG.info("ID Directory is:" + args[2]); String valuesDirectory = args[3]; LOG.info("Values Directory is:" + args[3]); int iterationNumber = Integer.parseInt(args[4]); LOG.info("Iteration Number is:" + args[4]); int runDate = Integer.parseInt(args[5]); LOG.info("runDate is:" + args[5]); Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("hadoop-default.xml"); conf.addResource("hadoop-site.xml"); conf.addResource("commoncrawl-default.xml"); conf.addResource("commoncrawl-site.xml"); CrawlEnvironment.setHadoopConfig(conf); CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn01:9000/"); try { FileSystem fileSystem = CrawlEnvironment.getDefaultFileSystem(); Path outputPath = new Path("crawl/pageRank/out",Integer.toString(runDate)); LOG.info("Output Directory is:"+ outputPath); fileSystem.mkdirs(outputPath); //iterate values based on node id for (int i=nodeIndex;i<nodeIndex+1;++i) { LOG.info("Processing output for Node:" + i); Path valuePath = new Path(valuesDirectory,"value_"+ NUMBER_FORMAT.format(iterationNumber) + "-" + NUMBER_FORMAT.format(i)); LOG.info("Value File Path is:" + valuePath); Path idsPath = new Path(idsDirectory,"ids_"+ NUMBER_FORMAT.format(i)); LOG.info("IDs File Path is:" + idsPath); Path outputFile = new Path(outputPath,"part-" + NUMBER_FORMAT.format(i)); LOG.info("Output File Path is:" + outputFile); byte[] valueData = null; { FileStatus valueFileStatus = fileSystem.getFileStatus(valuePath); FSDataInputStream valueInputStream = fileSystem.open(valuePath); LOG.info("Allocating Value Array of Size:" + valueFileStatus.getLen()); valueData = new byte[(int) valueFileStatus.getLen()]; LOG.info("Reading Value Data Size:" + valueFileStatus.getLen()); for (int offset=0,totalRead=0;offset<valueFileStatus.getLen();) { int bytesToRead = Math.min(16384,(int)valueFileStatus.getLen() - totalRead); valueInputStream.read(valueData,offset,bytesToRead); offset+= bytesToRead; totalRead += bytesToRead; } valueInputStream.close(); LOG.info("Finished Reading Value Data Size:" + valueFileStatus.getLen()); } byte[] idData = null; { FileStatus idFileStatus = fileSystem.getFileStatus(idsPath); FSDataInputStream idInputStream = fileSystem.open(idsPath); LOG.info("Allocating ID Array of Size:" + idFileStatus.getLen()); idData = new byte[(int) idFileStatus.getLen()]; LOG.info("Reading ID Array Data Size:" + idFileStatus.getLen()); for (int offset=0,totalRead=0;offset<idFileStatus.getLen();) { int bytesToRead = Math.min(16384,(int)idFileStatus.getLen() - totalRead); idInputStream.read(idData,offset,bytesToRead); offset+= bytesToRead; totalRead += bytesToRead; } idInputStream.close(); LOG.info("Finished Reading ID Array Data Size:" + idFileStatus.getLen()); } DataInputStream idInputStream = new DataInputStream(new ByteArrayInputStream(idData)); DataInputStream valueInputStream = new DataInputStream(new ByteArrayInputStream(valueData)); SequenceFile.Writer output = SequenceFile.createWriter(fileSystem,conf,outputFile,URLFP.class,VIntWritable.class); LOG.info("Opened Output Stream"); URLFP currentFP = new URLFP(); boolean eof = false; int itemCount = 0; while (!eof) { try { long timeStart = System.currentTimeMillis(); currentFP.readFields(idInputStream); long timeEnd = System.currentTimeMillis(); // LOG.info("ReadFields Took:" + (timeEnd - timeStart)); ++itemCount; } catch (EOFException e) { LOG.info("EOF reached. Total Item Count:" + itemCount); eof = true; } if (!eof) { long valueFingerprint = WritableUtils.readVLong(valueInputStream); int prValue = valueInputStream.readInt(); if (valueFingerprint != currentFP.getUrlHash()) { throw new IOException("Fingerprint Mismatch! Expected:" + currentFP.getUrlHash() + " Found:" + valueFingerprint + " ItemCount:" + itemCount); } output.append(currentFP, new VIntWritable(prValue)); if (itemCount % 10000 == 0) { LOG.info("Processed " + itemCount + " Values. Last Sampled FP:" + valueFingerprint + " With PR:" + prValue); } currentFP = new URLFP(); } } LOG.info("Done outputing pagerank for Node:" + i + " ItemCount:" + itemCount); valueInputStream.close(); idInputStream.close(); output.close(); } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } }