/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.service.pagerank.slave; import java.io.DataInput; import java.io.DataInputStream; import java.io.DataOutput; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.RandomAccessFile; import java.net.InetSocketAddress; import java.nio.ByteBuffer; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.LinkedList; import java.util.Map; import java.util.TreeMap; import java.util.Vector; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.Semaphore; import java.util.zip.CRC32; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.hadoop.io.SequenceFile.Metadata; import org.apache.hadoop.io.compress.CompressionOutputStream; import org.apache.hadoop.io.compress.DefaultCodec; import org.commoncrawl.async.CallbackWithResult; import org.commoncrawl.async.EventLoop; import org.commoncrawl.async.Timer; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.protocol.CompressedOutlinkList; import org.commoncrawl.protocol.URLFPV2; import org.commoncrawl.rpc.base.internal.AsyncClientChannel; import org.commoncrawl.rpc.base.internal.AsyncContext; import org.commoncrawl.rpc.base.internal.AsyncRequest; import org.commoncrawl.rpc.base.internal.AsyncServerChannel; import org.commoncrawl.rpc.base.internal.NullMessage; import org.commoncrawl.rpc.base.internal.Server; import org.commoncrawl.rpc.base.internal.AsyncRequest.Callback; import org.commoncrawl.rpc.base.internal.AsyncRequest.Status; import org.commoncrawl.rpc.base.shared.RPCException; import org.commoncrawl.rpc.base.shared.RPCStruct; import org.commoncrawl.service.crawler.filters.SuperDomainFilter; import org.commoncrawl.service.crawler.filters.Filter.FilterResult; import org.commoncrawl.service.pagerank.BaseConfig; import org.commoncrawl.service.pagerank.BeginPageRankInfo; import org.commoncrawl.service.pagerank.BlockTransfer; import 
org.commoncrawl.service.pagerank.BlockTransferAck; import org.commoncrawl.service.pagerank.CheckpointInfo; import org.commoncrawl.service.pagerank.Constants; import org.commoncrawl.service.pagerank.FileInfo; import org.commoncrawl.service.pagerank.IterationInfo; import org.commoncrawl.service.pagerank.PRRangeItem; import org.commoncrawl.service.pagerank.PageRankSlave; import org.commoncrawl.service.pagerank.SlaveStatus; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.FileUtils; import org.commoncrawl.util.FlexBuffer; import org.commoncrawl.util.JVMStats; import org.commoncrawl.util.URLUtils; import org.junit.Test; import com.google.common.collect.ImmutableList; import com.hadoop.compression.lzo.LzoCodec; public class PageRankUtils { // TODO:HACK public static final int VALUES_PER_RANGE = 10; public static final Log LOG = LogFactory.getLog(PageRankUtils.class); public static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); static { NUMBER_FORMAT.setMinimumIntegerDigits(5); NUMBER_FORMAT.setGroupingUsed(false); } private static String outlinkValuesFilePrefix = "OutlinkPR"; public static Path getCheckpointFilePath(Path jobPath,int iterationPhase,int iterationNumnber,int nodeIndex) { String fileName = IterationInfo.Phase.toString(iterationPhase) + "-CheckpointComplete-"+ NUMBER_FORMAT.format(iterationNumnber) + "-" + NUMBER_FORMAT.format(nodeIndex); return new Path(jobPath,fileName); } public static String makeUniqueFileName(String fileNamePrefix,int iterationNumber,int nodeIndex) { if (iterationNumber == 0) { return fileNamePrefix + NUMBER_FORMAT.format(nodeIndex); } else { return fileNamePrefix + NUMBER_FORMAT.format(iterationNumber) + "-" + NUMBER_FORMAT.format(nodeIndex); } } public static File makeIdsFilePath(File basePath, int nodeIndex) { return new File(basePath,PageRankUtils.makeUniqueFileName(Constants.PR_IDS_FILE_PREFIX,0,nodeIndex)); } public static Path makeRangeFilePath(File basePath, int nodeIndex) { return new Path(basePath.getAbsolutePath(),PageRankUtils.makeUniqueFileName(Constants.PR_RANGE_FILE_PREFIX,0,nodeIndex)); } public static String getOutlinksBaseName(int myNodeIdx,int iterationNumber) { return outlinkValuesFilePrefix + "-" + NUMBER_FORMAT.format(iterationNumber) + "-" + NUMBER_FORMAT.format(myNodeIdx); } private static int readVIntFromByteBuffer(ByteBuffer source) { return (int) readVLongFromByteBuffer(source); } private static long readVLongFromByteBuffer(ByteBuffer source) { byte firstByte = source.get(); int len = WritableUtils.decodeVIntSize(firstByte); if (len == 1) { return firstByte; } long i = 0; for (int idx = 0; idx < len-1; idx++) { byte b = source.get(); i = i << 8; i = i | (b & 0xFF); } return (WritableUtils.isNegativeVInt(firstByte) ? 
(i ^ -1L) : i); } public static final class PRValueMap { private static final int RANGE_ITEM_SIZE = 20; private static final int RANGE_FP_OFFSET = 8; private static final int RANGE_POS_OFFSET = 16; private File rangeFilePath = null; private ByteBuffer valueFileBuffer = null; private ByteBuffer rangeFileBuffer = null; private int rangeItemCount = 0; public PRValueMap() { } public void open(FileSystem fs,Path valueFilePath,Path rangeFilePath)throws IOException { LOG.info("OPENING PRValueMap - Available Memory:" + Runtime.getRuntime().freeMemory() + " TotalMemory:" + Runtime.getRuntime().totalMemory()) ; FileStatus valueFileStatus = fs.getFileStatus(valueFilePath); FileStatus rangeFileStatus = fs.getFileStatus(rangeFilePath); if (valueFileStatus == null) { LOG.error("Value File at Path:" + valueFilePath + " not Found!"); throw new FileNotFoundException(); } if (rangeFileStatus == null) { LOG.error("Range File at Path:" + rangeFilePath + " not Found!"); throw new FileNotFoundException(); } FSDataInputStream valueFile = null; FSDataInputStream rangeFile = null; try { LOG.info("Create R/W Random Access File for values Path:" + valueFilePath); valueFile = fs.open(valueFilePath); LOG.info("Create R-ONLY Random Access File for range Path:" + rangeFilePath); rangeFile = fs.open(rangeFilePath); LOG.info("Allocating R/W Buffer of Size:" + valueFileStatus.getLen() + " for Value File" + " Available Memory:" + Runtime.getRuntime().freeMemory()); JVMStats.dumpMemoryStats(); byte [] valueMapData = new byte[(int)valueFileStatus.getLen()]; //this.valueFileBuffer = ByteBuffer.allocate((int) valueFileStatus.getLen() ); this.valueFileBuffer = ByteBuffer.wrap(valueMapData); LOG.info("Loading R/W Buffer From Value File"); long loadStart = System.currentTimeMillis(); for (int offset=0,totalRead=0;offset<valueFileBuffer.capacity();) { int bytesToRead = Math.min(16384,valueFileBuffer.capacity() - totalRead); valueFile.read(valueFileBuffer.array(),offset,bytesToRead); offset+= bytesToRead; totalRead += bytesToRead; } LOG.info("Load of Value File Buffer Took:" + (System.currentTimeMillis() - loadStart) + " MS"); LOG.info("Mapping R-ONLY Buffer of Size:" + rangeFileStatus.getLen() + " for Range File"); this.rangeFileBuffer = ByteBuffer.allocate((int) rangeFileStatus.getLen() ); LOG.info("Loading RangeFile Buffer From Range File"); loadStart = System.currentTimeMillis(); for (int offset=0,totalRead=0;offset<rangeFileBuffer.capacity();) { int bytesToRead = Math.min(16384,rangeFileBuffer.capacity() - totalRead); rangeFile.read(rangeFileBuffer.array(),offset,bytesToRead); offset+= bytesToRead; totalRead += bytesToRead; } LOG.info("Load of Range File Buffer Took:" + (System.currentTimeMillis() - loadStart) + " MS"); // calculate range item count rangeItemCount = (int)rangeFileStatus.getLen() / RANGE_ITEM_SIZE; } finally { if (valueFile != null) valueFile.close(); if (rangeFile != null) rangeFile.close(); } } void flush(OutputStream stream) throws IOException { if (valueFileBuffer != null) { LOG.info("Flushing valueBuffer"); LOG.info("Accessing underlying ByteArray"); valueFileBuffer.position(0); byte array[] = valueFileBuffer.array(); long timeStart = System.currentTimeMillis(); stream.write(array); long timeEnd = System.currentTimeMillis(); LOG.info("ValueBuffer Flush took:" + (timeEnd-timeStart) + " Milliseconds - valueBufferSize:" + valueFileBuffer.limit()); } } void close() throws IOException { LOG.info("CLOSING PRValueMap"); valueFileBuffer = null; rangeFileBuffer = null; } enum GetSetOPType { GET, SET, ADD } public 
final float getPRValue(URLFPV2 urlItem) throws IOException { return getSetPRValue(urlItem, GetSetOPType.GET, 0.0f); } public final void setPRValue(URLFPV2 urlItem, float value) throws IOException { getSetPRValue(urlItem, GetSetOPType.SET, value); } public final void addPRValue(URLFPV2 urlItem, float value) throws IOException { getSetPRValue(urlItem, GetSetOPType.ADD, value); } public void zeroValues()throws IOException { valueFileBuffer.position(0); while (valueFileBuffer.position() < valueFileBuffer.limit()) { valueFileBuffer.getLong(); //TODO: SWITCH TO INT FOR TEST // valueFileBuffer.putShort((short)0); valueFileBuffer.putFloat(0.0f); } } // TODO: SWITCH TO INT FOR TEST // static Map<Long,Short> debugMap = new TreeMap<Long,Short>(); static Map<Long,Float> debugMap = new TreeMap<Long,Float>(); public void finalizePageRank()throws IOException { valueFileBuffer.position(0); int itemCount = 0; while (valueFileBuffer.position() < valueFileBuffer.limit()) { long fingerprint = valueFileBuffer.getLong(); valueFileBuffer.mark(); //TODO: SWITCH TO INT FOR TEST // int accumulatedRank = valueFileBuffer.getShort(); float accumulatedRank = valueFileBuffer.getFloat(); // TODO: hack use default pr formula for now ... float finalRank = (.150f + (.85f * (float)accumulatedRank)); valueFileBuffer.reset(); valueFileBuffer.putFloat(finalRank); } } final float getSetPRValue(URLFPV2 urlItem,GetSetOPType opType,float valueIn) throws IOException{ //long timeStart = System.currentTimeMillis(); int rangeIdx = findRangePosition(urlItem); //long timeEnd = System.currentTimeMillis(); if (rangeIdx == -1) { throw new IOException("Unable to locate PR Value for domain:" + urlItem.getDomainHash() + " fingerprint:" + urlItem.getUrlHash()); } //DBG if (1 == 0) { URLFPV2 rangeFP = new URLFPV2(); populateFPForRange(rangeFileBuffer,rangeFP, rangeIdx); //LOG.info("Range for Domain:" + urlItem.getDomainHash() + " FP:" + urlItem.getUrlHash() + " is Domain:" + rangeFP.getDomainHash() + " FP:" + rangeFP.getUrlHash() ); } //get the search start positon via the range int rangeOffset = rangeFileBuffer.getInt(rangeIdx*RANGE_ITEM_SIZE + RANGE_POS_OFFSET); // now start walking items in range ... //LOG.info("RangeOffset for domain:" + urlItem.getDomainHash() + " fingerprint:" + urlItem.getUrlHash() + " is:" + rangeOffset); // seek to range offset ... valueFileBuffer.position(rangeOffset); //timeStart = System.currentTimeMillis(); // walk up to max number of items in range ... for (int itemIdx=0;itemIdx<VALUES_PER_RANGE;++itemIdx) { // read the urlf fp ... long urlFPValue = valueFileBuffer.getLong(); if (urlItem.getUrlHash() == urlFPValue) { //timeEnd = System.currentTimeMillis(); ///LOG.info("Scan took:" + (timeEnd-timeStart)); if (opType == GetSetOPType.SET) { valueFileBuffer.putFloat(valueIn); return 0; } else if (opType == GetSetOPType.GET) { return valueFileBuffer.getFloat(); } else { // ADD valueFileBuffer.mark(); float value = valueFileBuffer.getFloat(); valueFileBuffer.reset(); valueFileBuffer.putFloat((Math.min(value + valueIn,Float.MAX_VALUE))); return 0; } } // otherwise skip the value ... else { valueFileBuffer.getFloat(); } // if we reached trailing end of buffer ... we are done if(valueFileBuffer.remaining() == 0) { throw new IOException("Reached end of Value Buffer Looking for Value"); } } //this is bad news... 
dump context info for debug purposes before throwing exception LOG.error("Reached End of Range looking for PRValue for FP:"+ urlItem.getUrlHash()); URLFPV2 rangeFPDBG = new URLFPV2(); populateFPForRange(rangeFileBuffer,rangeFPDBG, rangeIdx); LOG.error("Closest Range Was Index:" + rangeIdx + " DomainHash:" + rangeFPDBG.getDomainHash() + " URLHash:" + rangeFPDBG.getUrlHash()); if (rangeIdx + 1 < this.rangeItemCount) { populateFPForRange(rangeFileBuffer,rangeFPDBG, rangeIdx + 1); LOG.error("Range At Index:" + (rangeIdx + 1) + " DomainHash:" + rangeFPDBG.getDomainHash() + " URLHash:" + rangeFPDBG.getUrlHash()); } LOG.error("Dumping Next 600 bytes at offset:" + rangeOffset); /* // re-seek to range offset ... valueFileBuffer.position(rangeOffset); LOG.error("\n" + dumpAsHex(valueFileBuffer, Math.min(600,valueFileBuffer.remaining()))); */ LOG.error("Dumping Values:"); // re-seek to range offset ... valueFileBuffer.position(rangeOffset); // walk up to max number of items in range ... for (int itemIdx=0;itemIdx<VALUES_PER_RANGE && valueFileBuffer.remaining() != 0;++itemIdx) { // read the urlf fp ... long urlFPValue = valueFileBuffer.getLong(); // and the value float value = valueFileBuffer.getFloat(); LOG.error("Item:" + itemIdx +" FP:" + urlFPValue + " Value:" + value); } LOG.error("Dump Complete"); throw new IOException("Reached the End of Range looking for designated PRValue"); } private static final int HEX_CHARS_PER_LINE = 32; public String dumpAsHex(ByteBuffer data,int amount) { StringBuffer buf = new StringBuffer(amount << 1) ; int k = 0 ; int flen = amount; char hexBuffer[] = new char[HEX_CHARS_PER_LINE*2 + (HEX_CHARS_PER_LINE - 1)+ 2]; char asciiBuffer[] = new char[HEX_CHARS_PER_LINE + 1]; hexBuffer[hexBuffer.length-1]=0; asciiBuffer[asciiBuffer.length - 1]=0; for (int i = 0; i < flen ; i++) { int j = data.get() & 0xFF ; hexBuffer[k*3] = Character.forDigit((j >>> 4) , 16); hexBuffer[k*3+1] = Character.forDigit((j & 0x0F), 16); hexBuffer[k*3+2] = ' '; if (j<0x20) asciiBuffer[k] = '.'; else if (k < 0x78) asciiBuffer[k] = (char)j; else asciiBuffer[k] = '?'; k++ ; if (k % HEX_CHARS_PER_LINE == 0) { hexBuffer[hexBuffer.length-2] = 0; buf.append(hexBuffer); buf.append(" "); buf.append(asciiBuffer); buf.append('\n') ; k = 0 ; } } if (k != 0) { hexBuffer[k*3 + 1] = 0; asciiBuffer[k] = 0; buf.append(hexBuffer); buf.append(" "); buf.append(asciiBuffer); buf.append('\n') ; } return buf.toString() ; } int getRangeOffsetFromRangeIndex(int rangeIndex) { return rangeFileBuffer.getInt(rangeIndex*RANGE_ITEM_SIZE + RANGE_POS_OFFSET); } static final void populateFPForRange(ByteBuffer sourceBuffer, URLFPV2 placeHolder,int rangeIndex) { placeHolder.setDomainHash(sourceBuffer.getLong(rangeIndex*RANGE_ITEM_SIZE)); placeHolder.setUrlHash(sourceBuffer.getLong(rangeIndex*RANGE_ITEM_SIZE + RANGE_FP_OFFSET)); } final int findRangePosition(URLFPV2 searchTerm) { long searchDomainHash = searchTerm.getDomainHash(); long searchURLHash = searchTerm.getUrlHash(); int low = 0; int high = rangeItemCount - 1; while (low <= high) { int mid = low + ((high - low) / 2); long currentDomainHash = rangeFileBuffer.getLong(mid*RANGE_ITEM_SIZE); int result = (currentDomainHash<searchDomainHash ? -1 : (currentDomainHash==searchDomainHash ? 0 : 1)); if (result == 0) { long currentURLHash = rangeFileBuffer.getLong(mid*RANGE_ITEM_SIZE + RANGE_FP_OFFSET); result = (currentURLHash<searchURLHash ? -1 : (currentURLHash==searchURLHash ? 
0 : 1)); } //comparisonFP.setDomainHash(rangeFileBuffer.getLong(mid*RANGE_ITEM_SIZE)); //comparisonFP.setUrlHash(rangeFileBuffer.getLong(mid*RANGE_ITEM_SIZE + RANGE_FP_OFFSET)); // populateFPForRange(rangeFileBuffer, comparisonFP, mid); if (result > 0) high = mid - 1; else if (result < 0) low = mid + 1; else return mid; // found } if (high < rangeItemCount) return high; return -1; // not found } void dumpRangeItems() { RandomAccessFile rangeFileObj = null; PRRangeItem item = new PRRangeItem(); try { rangeFileObj = new RandomAccessFile(rangeFilePath,"r"); for (int i=0;i<rangeItemCount;++i) { item.clear(); item.readFields(rangeFileObj); LOG.info("Range Item:" + i + " Domain:" + item.getDomainStart() + " FPStart:" + item.getUrlFPStart() + " Offset:" + item.getStartPos()); } rangeFileBuffer.position(0); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } finally { if (rangeFileObj != null) { try { rangeFileObj.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } } } static int findPos(int[] array,int searchTerm) { int low = 0; int high = array.length -1; while (low <= high) { int mid = low + ((high - low) / 2); // Note: not (low + high) / 2 !! if (array[mid] > searchTerm) high = mid - 1; else if (array[mid] < searchTerm) low = mid + 1; else return mid; // found } if (high < array.length) return high; return -1; // not found } private static interface PRValueOutputStream { void writePRValue(URLFPV2 targetFP,URLFPV2 sourceFP,float prValue) throws IOException; void close(boolean deleteUnderlyingFile) throws IOException; } private static class PRSequenceFileOutputStream implements PRValueOutputStream { FileSystem _fileSystem; Path _path; SequenceFile.Writer _writer = null; DataOutputBuffer _outputWriter = new DataOutputBuffer(); FlexBuffer _buffer = new FlexBuffer(); public PRSequenceFileOutputStream(Configuration conf,FileSystem fs,Path path) throws IOException { _fileSystem = fs; _path = path; _writer = SequenceFile.createWriter( _fileSystem, conf, path, FlexBuffer.class, NullWritable.class, fs.getConf().getInt("io.file.buffer.size", 4096 * 12), (short)1, fs.getDefaultBlockSize(), CompressionType.BLOCK, new DefaultCodec(), null, new Metadata()); } @Override public void close(boolean deleteUnderlyingFile) throws IOException { _writer.close(); if (deleteUnderlyingFile) { _fileSystem.delete(_path,false); } } @Override public void writePRValue(URLFPV2 target, URLFPV2 source, float prValue) throws IOException { _outputWriter.reset(); _outputWriter.writeLong(target.getDomainHash()); _outputWriter.writeLong(target.getUrlHash()); _outputWriter.writeLong(source.getRootDomainHash()); _outputWriter.writeLong(source.getDomainHash()); _outputWriter.writeLong(source.getUrlHash()); _outputWriter.writeFloat(prValue); _buffer.set(_outputWriter.getData(), 0, _outputWriter.getLength()); _writer.append(_buffer,NullWritable.get()); } } private static class PROldValueOutputStream implements PRValueOutputStream { PROldValueOutputStream(FileSystem fs,Path path)throws IOException { _targetFS = fs; _path = path; _stream = fs.create(path); } public FileSystem _targetFS; public Path _path; // optional path if this is a remote file public FSDataOutputStream _stream; @Override public void close(boolean deleteUnderlyingFile) throws IOException { if (_stream != null){ _stream.flush(); _stream.close(); _stream = null; } if (deleteUnderlyingFile) { _targetFS.delete(_path,false); } } @Override public void writePRValue(URLFPV2 target, URLFPV2 source, float 
prValue)throws IOException { _stream.writeLong(target.getDomainHash()); _stream.writeLong(target.getUrlHash()); _stream.writeLong(source.getRootDomainHash()); _stream.writeLong(source.getDomainHash()); _stream.writeLong(source.getUrlHash()); _stream.writeFloat(prValue); } } public static void purgeNodeDistributionFilesForIteration(FileSystem remoteFS,String remoteOutputPath,int nodeIndex,int nodeCount,int iterationNumber)throws IOException { String fileNamePrefix = getOutlinksBaseName(nodeIndex,iterationNumber); for (int i=0;i<nodeCount;++i) { // create output filename String fileName = fileNamePrefix + "-" + NUMBER_FORMAT.format(i); Path remotePath = new Path(remoteOutputPath,fileName); LOG.info("Deleting:" + remotePath); remoteFS.delete(remotePath,true); } } /** * PRValueMultiplexer * multiplexes page rank value distribution * across a set of pre-defined nodes * * @author rana * */ public static class PRValueMultiplexer { EventLoop _eventLoop = null; Vector<InetSocketAddress> _slaveAddressList; LinkedList<PRValueBlockWriter> _activeWriters = new LinkedList<PRValueBlockWriter>(); PRValueBlockWriter _failedWriter = null; int _myNodeIndex; Configuration _conf; long _jobId; int _iterationNumber; boolean _failed = false; int _completionCount = 0; int _nodeCount =0; /** * construct a PRValueMultiplexer * * @param conf * @param jobId * @param iterationNumber * @param slaveAddressList * @param myNodeIndex * @throws IOException */ public PRValueMultiplexer(Configuration conf,long jobId,int iterationNumber,Vector<InetSocketAddress> slaveAddressList,int myNodeIndex) throws IOException { LOG.info("PRValueMultiplexer initialized. SlaveAddress List Size:" + slaveAddressList.size() + " JobID:" + jobId + " myNodeId:" + myNodeIndex ); _slaveAddressList = slaveAddressList; _myNodeIndex = myNodeIndex; _conf = conf; _jobId = jobId; _iterationNumber = iterationNumber; _nodeCount = slaveAddressList.size(); // start event loop ... _eventLoop = new EventLoop(); _eventLoop.start(); try { createWriters(); } catch (IOException e) { _failed = true; LOG.error("Got Exception opening BlockWriters"); closeAllWriters(); throw e; } } /** * close the multiplexer, and optionally wait and flush all streams * @param forced - if false, block for all streams to complete * @return true if failure condition */ public boolean close(boolean forced) { // if not a forced close ... and we are not in a failure condition ... if (!forced && !_failed && _completionCount != _slaveAddressList.size()) { LOG.info("Setting up Poll Loop to monitor for clean shutdown"); // create a semaphore to block on final Semaphore blockingSemaphore = new Semaphore(0); // set up a poll loop to monitor writers ... _eventLoop.setTimer(new Timer(10,true,new Timer.Callback() { @Override public void timerFired(Timer timer) { if (_failed || _completionCount == _slaveAddressList.size()) { if (_failed) { LOG.error("Poll loop detected Failure. Shutting Down"); } else { LOG.info("Poll loop detected completion. Shutting Down"); } // release semaphore ... blockingSemaphore.release(); // cancel timer ... _eventLoop.cancelTimer(timer); } } })); // ok now wait for completion ... blockingSemaphore.acquireUninterruptibly(); } // a forced close is explicit, meaning just teardown everything ... if (forced) { _failed = true; } // ok close all writes closeAllWriters(); // ok finally shutdown the event loop _eventLoop.stop(); return _failed; } /** * write a page rank value to the appropriate stream ... * this method could block ... 
* @param target * @param source * @param prValue * @throws IOException */ public void writePRValue(int targetNode,URLFPV2 targetFP,URLFPV2 sourceFP,float prValue)throws IOException { if (_failed) { throw new IOException("Multiplexer in Failed State!"); } // figure out which stream this entry belongs to ... //int nodeIndex = (target.hashCode() & Integer.MAX_VALUE) % _nodeCount; // write directly to the proper block writer PRValueBlockWriter writer = null; synchronized (_activeWriters) { if (_activeWriters.size() != 0) writer = _activeWriters.get(targetNode); } if (writer != null) { writer.writePRValue(targetFP, sourceFP, prValue); } else { LOG.error("No Writer Found for nodexIndex:" + targetNode); } } /** * * create the block writers * @throws IOException */ void createWriters()throws IOException { int targetSlaveIndex = 0; for (InetSocketAddress targetSlaveAddress : _slaveAddressList) { LOG.info("Creating Writer for:" + targetSlaveAddress); PRValueBlockWriter prValueWriter = new PRValueBlockWriter( this, _conf, _jobId, targetSlaveAddress, targetSlaveIndex++, _myNodeIndex, _iterationNumber ); synchronized (_activeWriters) { _activeWriters.add(prValueWriter); } } } void writerFailed(final PRValueBlockWriter writer,final IOException reason) { LOG.info("Writer Failed Callback for writer:" + writer._targetSlaveAddress); _failed = true; // fail this in the context of the async thread _eventLoop.setTimer(new Timer(0,false,new Timer.Callback() { @Override public void timerFired(Timer timer) { LOG.error("Writer for Slave:"+ writer._targetSlaveAddress + " failed with exception:" + CCStringUtils.stringifyException(reason)); _failedWriter = writer; closeAllWriters(); } })); } void writerDone(final PRValueBlockWriter writer) { LOG.info("Writer:" + writer._targetSlaveAddress + " done"); synchronized (this) { _completionCount++; } writer.close(); } void closeAllWriters() { LOG.info("Multiplexer: Closing all Writers"); ImmutableList<PRValueBlockWriter> writers = null; synchronized (_activeWriters) { writers = new ImmutableList.Builder().addAll(_activeWriters).build(); } for (PRValueBlockWriter writer : writers) { writer.close(); } // clear list synchronized (_activeWriters) { _activeWriters.clear(); } } } /** * Individual Node PageRank Value Stream Writer * * @author rana * */ static class PRValueBlockWriter implements AsyncClientChannel.ConnectionCallback { PRValueMultiplexer _multiplexer; ByteBuffer _outputBuffer = null; byte[] _outputArray = null; LinkedBlockingQueue<ByteBuffer> _packetQueue = new LinkedBlockingQueue<ByteBuffer>(MAX_PACKETS_ENQUEUED); CRC32 _crc32 = new CRC32(); int _itemCount=0; LzoCodec _codec = new LzoCodec(); InetSocketAddress _targetSlaveAddress; int _targetSlaveIndex; int _sourceSlaveIndex; int _iterationNumber; String _targetFileName; FileInfo _fileInfo = new FileInfo(); long _lastBlockId = 0; // set when no more data is expected .. boolean _done = false; // slave communication related code ... 
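// Lifecycle of the slave conversation driven by the fields below (as implemented in
// sendOpenFileCommand() / queuePollEvent() / done() further down):
//
//   1. the constructor opens _channel and blocks on _blockingCallSemaphore until
//      OutgoingChannelConnected() / OutgoingChannelDisconnected() releases it
//   2. sendOpenFileCommand() issues a createJobFile RPC; the returned fileId is stored
//      in _fileInfo and the 10ms poll timer is armed via queuePollEvent()
//   3. each poll drains at most one compressed packet from _packetQueue and sends it
//      with a transferBlock RPC (block ids issued from _lastBlockId)
//   4. once done() has been called and the queue is empty, commitFile is sent and the
//      multiplexer is notified via writerDone()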
AsyncClientChannel _channel; PageRankSlaveServer.AsyncStub _asyncStub; Semaphore _blockingCallSemaphore = null; IOException _lastIOException = null; String _logLinePrefix; private void log(boolean isError,String message) { if (isError) LOG.error(_logLinePrefix + message); else LOG.info(_logLinePrefix + message); } public PRValueBlockWriter(PRValueMultiplexer multiplexer, Configuration conf, long jobId, InetSocketAddress targetSlaveAddress, int targetSlaveIndex, int sourceSlaveIndex, int iterationNumber)throws IOException { _multiplexer = multiplexer; _outputBuffer = allocateNewBuffer(); _codec.setConf(conf); _targetSlaveAddress = targetSlaveAddress; _targetSlaveIndex = targetSlaveIndex; _sourceSlaveIndex = sourceSlaveIndex; _iterationNumber = iterationNumber; _logLinePrefix = "[TGT:" + targetSlaveIndex + " Addr:" + _targetSlaveAddress + "]"; _blockingCallSemaphore = new Semaphore(0); log(false,"Connecting to slave at index:" + _targetSlaveIndex + " endPoint:"+ _targetSlaveAddress); _channel = new AsyncClientChannel(_multiplexer._eventLoop,null,_targetSlaveAddress,this); _channel.open(); _asyncStub = new PageRankSlaveServer.AsyncStub(_channel); log(false,"Waiting on Connect... "); _blockingCallSemaphore.acquireUninterruptibly(); log(false,"Connect Semaphore Released... "); if (!_channel.isOpen()) { log(true,"Connection Failed!"); throw new IOException("Connection Failed!"); } _targetFileName = getOutlinksBaseName(_sourceSlaveIndex,_iterationNumber) + "-" + NUMBER_FORMAT.format(_targetSlaveIndex); _fileInfo.setFileName(_targetFileName); _fileInfo.setJobId(jobId); log(false,"Sending Open File Command For Target:" + _targetSlaveAddress); sendOpenFileCommand(); } /** * Enqueue a page-rank value into this stream * * @param target * @param source * @param prValue * @throws IOException */ public void writePRValue(URLFPV2 target,URLFPV2 source,float prValue)throws IOException { _outputBuffer.putLong(target.getDomainHash()); _outputBuffer.putLong(target.getUrlHash()); _outputBuffer.putLong(source.getRootDomainHash()); _outputBuffer.putLong(source.getDomainHash()); _outputBuffer.putLong(source.getUrlHash()); _outputBuffer.putFloat(prValue); if (++_itemCount == RECORDS_PER_BLOCK) { // flush flush(); } } /** * mark this stream as compelte */ public void done() { // mark the stream as complete ... _done = true; // the poll thread will } void queuePollEvent() { // start the poll timer ... _multiplexer._eventLoop.setTimer(new Timer(10, false, new Timer.Callback() { @Override public void timerFired(Timer timer) { if (_lastIOException == null && _channel.isOpen()) { ByteBuffer nextPacket = _packetQueue.poll(); if (nextPacket != null) { // ok we got a packet ... send it log(false, "got packet via poll"); BlockTransfer tranfserRequest = new BlockTransfer(); tranfserRequest.setBlockData(new FlexBuffer(nextPacket.array(),0,nextPacket.limit())); tranfserRequest.setBlockId(_lastBlockId++); tranfserRequest.setFileId(_fileInfo.getFileId()); try { log(false,"Calling transferBlock RPC"); _asyncStub.transferBlock(tranfserRequest,new Callback<BlockTransfer, BlockTransferAck>() { @Override public void requestComplete( AsyncRequest<BlockTransfer, BlockTransferAck> request) { log(false,"transferBlock RPC Returned with Status:" + request.getStatus()); if (request.getStatus() == Status.Success) { // queue next poll event ... 
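// re-arming the poll timer only after the previous block has been acknowledged keeps at
// most one transferBlock RPC in flight per writer; the bounded _packetQueue
// (MAX_PACKETS_ENQUEUED) then back-pressures flush() on the producing thread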
queuePollEvent();
          } else {
            log(true, "transferBlock Failed!");
            failed(new IOException("Transfer Block Failed!"));
          }
        }
      });
    } catch (IOException e) {
      log(true, CCStringUtils.stringifyException(e));
      // mark this stream as done ...
      failed(e);
    }
  } else {
    // check to see if we are done
    if (_done) {
      try {
        log(false, "Sending commitFile RPC");
        _asyncStub.commitFile(_fileInfo, new Callback<FileInfo, NullMessage>() {
          @Override
          public void requestComplete(AsyncRequest<FileInfo, NullMessage> request) {
            log(false, "commitFile RPC returned with Status:" + request.getStatus());
            if (request.getStatus() == Status.Success) {
              _multiplexer.writerDone(PRValueBlockWriter.this);
            }
          }
        });
      } catch (IOException e) {
        log(true, CCStringUtils.stringifyException(e));
      }
    } else {
      queuePollEvent();
    }
  }
} else {
  failed(null);
}
}
}));
}

void sendOpenFileCommand() throws IOException {
  log(false, "sending createJobFile RPC");
  _asyncStub.createJobFile(_fileInfo, new Callback<FileInfo, FileInfo>() {
    @Override
    public void requestComplete(AsyncRequest<FileInfo, FileInfo> request) {
      log(false, "createJobFile RPC returned with Status:" + request.getStatus());
      if (request.getStatus() == Status.Success) {
        log(false, "Create File Successful!");
        _fileInfo.setFileId(request.getOutput().getFileId());
        // start polling
        log(false, "Polling for Data Packets");
        queuePollEvent();
      } else {
        // indicate a failure condition ...
        failed(new IOException("File Open Failed for Slave:" + _targetSlaveAddress));
      }
    }
  });
}

/**
 * indicate a failure condition
 * @param e
 */
private void failed(IOException e) {
  log(true, "failed called with Exception:" + CCStringUtils.stringifyException(e));
  if (e != null) {
    _lastIOException = e;
  }
  // inform the multiplexer of the error ...
  _multiplexer.writerFailed(this, _lastIOException);
}

public void close() {
  log(false, "close called channel is:" + _channel + " packetQueue size is:" + _packetQueue.size());
  if (_channel != null) {
    try {
      _channel.close();
    } catch (IOException e) {
      e.printStackTrace();
    }
    _channel = null;
  }
  // dump packets on the floor
  _packetQueue.clear();
}

private ByteBuffer allocateNewBuffer() {
  return ByteBuffer.allocate(BLOCK_HEADER_SIZE + RECORD_BYTE_SIZE * RECORDS_PER_BLOCK + PADDING);
}

private static final int MAX_PACKETS_ENQUEUED = 5;
private static final int RECORD_BYTE_SIZE = 48; // EACH RECORD IS 48 bytes long ...
private static final int RECORDS_PER_BLOCK = 1 << 12; // 4096 records per block...
private static final int SYNC_ESCAPE = -1; // "length" of sync entries
private static final int SYNC_ESCAPE_SIZE = 4; // "length" of sync entries
private static final byte SYNC_BYTES[] = { 'S','Y','N','C','B','Y','T','E' };
// sync bytes size ...
private static final int BLOCK_SYNC_BYTE_SIZE = SYNC_ESCAPE_SIZE + SYNC_BYTES.length; // escape + hash
// block CRC LENGTH
private static final int BLOCK_CRC_FIELD_SIZE = 8;
// block LENGTH
private static final int BLOCK_COMPRESSED_LENGTH_FIELD_SIZE = 4;
// block LENGTH
private static final int BLOCK_UNCOMPRESSED_LENGTH_FIELD_SIZE = 4;
// PADDING FOR COMPRESSOR
private static final int PADDING = 1 << 8;
// block header size ...
private static final int BLOCK_HEADER_SIZE = BLOCK_SYNC_BYTE_SIZE + BLOCK_CRC_FIELD_SIZE + BLOCK_COMPRESSED_LENGTH_FIELD_SIZE + BLOCK_UNCOMPRESSED_LENGTH_FIELD_SIZE;

void flush() throws IOException {
  log(false, "flush called");
  if (!_channel.isOpen() || _lastIOException != null) {
    log(true, "Invalid State. Connection Already Closed!");
    throw new IOException("Connection Already Closed!");
  }
  // queue packet for send ...
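// Layout of each packet queued below (see the BLOCK_* constants above):
//
//   | SYNC_ESCAPE | SYNC_BYTES | CRC32    | compressedLen | uncompressedLen | LZO-compressed body  |
//   | int (4)     | 8 bytes    | long (8) | int (4)       | int (4)         | compressedLen bytes  |
//
// the CRC is computed over the *uncompressed* record data; each record written by
// writePRValue() is 5 longs + 1 float (target domain/url hash, source root/domain/url
// hash, pr value), so RECORD_BYTE_SIZE leaves a few bytes of slack per record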
if (_outputBuffer.position() != 0) { // create compressed buffer object .. ByteBuffer compressedBuffer = allocateNewBuffer(); // skip header ... compressedBuffer.position(BLOCK_HEADER_SIZE); // create output stream based on bytebuffer OutputStream compressedDataOutputStream = newOutputStream(compressedBuffer); // ok ... now compress the block CompressionOutputStream codecStream = _codec.createOutputStream(compressedDataOutputStream); // compress data .. codecStream.write(_outputBuffer.array(), 0, _outputBuffer.position()); // flush it ... codecStream.close(); // compute crc ... _crc32.reset(); _crc32.update(_outputBuffer.array(), 0, _outputBuffer.position()); // remember compressed buffer size .. int compressedBufferSize = compressedBuffer.position() - BLOCK_HEADER_SIZE; // ok write out header ... compressedBuffer.position(0); // write sync bytes into header ... compressedBuffer.putInt(SYNC_ESCAPE); // and write sync bytes compressedBuffer.put(SYNC_BYTES,0,SYNC_BYTES.length); // write crc ... compressedBuffer.putLong(_crc32.getValue()); // write compressed length and uncompressed length... compressedBuffer.putInt(compressedBufferSize); compressedBuffer.putInt(_outputBuffer.position()); // and put it in queue ... compressedBuffer.position(compressedBufferSize + BLOCK_HEADER_SIZE); // flip it .. compressedBuffer.flip(); log(false,"queueing packet. Item Count:" + _itemCount + " UncompressedSize:" + _outputBuffer.position() + " CompressedSize:" + compressedBuffer.limit()); // add it to queue try { _packetQueue.put(compressedBuffer); } catch (InterruptedException e) { } // get new output buffer ... _outputBuffer.position(0); // reset item count _itemCount = 0; } } private static OutputStream newOutputStream(final ByteBuffer buf) { return new OutputStream() { @Override public void write(int b) throws IOException { buf.put((byte) (b & 0xff)); } public void write(byte src[], int off, int len) throws IOException { buf.put(src, off, len); } }; } @Override public void OutgoingChannelConnected(AsyncClientChannel channel) { LOG.info("OutgoingChannelConnected... "); if (_blockingCallSemaphore != null) { _blockingCallSemaphore.release(); } } @Override public boolean OutgoingChannelDisconnected(AsyncClientChannel channel) { LOG.info("OutgoingChannelDisconnected... "); try { // explicitly close the channel! _channel.close(); } catch (IOException e) { } _lastIOException = new IOException("Disconnected from slave"); if (_blockingCallSemaphore != null) { _blockingCallSemaphore.release(); } else { failed(_lastIOException); } return false; } } /** * Helper Class that encapsulates Block Receiving Logic for Slave Servers * * @author rana * */ static class PRValueBlockFileReceiver { // the active job id private long _jobId; // the fully qualified job storage path ... private File _jobFileLocalPath; // immediate shutdown flag ... private boolean _immediateShutdown = false; /** * */ public PRValueBlockFileReceiver(long jobId,File jobFileLocalPath) { _jobId = jobId; _jobFileLocalPath = jobFileLocalPath; startBlockWriter(); } /** * shutdown the block receiver * either in an orderly manner or immediately * @param orderly in an orderly manner (complete queued requests) or immediately */ public void shutdown(boolean orderly) throws IOException { if (_blockWriter != null) { // create a shutdown request ... BlockRequest request = BlockRequest.shutdownRequest(); // put in queue _blockRequestQueue.add(request); // if immediate, indicate so _immediateShutdown = !orderly; // ok wait for thread to exit ... 
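// the SHUTDOWN request was appended behind any pending requests, so an orderly shutdown
// lets the writer thread drain queued create/write/commit work first; when
// _immediateShutdown is set the thread purges open files and exits as soon as it
// dequeues the next request, discarding anything still queued behind it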
LOG.info("Waiting for BlockWriter Thread Shutdown"); try { _blockWriter.join(); } catch (InterruptedException e) { } LOG.info("BlockWriter Thread Exited"); // ok reset state ... _immediateShutdown = false; _blockWriter = null; } } File getActiveJobLocalPath() { return _jobFileLocalPath; } long getJobId() { return _jobId; } private static class BlockRequest<DataType extends RPCStruct,ResultType> { enum BlockRequestType { FILE_CREATE, BLOCK_WRITE, FILE_COMMIT, PURGE, SHUTDOWN } AsyncContext _context; CallbackWithResult<BlockRequest<DataType,ResultType>> _callback; DataType _data; BlockRequestType _type; ResultType _result; public static BlockRequest<FileInfo,Long> createFileRequest(AsyncContext context,FileInfo fileInfo,CallbackWithResult<BlockRequest<FileInfo,Long>> callback)throws IOException { return new BlockRequest<FileInfo,Long>(context,BlockRequestType.FILE_CREATE,fileInfo,callback,0L); } public static BlockRequest<FileInfo,Boolean> commitFileRequest(AsyncContext context,FileInfo fileInfo,CallbackWithResult<BlockRequest<FileInfo,Boolean>> callback)throws IOException { return new BlockRequest<FileInfo,Boolean>(context,BlockRequestType.FILE_COMMIT,fileInfo,callback,false); } public static BlockRequest<BlockTransfer,Boolean> blockTransferRequest(AsyncContext context,BlockTransfer blockInfo,CallbackWithResult<BlockRequest<BlockTransfer,Boolean>> callback)throws IOException { return new BlockRequest<BlockTransfer,Boolean>(context,BlockRequestType.BLOCK_WRITE,blockInfo,callback,false); } public static BlockRequest<NullMessage,Boolean> purgeRequest(AsyncContext context,NullMessage nullMessage,CallbackWithResult<BlockRequest<NullMessage,Boolean>> callback)throws IOException { return new BlockRequest<NullMessage,Boolean>(context,BlockRequestType.PURGE,null,callback,false); } public static BlockRequest<NullMessage,Boolean> shutdownRequest()throws IOException { return new BlockRequest<NullMessage,Boolean>(null,BlockRequestType.SHUTDOWN,null,null,false); } public BlockRequest(AsyncContext context,BlockRequestType type, DataType data,CallbackWithResult<BlockRequest<DataType,ResultType>> callback,ResultType defaultResultValue) throws IOException { _context = context; _type = type; _data = data; _callback = callback; _result = defaultResultValue; } } Thread _blockWriter = null; LinkedBlockingQueue<BlockRequest> _blockRequestQueue = new LinkedBlockingQueue<BlockRequest>(); long _lastFileId = 0; static class ActiveFile { ActiveFile(File file,RandomAccessFile stream,long fileId) { _file = file; _stream = stream; _fileId = fileId; } File _file; RandomAccessFile _stream; long _fileId; } TreeMap<Long,ActiveFile> _activeFilesMap = new TreeMap<Long,ActiveFile>(); void startBlockWriter() { _blockWriter = new Thread( new Runnable() { @SuppressWarnings("unchecked") @Override public void run() { LOG.info("BlockWriter Thread Running... "); try { while (true) { try { BlockRequest request = _blockRequestQueue.take(); if (_immediateShutdown || request._type == BlockRequest.BlockRequestType.PURGE || request._type == BlockRequest.BlockRequestType.SHUTDOWN) { LOG.info("Got Shutdown Or Purge Request... Closing existing connections"); purgeOpenFiles(); if (_immediateShutdown || request._type == BlockRequest.BlockRequestType.SHUTDOWN) { LOG.info("Received Shutdown Request. 
Existing Thread"); break; } } else { if (request._type == BlockRequest.BlockRequestType.FILE_CREATE) { BlockRequest<FileInfo, Long> typedRequest = (BlockRequest<FileInfo, Long>)request; LOG.info("Got Block File Create Request for Path:" + typedRequest._data.getFileName()); // create the actual file ... File basePath = getActiveJobLocalPath(); File path = new File(basePath,typedRequest._data.getFileName()); // try to create a file from scratch ... try { RandomAccessFile stream = new RandomAccessFile(path, "rw"); ActiveFile activeFile = new ActiveFile(path,stream,++_lastFileId); _activeFilesMap.put(activeFile._fileId, activeFile); typedRequest._result = activeFile._fileId; LOG.info("Created Block File at Path:" + path + " FileId:" + activeFile._fileId); // ok return to caller } catch (IOException e) { typedRequest._result = 0L; LOG.error("Error Creating Block File:" + path + ":" + CCStringUtils.stringifyException(e)); } finally { // initiate callback typedRequest._callback.execute(typedRequest); } } else if (request._type == BlockRequest.BlockRequestType.FILE_COMMIT) { BlockRequest<FileInfo, Boolean> typedRequest = (BlockRequest<FileInfo, Boolean>)request; LOG.info("Got Commit Request for FileId::" + typedRequest._data.getFileId()); // expect failure typedRequest._result = false; // try to access the file try { ActiveFile activeFile = _activeFilesMap.get(typedRequest._data.getFileId()); if (activeFile != null) { LOG.info("Committing File: " + activeFile._file + " Id:" + activeFile._fileId); if (activeFile._stream != null) { try { activeFile._stream.close(); typedRequest._result = true; } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } _activeFilesMap.remove(activeFile._fileId); } else { LOG.error("No Active File Found for Id:" + typedRequest._data.getFileId()); } } finally { // initiate callback typedRequest._callback.execute(typedRequest); } } else if (request._type == BlockRequest.BlockRequestType.BLOCK_WRITE) { BlockRequest<BlockTransfer, Boolean> typedRequest = (BlockRequest<BlockTransfer, Boolean>)request; LOG.info("Got Block Transfer Request for FileId:" + typedRequest._data.getFileId() + " ByteCount:" + typedRequest._data.getBlockData().getCount()); // expect failure typedRequest._result = false; // try to access the file try { ActiveFile activeFile = _activeFilesMap.get(typedRequest._data.getFileId()); if (activeFile != null) { LOG.info("Writing: " + typedRequest._data.getBlockData().getCount() + " Bytes to File: " + activeFile._file + " Id:" + activeFile._fileId); if (activeFile._stream != null) { try { activeFile._stream.write(typedRequest._data.getBlockData().getReadOnlyBytes(),0,typedRequest._data.getBlockData().getCount()); typedRequest._result = true; } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } _activeFilesMap.remove(activeFile._fileId); } else { LOG.error("No Active File Found for Id:" + typedRequest._data.getFileId()); } } finally { // initiate callback typedRequest._callback.execute(typedRequest); } } } } catch (InterruptedException e) { } } } finally { LOG.info("Block Writer Thread Exiting"); } } }); _blockWriter.start(); } void purgeOpenFiles() { for (ActiveFile file : _activeFilesMap.values()) { if (file._stream != null) { try { file._stream.close(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } file._file.delete(); } } _activeFilesMap.clear(); } public void createJobFile(final AsyncContext<FileInfo, FileInfo> rpcContext) throws RPCException { try { if (getJobId() != 
rpcContext.getInput().getJobId()) { throw new IOException ("Invalid Job Config or Invalid Job Id!"); } LOG.info("Got createJobFile RPC. Path:" + rpcContext.getInput().getFileName()); // default to failure status ... rpcContext.setStatus(Status.Error_RequestFailed); try { BlockRequest request = BlockRequest.createFileRequest( rpcContext, rpcContext.getInput(), new CallbackWithResult<BlockRequest<FileInfo,Long>>() { @Override public void execute(BlockRequest<FileInfo,Long> requestObject) { try { LOG.info("Received callback for createFile:" + requestObject._data.getFileName() + " Result:" + requestObject._result); // ok request was successfull ... if (requestObject._result != 0L) { // write was successfull ... rpcContext.getOutput().setFileId(rpcContext.getInput().getFileId()); rpcContext.getOutput().setFileId(requestObject._result); rpcContext.setStatus(Status.Success); } } finally { try { rpcContext.completeRequest(); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } } } }); _blockRequestQueue.put(request); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.completeRequest(); } catch (InterruptedException e) { } } catch (IOException e) { rpcContext.setErrorDesc(CCStringUtils.stringifyException(e)); LOG.error(rpcContext.getErrorDesc()); rpcContext.setStatus(Status.Error_RequestFailed); } finally { rpcContext.completeRequest(); } } public void transferBlock( final AsyncContext<BlockTransfer, BlockTransferAck> rpcContext) throws RPCException { LOG.info("Got trasferBlock RPC. FileId:" + rpcContext.getInput().getFileId() + " BufferSize:" + rpcContext.getInput().getBlockData().getCount()); try { BlockRequest request = BlockRequest.blockTransferRequest( rpcContext, rpcContext.getInput(), new CallbackWithResult<BlockRequest<BlockTransfer,Boolean>>() { @Override public void execute(BlockRequest<BlockTransfer, Boolean> requestObject) { try { // ok request was successfull ... if (requestObject._result == true) { // write was successfull ... rpcContext.getOutput().setFileId(rpcContext.getInput().getFileId()); rpcContext.getOutput().setBlockId(rpcContext.getInput().getBlockId()); rpcContext.setStatus(Status.Success); } else { rpcContext.setStatus(Status.Error_RequestFailed); } } finally { try { rpcContext.completeRequest(); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } } } }); _blockRequestQueue.put(request); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.completeRequest(); } catch (InterruptedException e) { } } public void commitFile(final AsyncContext<FileInfo, NullMessage> rpcContext) throws RPCException { LOG.info("Got commitFile RPC. FileId:" + rpcContext.getInput().getFileId() ); try { BlockRequest request = BlockRequest.commitFileRequest( rpcContext, rpcContext.getInput(), new CallbackWithResult<BlockRequest<FileInfo,Boolean>>() { @Override public void execute(BlockRequest<FileInfo, Boolean> requestObject) { try { // ok request was successfull ... if (requestObject._result == true) { // write was successfull ... 
rpcContext.setStatus(Status.Success); } else { rpcContext.setStatus(Status.Error_RequestFailed); } } finally { try { rpcContext.completeRequest(); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } } } }); _blockRequestQueue.put(request); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.completeRequest(); } catch (InterruptedException e) { } } } public static class PRValueBlockWriterAndReceiverTester extends Server implements PageRankSlave ,AsyncServerChannel.ConnectionCallback { AsyncServerChannel _channel; EventLoop _eventLoop; PRValueBlockFileReceiver _receiver; File _jobLocalPath; PRValueBlockWriterAndReceiverTester(EventLoop eventLoop, int instanceId,int portToUse) throws IOException { _eventLoop = eventLoop; _jobLocalPath = new File("/tmp/prvalue_receiver_test/" + instanceId); InetSocketAddress localAddress = new InetSocketAddress("localhost",0); InetSocketAddress address = new InetSocketAddress("localhost",portToUse); _channel = new AsyncServerChannel(this, _eventLoop, address,this); registerService(_channel,PageRankSlave.spec); FileUtils.recursivelyDeleteFile(_jobLocalPath); _jobLocalPath.mkdirs(); start(); // start the block receiver.... _receiver = new PRValueBlockFileReceiver(1,_jobLocalPath); } void shutdown() { LOG.info("Doing orderly shutdown on receiver"); try { _receiver.shutdown(true); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } LOG.info("Closing Channel"); stop(); } public static void runTest() { EventLoop eventLoop = new EventLoop(); eventLoop.start(); try { // instantiate tester ... LOG.info("Starting Servers"); PRValueBlockWriterAndReceiverTester tester1 = new PRValueBlockWriterAndReceiverTester(eventLoop,0,9000); PRValueBlockWriterAndReceiverTester tester2 = new PRValueBlockWriterAndReceiverTester(eventLoop,1,9001); PRValueBlockWriterAndReceiverTester tester3 = new PRValueBlockWriterAndReceiverTester(eventLoop,2,9002); Vector<InetSocketAddress> addressList = new Vector<InetSocketAddress>(); addressList.add(new InetSocketAddress("127.0.0.1",9000)); addressList.add(new InetSocketAddress("127.0.0.1",9001)); addressList.add(new InetSocketAddress("127.0.0.1",9002)); Configuration conf = new Configuration(); conf.addResource("core-site.xml"); conf.addResource("hdfs-site.xml"); conf.addResource("mapred-site.xml"); LOG.info("Creating Multiplexer"); // instantiate block writer ... 
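// test topology: three receiver servers listening on localhost:9000-9002 (started above)
// and a single multiplexer acting as node 0 of the same 3-node job (jobId 1, iteration 0);
// the write loop below shards values across the receivers via i % 3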
PRValueMultiplexer multiplexer = new PRValueMultiplexer(conf, 1, 0, addressList, 0); URLFPV2 source = URLUtils.getURLFPV2FromURL("http://source.com/"); URLFPV2 dest = URLUtils.getURLFPV2FromURL("http://dest.com/"); LOG.info("Writing Values"); for (int i=0;i<10000;++i) { multiplexer.writePRValue(i % 3, source, dest, 1.0f); } LOG.info("Waiting on Close"); multiplexer.close(false); // shutdown writers LOG.info("Shutting Down Receiver 1"); tester1.shutdown(); LOG.info("Shutting Down Receiver 2"); tester2.shutdown(); LOG.info("Shutting Down Receiver 3"); tester3.shutdown(); eventLoop.stop(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } @Override public void beginPageRank( AsyncContext<BeginPageRankInfo, SlaveStatus> rpcContext) throws RPCException { // TODO Auto-generated method stub } @Override public void checkpoint(AsyncContext<CheckpointInfo, SlaveStatus> rpcContext) throws RPCException { // TODO Auto-generated method stub } @Override public void commitFile(AsyncContext<FileInfo, NullMessage> rpcContext) throws RPCException { LOG.info("TestServer: Recevied commitFile Cmd"); _receiver.commitFile(rpcContext); } @Override public void createJobFile(AsyncContext<FileInfo, FileInfo> rpcContext) throws RPCException { LOG.info("TestServer: Recevied createJobFile Cmd"); _receiver.createJobFile(rpcContext); } @Override public void deleteFile(AsyncContext<FileInfo, NullMessage> rpcContext) throws RPCException { } @Override public void doIteration(AsyncContext<IterationInfo, SlaveStatus> rpcContext) throws RPCException { // TODO Auto-generated method stub } @Override public void endPageRank(AsyncContext<NullMessage, SlaveStatus> rpcContext) throws RPCException { // TODO Auto-generated method stub } @Override public void heartbeat(AsyncContext<NullMessage, SlaveStatus> rpcContext) throws RPCException { // TODO Auto-generated method stub } @Override public void initialize(AsyncContext<BaseConfig, SlaveStatus> rpcContext) throws RPCException { // TODO Auto-generated method stub } @Override public void transferBlock( AsyncContext<BlockTransfer, BlockTransferAck> rpcContext) throws RPCException { LOG.info("TestServer: Recevied transferBlock Cmd"); _receiver.transferBlock(rpcContext); } @Override public void IncomingClientConnected(AsyncClientChannel channel) { LOG.info("TestServer IncomingClient Connected"); } @Override public void IncomingClientDisconnected(AsyncClientChannel channel) { LOG.info("TestServer IncomingClient Disconnected"); } } private static FileSystem buildDistributionOutputStreamVector(boolean useSequenceFile,String fileNamePrefix,File localOutputPath,String remoteOutputPath, int myNodeIndex, int nodeCount,Vector<PRValueOutputStream> outputStreamVector) { Configuration conf = new Configuration(CrawlEnvironment.getHadoopConfig()); conf.setInt("dfs.socket.timeout",240000); conf.setInt("io.file.buffer.size", 4096 * 20); DistributedFileSystem hdfs = new DistributedFileSystem(); try { hdfs.initialize(FileSystem.getDefaultUri(conf), conf); for (int i=0;i<nodeCount;++i) { // create output filename String fileName = fileNamePrefix + "-" + NUMBER_FORMAT.format(i); // create stream (local or remote stream, depending on i) // remote path Path remotePath = new Path(remoteOutputPath,fileName); // remove file CrawlEnvironment.getDefaultFileSystem().delete(remotePath,false); if (useSequenceFile) { // recreate it ... outputStreamVector.add(new PRSequenceFileOutputStream(conf,CrawlEnvironment.getDefaultFileSystem(),remotePath)); } else { // recreate it ... 
outputStreamVector.add(new PROldValueOutputStream(CrawlEnvironment.getDefaultFileSystem(),remotePath)); } } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); for (PRValueOutputStream streamInfo : outputStreamVector) { try { if (streamInfo != null) { streamInfo.close(true); } } catch (IOException e2) { LOG.error(CCStringUtils.stringifyException(e2)); } outputStreamVector.clear(); } } return hdfs; } public static Vector<Path> buildCalculationInputStreamVector(File localOutputPath,String remoteOutputPath, int myNodeIndex, int nodeCount, int iterationNumber) { Vector<Path> vector = new Vector<Path>(); for (int i=0;i<nodeCount;++i) { // create output filename String fileName = getOutlinksBaseName(i,iterationNumber) + "-" + NUMBER_FORMAT.format(myNodeIndex); // create stream (local or remote stream, depending on i) // remote path Path remotePath = new Path(remoteOutputPath,fileName); LOG.info("Adding Path:" + remotePath + " For Index:" + i); // vector.add(remotePath); } return vector; } public static class SourceAndRank implements Comparable<SourceAndRank> { SourceAndRank(URLFPV2 fingerprint,float prValue) { source.setDomainHash(fingerprint.getDomainHash()); source.setRootDomainHash(fingerprint.getRootDomainHash()); source.setUrlHash(fingerprint.getUrlHash()); rank = prValue; } URLFPV2 source = new URLFPV2(); float rank; @Override public int compareTo(SourceAndRank o) { return source.compareTo(o.source); } } public static class DomainHashAndPRValue implements Comparable<DomainHashAndPRValue> { public DomainHashAndPRValue(long domainHash,float prValue) { _domainHash = domainHash; _accumulator = prValue; _inputs = 1; } public void updatePRValue(float newPRValue) { _accumulator += newPRValue; _inputs++; } public float averageValue() { return _accumulator / (float)_inputs; } public long _domainHash; public float _accumulator; public int _inputs; @Override public int compareTo(DomainHashAndPRValue o) { return ((Long)_domainHash).compareTo(o._domainHash); } } public static class RootDomain { public RootDomain() { } public HashMap<Long,DomainHashAndPRValue> subDomains = new HashMap<Long,DomainHashAndPRValue>(); } public static class TargetAndSources { URLFPV2 target = new URLFPV2(); HashMap<Long,RootDomain> sources = new HashMap<Long,RootDomain>(); } public static class TargetSourceAndRank { public boolean readFromStream(DataInputStream inputStream) throws IOException { if (inputStream.available() != 0) { target.setDomainHash(inputStream.readLong()); target.setUrlHash(inputStream.readLong()); source.setRootDomainHash(inputStream.readLong()); source.setDomainHash(inputStream.readLong()); source.setUrlHash(inputStream.readLong()); prValue = inputStream.readFloat(); isValid = true; } else { isValid = false; } return isValid; } @Override public String toString() { return "Target DomainHash:" + target.getDomainHash() + " FP:" + target.getUrlHash() + " Source DomainHash:" + source.getDomainHash() + " FP:" + source.getUrlHash(); } boolean isValid = false; URLFPV2 target = new URLFPV2(); URLFPV2 source = new URLFPV2(); float prValue; } static interface PRInputSource { public TargetSourceAndRank next() throws IOException; public TargetSourceAndRank last(); public void close() throws IOException; public long getSize() throws IOException; } static class PRSequenceFileInputSource implements PRInputSource { SequenceFile.Reader _reader; public Path _path; public TargetSourceAndRank _currentValue = new TargetSourceAndRank(); DataInputBuffer _inputStream = new DataInputBuffer(); FlexBuffer 
_buffer = new FlexBuffer(); long _totalLength = 0; public PRSequenceFileInputSource(Configuration conf,FileSystem fs,Path path,SortedPRInputReader reader)throws IOException { _path = path; _reader = new SequenceFile.Reader(fs, path, conf); FileStatus fileStatus = fs.getFileStatus(_path); _totalLength = 0L; if (fileStatus != null) { _totalLength = fileStatus.getLen(); } } @Override public void close() throws IOException { if (_reader != null) { _reader.close(); _reader = null; } } @Override public TargetSourceAndRank last(){ return _currentValue; } @Override public TargetSourceAndRank next() throws IOException { _currentValue = null; if (_reader.next(_buffer, NullWritable.get())) { _inputStream.reset(_buffer.get(), _buffer.getCount()); _currentValue = new TargetSourceAndRank(); _currentValue.readFromStream(_inputStream); } return _currentValue; } @Override public long getSize() throws IOException { return _totalLength; } } static class PROldInputSource implements PRInputSource { SortedPRInputReader _reader = null; long _bytesTotal; public PROldInputSource(Path path,SortedPRInputReader reader) throws IOException { _path = path; _istream = CrawlEnvironment.getDefaultFileSystem().open(_path); _bytesTotal = CrawlEnvironment.getDefaultFileSystem().getFileStatus(_path).getLen(); // wrap the stream so that we can monitor progress ... _istream = new FilterInputStream(_istream) { @Override public int read() throws IOException { _reader._totalBytesRead += 1; return this.in.read(); } @Override public int read(byte[] b, int off, int len) throws IOException { int bytesRead = this.in.read(b, off, len); _reader._totalBytesRead += bytesRead; return bytesRead; } @Override public long skip(long n) throws IOException { long bytesSkipped = this.in.skip(n); _reader._totalBytesRead += bytesSkipped; return bytesSkipped; } }; _stream = new DataInputStream(_istream); _reader = reader; } @Override public TargetSourceAndRank next() throws IOException { _currentValue = null; if (_stream != null && _stream.available() != 0) { _currentValue = new TargetSourceAndRank(); // reset bytes read counter _currentValue.readFromStream(_stream); } return _currentValue; } @Override public TargetSourceAndRank last() { return _currentValue; } @Override public void close() throws IOException { if (_istream != null) { _istream.close(); _istream = null; _stream = null; } } public Path _path; public InputStream _istream; public DataInputStream _stream; public TargetSourceAndRank _currentValue = new TargetSourceAndRank(); @Override public long getSize() throws IOException { return _bytesTotal; } } public static class SortedPRInputReader { PRInputSource _inputs[] = null; int _validStreams = 0; long _totalBytesToRead = 0; long _totalBytesRead = 0; public SortedPRInputReader(Configuration conf,FileSystem fs,Vector<Path> streams,boolean useSequenceFile) throws IOException { try { LOG.info("PRInputReader: Allocating Stream Array of Size:" + streams.size()); //ok allocate an array up to stream vector size ... 
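// this reader performs a k-way merge over the per-node input files: every stream is
// primed with its first record below (empty streams are closed and dropped), the streams
// are kept sorted by the target fingerprint of their current record, and readNextTarget()
// repeatedly drains all sources for the smallest target before re-sorting what remains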
_inputs = new PRInputSource[streams.size()]; // now, open streams for (Path streamInfo : streams) { if (!useSequenceFile) { _inputs[_validStreams] = new PROldInputSource(streamInfo,this); } else { _inputs[_validStreams] = new PRSequenceFileInputSource(conf,fs,streamInfo,this); } // advance to first item if (_inputs[_validStreams].next() == null) { LOG.error("PRInputReader: Stream At Index:" + _validStreams + " contains zero entries!"); _inputs[_validStreams].close(); } else { LOG.info("PRInputReader: Stream :" + _validStreams + " First Item:" + _inputs[_validStreams].last().toString() ); _totalBytesToRead += _inputs[_validStreams].getSize(); _validStreams++; } } // lastly sort streams sortStreams(); LOG.info("Sorted First Item:" + _inputs[0].last().toString()); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); close(); throw e; } } void close() { for (int i=0;i<_validStreams;++i) { try { _inputs[i].close(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } _inputs[i] = null; } _validStreams = 0; } static final int MAX_ROOT_DOMAIN_SOURCES_PER_TARGET = 100000; static final int MAX_SUBDOMAIN_SOURCES_PER_ROOTDOMAIN = 500; static DomainHashAndPRValue addSourceToTarget(TargetAndSources tgtAndSources,TargetSourceAndRank source) { RootDomain rootDomain = tgtAndSources.sources.get(source.source.getRootDomainHash()); if (rootDomain == null) { if (tgtAndSources.sources.size() < MAX_ROOT_DOMAIN_SOURCES_PER_TARGET) { rootDomain = new RootDomain(); tgtAndSources.sources.put(source.source.getRootDomainHash(), rootDomain); } } DomainHashAndPRValue hashAndPRValue = (rootDomain != null) ? rootDomain.subDomains.get(source.source.getDomainHash()) : null; if (hashAndPRValue == null) { hashAndPRValue = new DomainHashAndPRValue(source.source.getDomainHash(), source.prValue); if (rootDomain != null && rootDomain.subDomains.size() < MAX_SUBDOMAIN_SOURCES_PER_ROOTDOMAIN) { rootDomain.subDomains.put(source.source.getDomainHash(),hashAndPRValue); } } else { hashAndPRValue.updatePRValue(source.prValue); } return hashAndPRValue; } void sortStreams() { Arrays.sort(_inputs,0,_validStreams,new Comparator<PRInputSource>() { @Override public int compare(PRInputSource o1, PRInputSource o2) { return o1.last().target.compareTo(o2.last().target); } }); } // collect next valid target and all related sources TargetAndSources readNextTarget() throws IOException { if (_validStreams != 0) { TargetAndSources target = new TargetAndSources(); target.target.setDomainHash(_inputs[0].last().target.getDomainHash()); target.target.setUrlHash(_inputs[0].last().target.getUrlHash()); //LOG.info("readNextTarget - target is:" + target.target.getDomainHash() + ":" + target.target.getUrlHash()); //LOG.info("readNextTarget - source is:" + _inputs[0].last().source.getDomainHash() + ":" + _inputs[0].last().source.getUrlHash()); DomainHashAndPRValue lastValue = addSourceToTarget(target,_inputs[0].last()); // advance input zero _inputs[0].next(); // ok enter a loop and collect all sources for current target ... for (int streamIdx=0;streamIdx<_validStreams;) { if (_inputs[streamIdx].last() == null || _inputs[streamIdx].last().target.compareTo(target.target) != 0) { streamIdx++; } else { if (lastValue != null && lastValue._domainHash == _inputs[streamIdx].last().source.getDomainHash()) { lastValue.updatePRValue(_inputs[streamIdx].last().prValue); } else { lastValue = addSourceToTarget(target,_inputs[streamIdx].last()); } // advance current stream ... 
_inputs[streamIdx].next(); } } // ok now collect remaining valid streams int newValidStreamCount=0; for (int currStreamIdx=0;currStreamIdx<_validStreams;++currStreamIdx) { if (_inputs[currStreamIdx].last() != null) { _inputs[newValidStreamCount++] = _inputs[currStreamIdx]; } else { // close the stream ... _inputs[currStreamIdx].close(); // null it out ... _inputs[currStreamIdx] = null; } } // resset valid stream count _validStreams = newValidStreamCount; // ok now sort streams ... if (_validStreams != 0) { sortStreams(); } return target; } else { return null; } } } public static class CalculateRankQueueItem { public CalculateRankQueueItem(TargetAndSources next) { _e = null; _next = next; } public CalculateRankQueueItem(IOException e) { _e = e; _next = null; } public CalculateRankQueueItem() { _e = null; _next = null; } public TargetAndSources _next; public IOException _e; } public static void calculateRank(final Configuration conf,final FileSystem fs,final PRValueMap valueMap, final File jobLocalDir,final String jobWorkPath,final int nodeIndex, final int slaveCount, final int iterationNumber,final SuperDomainFilter superDomainFilter,final ProgressAndCancelCheckCallback progressAndCancelCallback) throws IOException { final LinkedBlockingQueue<CalculateRankQueueItem> readAheadQueue = new LinkedBlockingQueue<CalculateRankQueueItem>(20); // build stream vector ... Vector<Path> streamVector = buildCalculationInputStreamVector(jobLocalDir,jobWorkPath,nodeIndex,slaveCount,iterationNumber); // construct a reader ... final SortedPRInputReader reader = new SortedPRInputReader(conf,fs,streamVector,true); Thread readerThread = new Thread(new Runnable() { @Override public void run() { IOException exceptionOut = null; try { TargetAndSources target = null; while ((target = reader.readNextTarget()) != null) { try { readAheadQueue.put(new CalculateRankQueueItem(target)); } catch (InterruptedException e) { } } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); exceptionOut = e; } finally { if (reader != null) { reader.close(); } } try { readAheadQueue.put(new CalculateRankQueueItem(exceptionOut)); } catch (InterruptedException e1) { } } }); readerThread.start(); int failedUpdates = 0; int totalUpdates = 0; long iterationStart = System.currentTimeMillis(); boolean cancelled = false; while (!cancelled) { CalculateRankQueueItem queueItem = null; try { queueItem = readAheadQueue.take(); } catch (InterruptedException e) { } if (queueItem._next != null) { totalUpdates++; //LOG.info("Target: DomainHash:" + target.target.getDomainHash() + " URLHash:" + target.target.getUrlHash() + " ShardIdx:" + ((target.target.hashCode() & Integer.MAX_VALUE) % CrawlEnvironment.PR_NUMSLAVES)); // now accumulate rank from stream into value map if (!accumulateRank(valueMap,queueItem._next,superDomainFilter)) { failedUpdates++; LOG.error("**TotalUpdates:" + totalUpdates + " Failed Updates:" + failedUpdates); } if ((totalUpdates + failedUpdates) % 10000 == 0) { float percentComplete = (float) reader._totalBytesRead / (float)reader._totalBytesToRead; if (progressAndCancelCallback != null) { cancelled = progressAndCancelCallback.updateProgress(percentComplete); if (cancelled) { LOG.info("Cancel check callback returned true"); } } long timeEnd = System.currentTimeMillis(); int milliseconds = (int)(timeEnd - iterationStart); //LOG.info("Accumulate PR for 10000 Items Took:" + milliseconds + " Milliseconds QueueSize:" + readAheadQueue.size()); iterationStart = System.currentTimeMillis(); } } else { if (queueItem._e != 
null) { LOG.error(CCStringUtils.stringifyException(queueItem._e)); throw queueItem._e; } else { // now finally pagerank value in value map ... valueMap.finalizePageRank(); } break; } } try { readerThread.join(); } catch (InterruptedException e) { } } private static boolean accumulateRank(PRValueMap valueMap, TargetAndSources target,SuperDomainFilter superDomainFilter) throws IOException { float rank = 0.0f; //LOG.info("Accumulating Rank for DomainHash:" + target.target.getDomainHash() + " URLFP:" + target.target.getUrlHash()); for (Map.Entry<Long, RootDomain> entry : target.sources.entrySet()) { // ok first figure out if this is a super domain boolean rootIsSuperDomain = (superDomainFilter != null && superDomainFilter.filterItemByHashIdV2(entry.getKey()) == FilterResult.Filter_Accept); RootDomain rootDomain = entry.getValue(); if (!rootIsSuperDomain) { float accumulator = 0.0f; /* if (rootDomain.subDomains.size() > 1) { LOG.info("Non-Super-Domain:" + entry.getKey() + " has " + rootDomain.subDomains.size() + " subdomains"); } */ int subDomainsIterated = 0; for (DomainHashAndPRValue source : rootDomain.subDomains.values()) { ++subDomainsIterated; /* if (rootDomain.subDomains.size() > 1) { LOG.info("Taking Max Between CurrentValue:" + maxSourceValue + " and current SubDomain:" + source._prValue); } */ accumulator += source.averageValue(); if (subDomainsIterated > 100) break; } if (subDomainsIterated != 0) { rank += accumulator / (float) subDomainsIterated; } } else { /* if (rootDomain.subDomains.size() > 1) { LOG.info("Super-Domain:" + entry.getKey() + " has " + rootDomain.subDomains.size() + " subdomains"); } */ // ok walk items in collection (which are sorted by domain id) for (DomainHashAndPRValue source : rootDomain.subDomains.values()) { /* if (rootDomain.subDomains.size() > 1) { LOG.info("Adding SubDomain:" + source._domainHash + " value:" + source._prValue + " to existing value:" + rank); } */ rank += source.averageValue(); } } } try { // update page rank for item in map valueMap.addPRValue(target.target, rank); return true; } catch (IOException e) { return false; } } private static class OutlinkItem { public OutlinkItem() { targetFingerprint = new URLFPV2(); sourceFingerprint = new URLFPV2(); } public OutlinkItem(IOException e) { error = e; } public URLFPV2 targetFingerprint = null; public URLFPV2 sourceFingerprint = null; public int urlCount = 0; public IOException error = null; } public interface ProgressAndCancelCheckCallback { boolean updateProgress(float percentComplete); } public static void distributeRank(final PRValueMap valueMap,final Path outlinksFile,final boolean outlinksIsRemote,File localOutputDir,String remoteOutputDir,int thisNodeIdx,int nodeCount,int iterationNumber,final ProgressAndCancelCheckCallback progressCallback)throws IOException { final Configuration conf = CrawlEnvironment.getHadoopConfig(); Vector<PRValueOutputStream> outputStreamVector = new Vector<PRValueOutputStream>(); // allocate a queue ... final LinkedBlockingQueue<OutlinkItem> queue = new LinkedBlockingQueue<OutlinkItem>(20000); try { // start the loader thread ... 
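// The loader thread is the producer half of a bounded producer/consumer pair: it
// scans the outlinks SequenceFile (key = target URLFPV2, value = source URLFPV2 plus
// the source's outlink count) and pushes OutlinkItems onto the blocking queue. The
// consumer loop further down drains the queue, computes each source's contribution
// and routes it to the per-slave output stream chosen by the target's hash,
// essentially:
//
//   float prValue = valueMap.getPRValue(item.sourceFingerprint) / (float) Math.max(item.urlCount, 1);
//   int nodeIndex = (item.targetFingerprint.hashCode() & Integer.MAX_VALUE) % nodeCount;
//   outputStreamVector.get(nodeIndex).writePRValue(item.targetFingerprint, item.sourceFingerprint, prValue);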
Thread loaderThread = new Thread( new Runnable() { final BytesWritable key= new BytesWritable(); final BytesWritable value = new BytesWritable(); final DataInputBuffer keyStream = new DataInputBuffer(); final DataInputBuffer valueStream = new DataInputBuffer(); @Override public void run() { LOG.info("Opening Outlinks File at:" + outlinksFile); SequenceFile.Reader reader = null; try { FileSystem fsForOutlinksFile = null; if (outlinksIsRemote) { fsForOutlinksFile = CrawlEnvironment.getDefaultFileSystem(); } else { fsForOutlinksFile = FileSystem.getLocal(conf); } FileStatus outlinksFileStatus = fsForOutlinksFile.getFileStatus(outlinksFile); long bytesToReadTotal = (outlinksFileStatus != null) ? outlinksFileStatus.getLen() : 0; reader = new SequenceFile.Reader(fsForOutlinksFile,outlinksFile,conf); OutlinkItem item = new OutlinkItem(); int itemCount = 0; boolean isCancelled = false; while (!isCancelled && reader.next(key,value)) { keyStream.reset(key.getBytes(),0,key.getLength()); valueStream.reset(value.getBytes(),0,value.getLength()); //populate item from data readURLFPFromStream(keyStream, item.targetFingerprint); item.urlCount = readURLFPAndCountFromStream(valueStream, item.sourceFingerprint); try { long blockTimeStart = System.currentTimeMillis(); queue.put(item); long blockTimeEnd = System.currentTimeMillis(); } catch (InterruptedException e) { } item = new OutlinkItem(); if (itemCount++ %10000 == 0 && progressCallback != null) { float percentComplete = (float)reader.getPosition() / (float)bytesToReadTotal; if (progressCallback.updateProgress(percentComplete)) { LOG.info("Cancel check callback returned true.Cancelling outlink item load"); isCancelled = true; } } } item.sourceFingerprint = null; item.targetFingerprint = null; // add empty item try { if (!isCancelled) { queue.put(item); } else { queue.put(new OutlinkItem(new IOException("Operation Cancelled"))); } } catch (InterruptedException e) { } } catch (IOException e) { // add error item to queue. try { queue.put(new OutlinkItem(e)); } catch (InterruptedException e1) { } } finally { if (reader != null) try { reader.close(); } catch (IOException e) { } } } }); loaderThread.start(); // first things first ... initialize output stream vector FileSystem fileSystem = buildDistributionOutputStreamVector(true,getOutlinksBaseName(thisNodeIdx,iterationNumber),localOutputDir,remoteOutputDir,thisNodeIdx,nodeCount,outputStreamVector); try { // open outlinks file . 
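// Consumer note: the loop below batch-drains the queue with drainTo() rather than
// calling take() per item. drainTo() does not block, so when the queue is momentarily
// empty the loop simply spins until the loader thread enqueues more work, the EOF
// marker (an item with null fingerprints), or an error item carrying an IOException.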
LOG.info("Iterating Items in Outlinks File and Writing Test Value"); int itemCount = 0; int totalOutlinkCount = 0; int iterationOutlinkCount = 0; long iterationStart = System.currentTimeMillis(); long timeStart = iterationStart; boolean done = false; ArrayList<OutlinkItem> items = new ArrayList<OutlinkItem>(); // start iterating outlinks while(!done) { //OutlinkItem item = null; //try { long waitTimeStart = System.currentTimeMillis(); queue.drainTo(items); long waitTimeEnd = System.currentTimeMillis(); //} catch (InterruptedException e) { //} for (OutlinkItem item : items) { if (item.error != null) { LOG.info("Loader Thread Returned Error:" + CCStringUtils.stringifyException(item.error)); throw item.error; } else if (item.sourceFingerprint == null) { LOG.info("Loader Thread Indicated EOF via emtpy item"); done = true; } else { ++itemCount; /* LOG.info("SourceFP-DomainHash:" + item.sourceFingerprint.getDomainHash() + " URLHash:" + item.sourceFingerprint.getUrlHash() + " PartitionIdx:" + ((item.sourceFingerprint.hashCode() & Integer.MAX_VALUE) % CrawlEnvironment.PR_NUMSLAVES) ); */ // now get pr value for fingerprint (random seek in memory here!!!) float prValue = valueMap.getPRValue(item.sourceFingerprint) / (float) Math.max(item.urlCount,1); // write value out int nodeIndex = (item.targetFingerprint.hashCode() & Integer.MAX_VALUE) % nodeCount; outputStreamVector.get(nodeIndex).writePRValue(item.targetFingerprint,item.sourceFingerprint,prValue); if (itemCount % 10000 == 0) { long timeEnd = System.currentTimeMillis(); int milliseconds = (int)(timeEnd - iterationStart); LOG.info("Distribute PR for 10000 Items with:" + iterationOutlinkCount + " Outlinks Took:" + milliseconds + " Milliseconds" + " QueueCount:" + queue.size() ); iterationStart = System.currentTimeMillis(); totalOutlinkCount += iterationOutlinkCount; iterationOutlinkCount = 0; } } } items.clear(); } totalOutlinkCount += iterationOutlinkCount; LOG.info("Distribute Finished for a total of:" + itemCount + " Items with:" + totalOutlinkCount + " Outlinks Took:" + (System.currentTimeMillis() - timeStart) + " Milliseconds" ); LOG.info("Waiting for Loader Thread to Die"); try { loaderThread.join(); } catch (InterruptedException e) { } LOG.info("Loader Thread Died - Moving on..."); } finally { for (PRValueOutputStream info : outputStreamVector) { if (info != null) { info.close(false); } } if (fileSystem != null) { fileSystem.close(); } } } catch (IOException e) { LOG.error("Exception caught while distributing outlinks:" + CCStringUtils.stringifyException(e)); throw e; } } @Test public void testname() throws Exception { int array[] = { 2,3,5,7,10 }; System.out.println("searching for 1 returned:" + findPos(array,1)); System.out.println("searching for 2 returned:" + findPos(array,2)); System.out.println("searching for 11 returned:" + findPos(array,11)); System.out.println("searching for 8 returned:" + findPos(array,8)); } public static void main(String[] args) { LOG.info("Initializing Hadoop Config"); Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("core-site.xml"); conf.addResource("hdfs-site.xml"); conf.addResource("mapred-site.xml"); CrawlEnvironment.setHadoopConfig(conf); CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn02:9000/"); if (args[0].equals("PRValueRW")) { runPRValueReadWriteTest(args); } else if (args[0].equals("IDRead")) { runIDReadBenchmark(args); } else if (args[0].equals("DRank")) { runDistributeRankBenchmark(args); } else if 
(args[0].equals("ARank")) { runAccumulateRankBechmark(args); } else if (args[0].equals("BlockFileRcv")) { LOG.info("Running BlockFileReceiver test"); runBlockFileReceiverTest(); } } private static void runBlockFileReceiverTest() { PRValueBlockWriterAndReceiverTester.runTest(); } private static void runIDReadBenchmark(String[] args) { File idsFile = new File(args[1]); URLFPV2 fingerPrint = new URLFPV2(); LOG.info("Opening ID File at path:" + idsFile.getAbsolutePath()); RandomAccessFile stream = null; try { stream = new RandomAccessFile(idsFile,"r"); long length = stream.length(); int idCount = 0; long totalStartTime = System.currentTimeMillis(); long snapshotTime = System.currentTimeMillis(); boolean error = false; while (!error) { fingerPrint.readFields(stream); ++idCount; if (idCount % 10000 == 0) { LOG.info("Read 10000 ids in:" + (System.currentTimeMillis() - snapshotTime) + " MS"); snapshotTime = System.currentTimeMillis(); } } LOG.info("Completed Reading a Total of:" + idCount + " IDs in:" + (System.currentTimeMillis() - totalStartTime) + " MS"); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } finally { if (stream != null) { try { stream.close(); } catch (IOException e) { } } } } private static void runPRValueReadWriteTest(String[] args) { Configuration conf = CrawlEnvironment.getHadoopConfig(); File valueFile = new File(args[1]); File rangeFile = new File(args[2]); File outlinksFile = new File(args[3]); PRValueMap valueMap = new PRValueMap(); try { valueMap.open(FileSystem.getLocal(conf),new Path(valueFile.getAbsolutePath()), new Path(rangeFile.getAbsolutePath())); // valueMap.dumpRangeItems(); LOG.info("Opening Outlinks File at:" + outlinksFile); SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(conf),new Path(outlinksFile.getPath()),conf); LOG.info("Iterating Items in Outlinks File and Writing Test Value"); URLFPV2 fingerprint = new URLFPV2(); CompressedOutlinkList outlinkList = new CompressedOutlinkList(); int itemCount = 0; long valueWriteStart = System.currentTimeMillis(); long timeStart = valueWriteStart; while (reader.next(fingerprint,outlinkList)) { ++itemCount; // LOG.info("Got Item with Domain Hash:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash()); // get pr value for item ... //TODO: SWITCH TO INT FOR TEST // valueMap.setPRValue(fingerprint,itemCount % Short.MAX_VALUE); valueMap.setPRValue(fingerprint,itemCount % Integer.MAX_VALUE); // LOG.info("Get PRValue returned:" + prValue); fingerprint.clear(); outlinkList.clear(); if (itemCount % 10000 == 0) { LOG.info("Wrote 10000 Items in:" + (System.currentTimeMillis() - timeStart) + " Milliseconds"); timeStart = System.currentTimeMillis(); } } LOG.info("Done Writing Values. Took:" + (System.currentTimeMillis() - valueWriteStart) + " Milliseconds"); valueFile.delete(); OutputStream stream = null; try { stream = new FileOutputStream(valueFile); // flush stuff to disk valueMap.flush(stream); } finally { if (stream != null) stream.close(); } LOG.info("Opening Outlinks File at:" + args[2]); reader = new SequenceFile.Reader(FileSystem.getLocal(conf),new Path(args[2]),conf); LOG.info("Iterating Items in Outlinks File and Reading Test Value"); itemCount = 0; long valueReadStart = System.currentTimeMillis(); timeStart = valueWriteStart; while (reader.next(fingerprint,outlinkList)) { ++itemCount; // LOG.info("Got Item with Domain Hash:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash()); // get pr value for item ... 
float prValue = valueMap.getPRValue(fingerprint); // LOG.info("Get PRValue returned:" + prValue); //TODO: SWITCH TO INT FOR TEST //if (prValue != (itemCount % Short.MAX_VALUE)) { if (prValue != (itemCount % Integer.MAX_VALUE)) { //TODO: SWITCH TO INT FOR TEST // throw new IOException("PRValue did not match for item:" + itemCount + " Expected:" + (itemCount % Short.MAX_VALUE) + " Got:" + prValue); throw new IOException("PRValue did not match for item:" + itemCount + " Expected:" + (itemCount % Integer.MAX_VALUE) + " Got:" + prValue); } fingerprint.clear(); outlinkList.clear(); if (itemCount % 10000 == 0) { LOG.info("Read 10000 Items in:" + (System.currentTimeMillis() - timeStart) + " Milliseconds"); timeStart = System.currentTimeMillis(); } } LOG.info("Done Reading Values. Took:" + (System.currentTimeMillis() - valueReadStart) + " Milliseconds"); valueMap.close(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } private static void runAccumulateRankBechmark(String args[]){ LOG.info("Initializing Hadoop Config"); Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("mapred-site.xml"); conf.addResource("hdfs-site.xml"); conf.addResource("commoncrawl-default.xml"); conf.addResource("commoncrawl-site.xml"); CrawlEnvironment.setHadoopConfig(conf); CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn01:9000/"); try { Path valueFile = new Path(args[1]); Path rangeFile = new Path(args[2]); Path outlinksFile = new Path(args[3]); //Path outputDir = new Path(args[4]); String remoteOutputDir = args[4]; LOG.info("ValuesFile:" + valueFile); LOG.info("RangeFile:" + rangeFile); LOG.info("OutlinksFile:" + outlinksFile); LOG.info("RemoteOutputDir:" + remoteOutputDir); LOG.info("Initializing SuperDomain Filter"); SuperDomainFilter superDomainFilter = new SuperDomainFilter(); superDomainFilter.loadFromPath(new InetSocketAddress("10.0.20.21",CrawlEnvironment.DIRECTORY_SERVICE_RPC_PORT).getAddress(),CrawlEnvironment.ROOT_SUPER_DOMAIN_PATH, false); LOG.info("Loaded SuperDomain Filter"); int thisNodeIdx = 0; int totalNodeCount = CrawlEnvironment.PR_NUMSLAVES; FileSystem fs = FileSystem.get(conf); PRValueMap valueMap = new PRValueMap(); LOG.info("Initializing Value Map"); valueMap.open(FileSystem.get(conf),valueFile, rangeFile); LOG.info("Initialized Value Map"); LOG.info("Calculating Rank"); long timeStart = System.currentTimeMillis(); calculateRank(conf,fs,valueMap, null, remoteOutputDir, 0, totalNodeCount, 0, superDomainFilter, null); long timeEnd = System.currentTimeMillis(); LOG.info("Done Calculating Rank. 
Took:" + (timeEnd-timeStart)); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } public static final int readURLFPAndCountFromStream(DataInput input,URLFPV2 fpOut)throws IOException { fpOut.setDomainHash(input.readLong()); fpOut.setRootDomainHash(input.readLong()); fpOut.setUrlHash(input.readLong()); return WritableUtils.readVInt(input); } public static final void writeURLFPAndCountToStream(DataOutput stream,URLFPV2 key,int urlCount)throws IOException { stream.writeLong(key.getDomainHash()); stream.writeLong(key.getRootDomainHash()); stream.writeLong(key.getUrlHash()); WritableUtils.writeVInt(stream, urlCount); } public static final void readURLFPFromStream(DataInput input,URLFPV2 fpOut)throws IOException { fpOut.setDomainHash(input.readLong()); fpOut.setRootDomainHash(input.readLong()); fpOut.setUrlHash(input.readLong()); } public static final void writeURLFPToStream(DataOutput stream,URLFPV2 key)throws IOException { stream.writeLong(key.getDomainHash()); stream.writeLong(key.getRootDomainHash()); stream.writeLong(key.getUrlHash()); } private static void runDistributeRankBenchmark(String args[]){ LOG.info("Initializing Hadoop Config"); Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("mapred-site.xml"); conf.addResource("hdfs-site.xml"); conf.addResource("commoncrawl-default.xml"); conf.addResource("commoncrawl-site.xml"); CrawlEnvironment.setHadoopConfig(conf); CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn01:9000/"); try { Path valueFile = new Path(args[1]); Path rangeFile = new Path(args[2]); Path outlinksFile = new Path(args[3]); //Path outputDir = new Path(args[4]); String remoteOutputDir = args[4]; LOG.info("ValuesFile:" + valueFile); LOG.info("RangeFile:" + rangeFile); LOG.info("OutlinksFile:" + outlinksFile); LOG.info("RemoteOutputDir:" + remoteOutputDir); int thisNodeIdx = 0; int totalNodeCount = CrawlEnvironment.PR_NUMSLAVES; FileSystem fs = FileSystem.get(conf); PRValueMap valueMap = new PRValueMap(); valueMap.open(FileSystem.get(conf),valueFile, rangeFile); fs.mkdirs(new Path(remoteOutputDir)); fs.delete(new Path(remoteOutputDir,"*"),false); //File localOutputFile = new File(localOutputDir,getOutlinksBaseName(0,0) + "-" + NUMBER_FORMAT.format(0)); //localOutputFile.delete(); distributeRank(valueMap,outlinksFile,true, null,remoteOutputDir, thisNodeIdx, totalNodeCount,0,null); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } }