/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.io.File;
import java.io.IOException;
import java.util.concurrent.Semaphore;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.commoncrawl.async.Callback;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.async.Timer;
import org.commoncrawl.common.Environment;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;

import com.hadoop.compression.lzo.LzoCodec;

/**
 * Used to push stats to HDFS in a consistent manner. Stats are written to a
 * local temp file in the form of a Hadoop sequence file, and are flushed to
 * HDFS when the writer is closed. Each stat entry consists of a key and a
 * value pair.
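 * Each entry is appended asynchronously on a dedicated writer thread (via the
 * internal EventLoop), and on close the local file is copied to HDFS under
 * Environment.HDFS_LOGCOLLECTOR_BASEDIR/family/groupingKey/uniqueKey.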
 * The key must implement WritableComparable and the value must implement
 * Writable.
 *
 * @author rana
 *
 */
public class MapReduceJobStatsWriter<KeyType extends WritableComparable, ValueType extends Writable> {

  private static final Log LOG = LogFactory.getLog(MapReduceJobStatsWriter.class);

  /** log family type **/
  private String _logFamily;
  /** grouping key **/
  private String _groupingKey;
  /** log file key **/
  private long _uniqueKey;
  /** the temp file stats writer object **/
  private SequenceFile.Writer _writer = null;
  /** remote file system instance **/
  FileSystem _remoteFileSystem;
  /** temp file name **/
  private File _tempFileName;
  /** output stream the sequence file is writing to **/
  private FSDataOutputStream _outputStream;
  /** hadoop config **/
  Configuration _config;
  /** last log write exception (written on the writer thread, read on the caller thread) **/
  private volatile IOException _lastLogWriteException;
  /** log file entry count **/
  private int _entryCount = 0;

  /** internal class used to queue up log file write requests **/
  private static class LogFileItem<KeyType extends WritableComparable, ValueType extends Writable> {

    LogFileItem(KeyType key, ValueType value) {
      _key = key;
      _value = value;
    }

    LogFileItem() {
      _key = null;
      _value = null;
    }

    public KeyType _key;
    public ValueType _value;
  }

  /** the log writer thread event loop **/
  EventLoop _eventLoop = new EventLoop();

  /**
   * Constructor
   *
   * @param keyClass key type
   * @param valueClass value type
   * @param familyKey
   * @param groupingKey
   * @param uniqueKey
   */
  public MapReduceJobStatsWriter(FileSystem remoteFileSystem, Configuration config,
      Class<KeyType> keyClass, Class<ValueType> valueClass, String familyKey,
      String groupingKey, long uniqueKey) throws IOException {

    _logFamily = familyKey;
    _groupingKey = groupingKey;
    _uniqueKey = uniqueKey;
    _remoteFileSystem = remoteFileSystem;
    _config = config;

    // temp file
    _tempFileName = File.createTempFile("statsWriter", "seq");
    // create output stream that sequence file writer will output to
    _outputStream = FileSystem.getLocal(_config).create(new Path(_tempFileName.getAbsolutePath()));

    LzoCodec codec = new LzoCodec();
    // make sure the codec sees the job configuration (LzoCodec is Configurable)
    codec.setConf(config);

    // create sequence file writer
    _writer = SequenceFile.createWriter(config, _outputStream, keyClass, valueClass, CompressionType.BLOCK, codec);

    // start event loop
    _eventLoop.start();
  }

  /** append an item to the log file **/
  public void appendLogEntry(final KeyType key, final ValueType value) throws IOException {
    if (_lastLogWriteException == null) {
      // send async message to the writer thread ...
      _eventLoop.setTimer(new Timer(0, false, new Timer.Callback() {

        @Override
        public void timerFired(Timer timer) {
          // this executes in the writer thread's context ...
          try {
            _writer.append(key, value);
            ++_entryCount;
          } catch (IOException e) {
            LOG.error("Failed to Write Log File Entry for:" + _logFamily + "/" + _groupingKey + "/"
                + Long.toString(_uniqueKey) + " Exception:" + CCStringUtils.stringifyException(e));
            _lastLogWriteException = e;
          }
        }
      }));
    } else {
      IOException e = _lastLogWriteException;
      _lastLogWriteException = null;
      throw e;
    }
  }

  /** close and flush the log file **/
  public void close(final Callback optionalAsyncCallback) {

    if (_eventLoop != null) {
      // allocate a blocking semaphore in case async callback was not specified
      final Semaphore blockingCallSemaphore = new Semaphore(0);

      // perform shutdown in worker thread ...
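      // The shutdown below runs on the event loop (writer) thread: it closes the
      // sequence file writer, flushes and closes the underlying stream, copies the
      // temp file to HDFS only if entries were written and no write error occurred,
      // then always deletes the temp file, releases the blocking semaphore, invokes
      // the optional callback, and finally stops the event loop.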
      _eventLoop.setTimer(new Timer(0, false, new Timer.Callback() {

        @Override
        public void timerFired(Timer timer) {
          try {
            try {
              if (_writer != null) {
                _writer.close();
              }
            } catch (IOException e) {
              LOG.error(CCStringUtils.stringifyException(e));
              _lastLogWriteException = e;
            } finally {
              _writer = null;

              try {
                if (_outputStream != null) {
                  _outputStream.flush();
                  _outputStream.close();
                }
              } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
                _lastLogWriteException = e;
              } finally {
                _outputStream = null;
              }
            }

            // now figure out if everything went smoothly or not
            if (_entryCount != 0 && _lastLogWriteException == null) {
              // ok so far so good... time to copy the local log file to hdfs ...
              Path hdfsPath = new Path(Environment.HDFS_LOGCOLLECTOR_BASEDIR,
                  _logFamily + "/" + _groupingKey + "/" + Long.toString(_uniqueKey));

              try {
                // delete the remote file if it exists
                _remoteFileSystem.delete(hdfsPath, false);
                // ensure parent path
                _remoteFileSystem.mkdirs(hdfsPath.getParent());
                // now if the local file exists and has data
                if (_tempFileName.exists() && _tempFileName.length() != 0) {
                  // copy the file to hdfs
                  _remoteFileSystem.copyFromLocalFile(new Path(_tempFileName.getAbsolutePath()), hdfsPath);
                }
              } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
                _lastLogWriteException = e;
              }
            }
          } finally {
            // always delete the temp file ...
            _tempFileName.delete();
            // release semaphore
            blockingCallSemaphore.release();
            // if callback was specified, call it now
            if (optionalAsyncCallback != null) {
              optionalAsyncCallback.execute();
            }
            // stop the event loop ...
            _eventLoop.stop();
            _eventLoop = null;
          }
        }
      }));

      // now if callback was not specified... wait for the blocking semaphore to signal ...
      if (optionalAsyncCallback == null) {
        blockingCallSemaphore.acquireUninterruptibly();
      }
    }
  }

  public static void main(String[] args) {

    LOG.info("Initializing Hadoop Config");

    Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("hadoop-default.xml");
    conf.addResource("hadoop-site.xml");
    conf.addResource("commoncrawl-default.xml");
    conf.addResource("commoncrawl-site.xml");

    CrawlEnvironment.setHadoopConfig(conf);
    CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn01:9000/");

    // test the stats writer ...
    try {
      LOG.info("Opening Stats Writer");
      MapReduceJobStatsWriter<IntWritable, Text> statsWriter =
          new MapReduceJobStatsWriter<IntWritable, Text>(
              CrawlEnvironment.getDefaultFileSystem(), conf,
              IntWritable.class, Text.class, "test", "group1", 12345L);

      LOG.info("Writing Entries");
      for (int i = 0; i < 1000; ++i) {
        statsWriter.appendLogEntry(new IntWritable(i), new Text("Log Entry #" + i));
      }

      LOG.info("Flushing / Closing");
      final Semaphore blockingSemaphore = new Semaphore(0);
      statsWriter.close(new Callback() {

        @Override
        public void execute() {
          LOG.info("Completion Callback Triggered");
          blockingSemaphore.release();
        }
      });

      LOG.info("Waiting on Semaphore");
      blockingSemaphore.acquireUninterruptibly();
      LOG.info("Acquired Semaphore");
      LOG.info("Closed");

      Path hdfsPath = new Path(Environment.HDFS_LOGCOLLECTOR_BASEDIR,
          "test" + "/" + "group1" + "/" + Long.toString(12345L));

      LOG.info("Opening Reader");
      SequenceFile.Reader reader =
          new SequenceFile.Reader(CrawlEnvironment.getDefaultFileSystem(), hdfsPath, conf);

      IntWritable key = new IntWritable();
      Text value = new Text();

      while (reader.next(key, value)) {
        LOG.info("Key:" + key.get() + " Value:" + value.toString());
      }
      reader.close();
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
    }
  }
}