/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.hadoop.mergeutils;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.util.StringUtils;

/**
 * An input source that reads every record from a list of SequenceFiles and
 * hands each key/value pair to a {@link SpillWriter}.
 *
 * @author rana
 *
 * @param <KeyType>
 *          key type stored in the sequence files; must have a no-argument
 *          constructor (it may be non-public)
 * @param <ValueType>
 *          value type stored in the sequence files; must have a no-argument
 *          constructor (it may be non-public)
 */
public class SequenceFileReader<KeyType extends WritableComparable, ValueType extends Writable> {

  public static final Log LOG = LogFactory.getLog(SequenceFileReader.class);

  /** Parameter list used to look up the no-argument constructors. */
  private static final Class<?>[] NO_ARG_SIGNATURE = new Class<?>[0];

  FileSystem _sourceFileSystem;
  Configuration _config;
  SpillWriter<KeyType, ValueType> _writer = null;
  Vector<Path> _inputSegments = null;
  Constructor<KeyType> _keyConstructor = null;
  Constructor<ValueType> _valConstructor = null;
  /** Number of records passed to the spill writer so far. */
  long _recordCount = 0;

  /**
   * Creates a reader over the given sequence file segments.
   *
   * @param fileSystem
   *          file system the segments are opened from
   * @param conf
   *          configuration passed to each {@link SequenceFile.Reader}
   * @param inputSegments
   *          paths of the sequence files to read, in the order they are read
   * @param spillWriter
   *          destination that receives every record read
   * @param keyClass
   *          concrete key class; instantiated reflectively for each record
   * @param valueClass
   *          concrete value class; instantiated reflectively for each record
   * @throws IOException
   *           declared for callers; not thrown by the current implementation
   * @throws RuntimeException
   *           if either class has no no-argument constructor or it cannot be
   *           made accessible
   */
  public SequenceFileReader(FileSystem fileSystem, Configuration conf, Vector<Path> inputSegments,
      SpillWriter<KeyType, ValueType> spillWriter, Class<KeyType> keyClass, Class<ValueType> valueClass)
      throws IOException {

    _sourceFileSystem = fileSystem;
    _config = conf;
    _inputSegments = inputSegments;
    _writer = spillWriter;

    _keyConstructor = accessibleNoArgConstructor(keyClass);
    _valConstructor = accessibleNoArgConstructor(valueClass);
  }

  /**
   * Looks up the declared no-argument constructor of {@code type} and makes
   * it accessible. Failures are logged and rethrown as
   * {@link RuntimeException} with the original exception as cause.
   */
  private static <T> Constructor<T> accessibleNoArgConstructor(Class<T> type) {
    try {
      Constructor<T> constructor = type.getDeclaredConstructor(NO_ARG_SIGNATURE);
      constructor.setAccessible(true);
      return constructor;
    } catch (SecurityException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw new RuntimeException(e);
    } catch (NoSuchMethodException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw new RuntimeException(e);
    }
  }

  /**
   * Currently a no-op: each {@link SequenceFile.Reader} is opened and closed
   * inside {@link #readAndSpill()}, so there is nothing left to release here.
   *
   * @throws IOException
   *           declared for callers; not thrown by the current implementation
   */
  public void close() throws IOException {
  }

  /**
   * Reads every input segment in order and passes each record to the spill
   * writer, incrementing the record count for each record spilled.
   *
   * @throws IOException
   *           if opening, reading or closing a segment fails, or if the spill
   *           writer reports an I/O error
   * @throws RuntimeException
   *           if a key or value instance cannot be created
   */
  @SuppressWarnings("unchecked")
  public void readAndSpill() throws IOException {
    for (Path sequenceFilePath : _inputSegments) {
      spillSegment(sequenceFilePath);
    }
  }

  /**
   * Reads a single sequence file to its end, spilling every record, and
   * always closes the underlying reader.
   */
  @SuppressWarnings("unchecked")
  private void spillSegment(Path sequenceFilePath) throws IOException {
    // allocate a reader for the current path
    SequenceFile.Reader reader = new SequenceFile.Reader(_sourceFileSystem, sequenceFilePath, _config);
    try {
      while (true) {
        // A fresh key/value pair is created for every record rather than
        // reused -- presumably because the spill writer may hold on to the
        // objects it is given; confirm before changing this to reuse.
        KeyType key;
        ValueType value;
        try {
          key = _keyConstructor.newInstance();
          value = _valConstructor.newInstance();
        } catch (Exception e) {
          LOG.error("Failed to create key or value type with Exception:" + StringUtils.stringifyException(e));
          throw new RuntimeException(e);
        }

        // next() returns false once the end of the file has been reached
        if (!reader.next(key, value)) {
          break;
        }

        _recordCount++;
        _writer.spillRecord(key, value);
      }
    } finally {
      reader.close();
    }
  }
}