/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.hadoop.io;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.commoncrawl.util.ArcFileReader;
/**
 * A Hadoop {@link RecordReader} for reading {@link ARCSplit}s.
*
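 * <p>
 * A minimal usage sketch (the wiring shown is assumed; an ARC
 * {@code InputFormat} would normally construct the reader inside its
 * {@code getRecordReader()} implementation):
 *
 * <pre>
 *   // the 32 KB blockSize here is an arbitrary example value
 *   ARCSplitReader reader = new ARCSplitReader(job, arcSplit, arcSource, 32 * 1024);
 *   Text key = reader.createKey();
 *   ArcFileItem value = reader.createValue();
 *   while (reader.next(key, value)) {
 *     // key holds the record uri, value the parsed ARC record
 *   }
 *   reader.close();
 * </pre>
 *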
* @author Albert Chern
*/
public class ARCSplitReader implements RecordReader<Text, ArcFileItem> {
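  /**
   * Background thread that streams each resource in the split into its
   * corresponding {@link ArcFileReader}, asking the {@link ARCSource} to
   * reopen the stream at the last read position after a failure.
   */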
private class IOThread extends Thread {
@Override
public void run() {
for (int i = 0; i < readers.length; i++) {
ARCResource resource = split.getResources()[i];
ArcFileReader reader = readers[i];
InputStream stream = null;
long streamPosition = 0;
int failures = 0;
Throwable lastError = null;
while (true) {
try {
stream = source.getStream(resource.getName(), streamPosition, lastError, failures);
if (stream == null) {
// The ARCSource is telling us to fail
LOG.fatal("IOThread exiting", lastError);
error = lastError;
return;
}
ReadableByteChannel channel = Channels.newChannel(stream);
while (true) {
// Just keep passing buffers to the reader. The
// reader is supposed to limit the number of buffers
// it will accept and block when it is at the limit.
ByteBuffer buffer = ByteBuffer.allocate(blockSize);
int bytesRead = channel.read(buffer);
if (bytesRead > 0) {
streamPosition += bytesRead;
totalBytesRead += bytesRead;
buffer.flip();
reader.available(buffer);
} else if (bytesRead == -1) {
// Create the next reader before closing this
// one so that we don't get a race condition
// where the other thread tries to access the
// next reader before it is instantiated.
if (i + 1 < readers.length) {
readers[i + 1] = new ArcFileReader();
}
reader.finished();
break;
}
}
break;
} catch (Throwable t1) {
lastError = t1;
failures++;
} finally {
try {
if (stream != null) {
stream.close();
}
            } catch (Throwable t2) {
              // ignore errors while closing the stream
            }
stream = null;
}
}
}
}
}
private static final Log LOG = LogFactory.getLog(ARCSplitReader.class);
public static final String SPLIT_DETAILS = "arc.reader.split.details";
  private ARCSplit split;
  private ARCSource source;
  private int blockSize;
  private ArcFileReader[] readers;
  private int readerIndex;
  // written by the IOThread and read by the consumer thread, so these
  // must be volatile for cross-thread visibility
  private volatile long totalBytesRead;
  private volatile Throwable error;
  /**
   * Creates a new <tt>ARCSplitReader</tt>.
   *
   * @param job
   *          the {@link JobConf} of the running job, in which the split
   *          details are recorded for debugging
   * @param split
   *          the {@link ARCSplit} to read
   * @param source
   *          the {@link ARCSource} to open resources with
   * @param blockSize
   *          the number of bytes at a time to read from each input stream
   */
public ARCSplitReader(JobConf job, ARCSplit split, ARCSource source, int blockSize) {
this.split = split;
// record split details in job config for debugging purposes ...
job.set(SPLIT_DETAILS, split.toString());
this.source = source;
this.blockSize = blockSize;
this.readers = new ArcFileReader[split.getResources().length];
this.readers[0] = new ArcFileReader();
this.readerIndex = 0;
this.totalBytesRead = 0;
this.error = null;
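    // start the background thread that streams the split's resources into
    // the readers as they are consumed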
new IOThread().start();
}
  /**
   * {@inheritDoc}
   */
  public void close() throws IOException {
    // no-op: the IOThread exits on its own once every resource in the
    // split has been read or a fatal error occurs
  }
  /**
   * {@inheritDoc}
   */
public Text createKey() {
return new Text();
}
  /**
   * {@inheritDoc}
   */
public ArcFileItem createValue() {
return new ArcFileItem();
}
  /**
   * {@inheritDoc}
   */
public long getPos() throws IOException {
return totalBytesRead;
}
  /**
   * {@inheritDoc}
   */
  public float getProgress() throws IOException {
    // clamp so that a byte count which overshoots the split's nominal
    // length cannot report progress greater than 1.0
    return Math.min(1.0f, totalBytesRead / (float) split.getLength());
  }
/**
* Gets the next {@link ArcFileItem} from the split.
*
* @param item
* the {@link ArcFileItem} to populate
*
* @return <tt>true</tt> if a new item was read, or <tt>false</tt> if there
* are no more
*
* @throws IOException
* if an IO error or timeout occurs
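   *
   * <p>
   * Sketch of direct use (Hadoop itself drives the two-argument
   * {@link #next(Text, ArcFileItem)} variant instead):
   *
   * <pre>
   *   ArcFileItem item = reader.createValue();
   *   while (reader.next(item)) {
   *     // item.getUri() and item.getArcFileName() identify the record
   *   }
   * </pre>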
*/
public boolean next(ArcFileItem item) throws IOException {
while (readerIndex < readers.length) {
if (readers[readerIndex].hasMoreItems()) {
// populate arc file path in item
item.setArcFileName(split.getResources()[readerIndex].getName());
try {
// and then delegate to reader instance
readers[readerIndex].getNextItem(item);
        } catch (IOException e) {
          LOG.error("IOException in ARCSplitReader.next(). ArcFile:" + item.getArcFileName()
              + "\nException:" + StringUtils.stringifyException(e));
          throw e;
        } catch (Exception e) {
          LOG.error("Unexpected exception in ARCSplitReader.next(). ArcFile:" + item.getArcFileName()
              + "\nException:" + StringUtils.stringifyException(e));
          throw new RuntimeException(e);
        }
return true;
} else {
readers[readerIndex++] = null;
}
}
return false;
}
  /**
   * {@inheritDoc}
   */
public boolean next(Text key, ArcFileItem value) throws IOException {
if (next(value)) {
      // set the key from the item's uri ...
      key.set(value.getUri());
      // clear the uri field's dirty flag since it is now redundant with the
      // key; this means the uri will not be serialized with the rest of the
      // item's data ...
      value.setFieldClean(ArcFileItem.Field_URI);
return true;
}
return false;
}
}