/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.tools.arc;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.zip.GZIPInputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;
/**
 * <p>The <code>ArcRecordReader</code> class provides a record reader which
 * reads records from arc files.</p>
*
* <p>Arc files are essentially tars of gzips. Each record in an arc file is
* a compressed gzip. Multiple records are concatenated together to form a
* complete arc. For more information on the arc file format see
* {@link http://www.archive.org/web/researcher/ArcFileFormat.php}.</p>
*
* <p>Arc files are used by the internet archive and grub projects.</p>
*
* @see http://www.archive.org/
* @see http://www.grub.org/
*/
public class ArcRecordReader
implements RecordReader<Text, BytesWritable> {
public static final Log LOG = LogFactory.getLog(ArcRecordReader.class);
protected Configuration conf;
protected long splitStart = 0;
protected long pos = 0;
protected long splitEnd = 0;
protected long splitLen = 0;
protected long fileLen = 0;
protected FSDataInputStream in;
private static byte[] MAGIC = {(byte)0x1F, (byte)0x8B};
/**
* <p>Returns true if the byte array passed matches the gzip header magic
* number.</p>
*
* @param input The byte array to check.
*
* @return True if the byte array matches the gzip header magic number.
*/
public static boolean isMagic(byte[] input) {
// check for null and incorrect length
if (input == null || input.length != MAGIC.length) {
return false;
}
// check byte by byte
for (int i = 0; i < MAGIC.length; i++) {
if (MAGIC[i] != input[i]) {
return false;
}
}
// must match
return true;
}
/**
* Constructor that sets the configuration and file split.
*
* @param conf The job configuration.
* @param split The file split to read from.
*
* @throws IOException If an IO error occurs while initializing file split.
*/
public ArcRecordReader(Configuration conf, FileSplit split)
throws IOException {
Path path = split.getPath();
FileSystem fs = path.getFileSystem(conf);
fileLen = fs.getFileStatus(split.getPath()).getLen();
this.conf = conf;
this.in = fs.open(split.getPath());
this.splitStart = split.getStart();
this.splitEnd = splitStart + split.getLength();
this.splitLen = split.getLength();
in.seek(splitStart);
}
/**
* Closes the record reader resources.
*/
public void close()
throws IOException {
this.in.close();
}
/**
* Creates a new instance of the <code>Text</code> object for the key.
*/
public Text createKey() {
return (Text)ReflectionUtils.newInstance(Text.class, conf);
}
/**
* Creates a new instance of the <code>BytesWritable</code> object for the key
*/
public BytesWritable createValue() {
return (BytesWritable)ReflectionUtils.newInstance(BytesWritable.class, conf);
}
/**
* Returns the current position in the file.
*
* @return The long of the current position in the file.
*/
public long getPos()
throws IOException {
return in.getPos();
}
/**
* Returns the percentage of progress in processing the file. This will be
* represented as a float from 0 to 1 with 1 being 100% completed.
*
* @return The percentage of progress as a float from 0 to 1.
*/
public float getProgress()
throws IOException {
// if we haven't even started
if (splitEnd == splitStart) {
return 0.0f;
}
else {
// the progress is current pos - where we started / length of the split
return Math.min(1.0f, (getPos() - splitStart) / (float)splitLen);
}
}
/**
* <p>Returns true if the next record in the split is read into the key and
* value pair. The key will be the arc record header and the values will be
* the raw content bytes of the arc record.</p>
*
* @param key The record key
* @param value The record value
*
* @return True if the next record is read.
*
* @throws IOException If an error occurs while reading the record value.
*/
public boolean next(Text key, BytesWritable value)
throws IOException {
try {
// get the starting position on the input stream
long startRead = in.getPos();
byte[] magicBuffer = null;
// we need this loop to handle false positives in reading of gzip records
while (true) {
// while we haven't passed the end of the split
if (startRead >= splitEnd) {
return false;
}
// scanning for the gzip header
boolean foundStart = false;
while (!foundStart) {
// start at the current file position and scan for 1K at time, break
// if there is no more to read
startRead = in.getPos();
magicBuffer = new byte[1024];
int read = in.read(magicBuffer);
if (read < 0) {
break;
}
// scan the byte array for the gzip header magic number. This happens
// byte by byte
for (int i = 0; i < read - 1; i++) {
byte[] testMagic = new byte[2];
System.arraycopy(magicBuffer, i, testMagic, 0, 2);
if (isMagic(testMagic)) {
// set the next start to the current gzip header
startRead += i;
foundStart = true;
break;
}
}
}
// seek to the start of the gzip header
in.seek(startRead);
ByteArrayOutputStream baos = null;
int totalRead = 0;
try {
// read 4K of the gzip at a time putting into a byte array
byte[] buffer = new byte[4096];
GZIPInputStream zin = new GZIPInputStream(in);
int gzipRead = -1;
baos = new ByteArrayOutputStream();
while ((gzipRead = zin.read(buffer, 0, buffer.length)) != -1) {
baos.write(buffer, 0, gzipRead);
totalRead += gzipRead;
}
}
catch (Exception e) {
// there are times we get false positives where the gzip header exists
// but it is not an actual gzip record, so we ignore it and start
// over seeking
System.out.println("Ignoring position: " + (startRead));
if (startRead + 1 < fileLen) {
in.seek(startRead + 1);
}
continue;
}
// change the output stream to a byte array
byte[] content = baos.toByteArray();
// the first line of the raw content in arc files is the header
int eol = 0;
for (int i = 0; i < content.length; i++) {
if (i > 0 && content[i] == '\n') {
eol = i;
break;
}
}
// create the header and the raw content minus the header
String header = new String(content, 0, eol).trim();
byte[] raw = new byte[(content.length - eol) - 1];
System.arraycopy(content, eol + 1, raw, 0, raw.length);
// populate key and values with the header and raw content.
Text keyText = (Text)key;
keyText.set(header);
BytesWritable valueBytes = (BytesWritable)value;
valueBytes.set(raw, 0, raw.length);
// TODO: It would be best to start at the end of the gzip read but
// the bytes read in gzip don't match raw bytes in the file so we
// overshoot the next header. With this current method you get
// some false positives but don't miss records.
if (startRead + 1 < fileLen) {
in.seek(startRead + 1);
}
// populated the record, now return
return true;
}
}
catch (Exception e) {
LOG.equals(StringUtils.stringifyException(e));
}
// couldn't populate the record or there is no next record to read
return false;
}
}