/*******************************************************************************
* Copyright 2013
* TU Darmstadt, FG Sprachtechnologie
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.bigdata.io.hadoop;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CountingInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.jwat.arc.ArcHeader;
import org.jwat.arc.ArcReader;
import org.jwat.arc.ArcReaderFactory;
import org.jwat.arc.ArcRecordBase;
import org.jwat.common.HeaderLine;
import org.jwat.common.HttpHeader;
import org.jwat.common.Payload;
import org.jwat.common.PayloadWithHeaderAbstract;
/**
* Creates an {@link ARCRecordReader} for reading crawler archives in ARC format.
*
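* <p>
* A minimal configuration sketch (the job class, input path, and MyEncodingDetector
* below are hypothetical placeholders):
* </p>
*
* <pre>
* JobConf job = new JobConf(MyCrawlJob.class);
* job.setInputFormat(ARCInputFormat.class);
* FileInputFormat.addInputPath(job, new Path("/data/crawls/example.arc.gz"));
* // Only records whose Content-Type starts with one of these prefixes are returned
* job.set("dkpro.input.content-type-whitelist", "text/html, text/plain");
* // Optional: plug in a custom EncodingDetector implementation
* job.setClass("dkpro.input.encodingdetector", MyEncodingDetector.class, EncodingDetector.class);
* </pre>
*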
* @author Johannes Simon
*/
public class ARCInputFormat
extends FileInputFormat<Text, CrawlerRecord>
{
@Override
public RecordReader<Text, CrawlerRecord> getRecordReader(InputSplit inputSplit,
JobConf jobConf, Reporter reporter)
throws IOException
{
return new ARCRecordReader((FileSplit) inputSplit, jobConf);
}
/**
* Reads crawler archives in ARC format, record by record.
*
* @author Johannes Simon
*/
public static class ARCRecordReader
implements RecordReader<Text, CrawlerRecord>
{
// Byte range of the input split this reader is responsible for
private final long start;
private final long end;
// Counts the bytes consumed from the underlying stream; used to detect split boundaries
private final CountingInputStream fsin;
private ArcReader arcReader = null;
// Byte offset just past the most recently read record
private long lastRecordEnd = -1;
// Content-Type prefixes of records to return; all other records are skipped
private final Set<String> contentTypeWhitelist = new HashSet<String>();
private Configuration conf;
/*
* ======================== RecordReader Logic ============================
*/
/**
* Reads configuration parameters from the given JobConf/Configuration instance.
*/
void configure(Configuration conf)
{
String contentTypeWhitelistStr = conf.get(
"dkpro.input.content-type-whitelist", "text/html");
// conf.get() with a default value never returns null, so no null check is needed
String[] contentTypes = contentTypeWhitelistStr.replace(" ", "").split(",");
for (String ct : contentTypes) {
contentTypeWhitelist.add(ct);
}
this.conf = conf;
}
/**
* Creates an ARCRecordReader for the specified <code>split</code> of the input stream that
* will start at the first valid ARC record header after <code>split.getStart()</code> and
* continue until a record is read that goes past
* <code>split.getStart() + split.getLength()</code>.
*/
public ARCRecordReader(FileSplit split, JobConf jobConf)
throws IOException
{
start = split.getStart();
end = start + split.getLength();
System.out.println("========== " + start + " " + end);
configure(jobConf);
// Open the file; skipToNextRecord() below then advances to the start of the split
Path file = split.getPath();
FileSystem fs = file.getFileSystem(jobConf);
fsin = new CountingInputStream(new BufferedInputStream(fs.open(file)));
arcReader = ArcReaderFactory.getReader(fsin);
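// ArcReaderFactory auto-detects GZip compression, so plain and gzipped ARC
// files should both work here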
// Start with the first valid record after offset "start"
skipToNextRecord(start);
}
private void fillCrawlerRecord(ArcRecordBase record, CrawlerRecord crawlerRecord)
throws IOException
{
// Fill CrawlerRecord
ArcHeader header = record.header;
crawlerRecord.setURL(header.urlStr);
// Usually ARC records contain the original webpage, including markup
crawlerRecord.setIsHTML(true);
crawlerRecord.setDate(header.archiveDate);
// Usually not present in ARC records
crawlerRecord.setOriginalLanguage(null);
// It is important *not* to try to decode anything before we know the encoding.
// As soon as we put any of this record's content into a String, it is explicitly
// converted from/to some encoding. So before that happens, we have to guess the
// correct encoding, otherwise we would potentially trash our data before we even
// process it.
byte[] buffer = IOUtils.toByteArray(record.getPayloadContent());
Class<?> encodingDetectorClass = conf.getClass("dkpro.input.encodingdetector",
DummyEncodingDetector.class);
try {
EncodingDetector encodingDetector = (EncodingDetector)encodingDetectorClass.newInstance();
// This is the encoding we'll use to decode the bytes into text
String encoding = encodingDetector.getBestEncoding(buffer);
crawlerRecord.setOriginalEncoding(encoding);
crawlerRecord.setContent(new String(buffer, encoding));
} catch (InstantiationException e) {
System.err.println("WARNING: Could not instantiate encoding detector "
+ encodingDetectorClass.getName() + "; record content not set:");
e.printStackTrace();
} catch (IllegalAccessException e) {
System.err.println("WARNING: Could not access encoding detector "
+ encodingDetectorClass.getName() + "; record content not set:");
e.printStackTrace();
}
}
@Override
public boolean next(Text key, CrawlerRecord value)
throws IOException
{
ArcRecordBase arcRecord = null;
boolean atEnd = false;
long bufferMarkAtEnd = 0;
while ((arcRecord = arcReader.getNextRecord()) != null) {
// Check whether arcReader has definitely read past the end of this split.
// The reader is buffered, so fsin.getCount() may jump past "end" before the
// last record starting before "end" has actually been returned. We therefore
// stop only once the byte count grows beyond the point where it first
// crossed "end".
lastRecordEnd = fsin.getCount();
if (!atEnd && lastRecordEnd >= end) {
atEnd = true;
bufferMarkAtEnd = lastRecordEnd;
}
else if (atEnd && lastRecordEnd > bufferMarkAtEnd) {
break;
}
try {
// Make sure only text content is read. Records without a payload or without
// an HTTP header (e.g. the ARC version block) are skipped.
Payload payload = arcRecord.getPayload();
if (payload == null) {
continue;
}
PayloadWithHeaderAbstract payloadHeader = payload.getPayloadHeaderWrapped();
if (!(payloadHeader instanceof HttpHeader)) {
continue;
}
HeaderLine contentTypeHeader = payloadHeader.getHeader("Content-Type");
boolean skipContentType = true;
if (contentTypeHeader != null) {
for (String contentType : contentTypeWhitelist) {
if (contentTypeHeader.value.startsWith(contentType)) {
skipContentType = false;
break;
}
}
}
if (skipContentType) {
continue;
}
fillCrawlerRecord(arcRecord, value);
key.set(value.getURL());
return true;
}
catch (UnsupportedEncodingException e) {
// Skip unreadable records
System.err.println("WARNING: Skipping ARC record (byte offset "
+ fsin.getCount()
+ ") due to unsupported encoding. The record may contain binary data.");
}
catch (Exception e) {
// Skip any other records that produce exceptions
System.err.println(
"WARNING: Skipping ARC record due to exception that occurred while reading record:");
System.err.println(e.getMessage());
e.printStackTrace();
}
}
return false;
}
@Override
public Text createKey()
{
return new Text();
}
@Override
public CrawlerRecord createValue()
{
return new CrawlerRecord();
}
@Override
public long getPos()
throws IOException
{
return fsin.getCount();
}
@Override
public void close()
throws IOException
{
fsin.close();
}
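// Progress is measured in raw bytes consumed from the stream. Because the ARC
// reader buffers ahead, the ratio can exceed 1 near the end of the split, so
// it is clamped below.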
@Override
public float getProgress()
throws IOException
{
return Math.min(1.0f, ((float) (fsin.getCount() - start)) / ((float) (end - start)));
}
/*
* ======================== ARC Logic ============================
*/
/**
* Skips the input stream ahead, record by record, until its position is at or past
* <code>start</code>. Does not skip at all if <code>start</code> is exactly the start
* of a record.
*/
private void skipToNextRecord(long start)
throws IOException
{
// Skip record by record until (position in input stream) >= start
while (fsin.getCount() < start && arcReader.getNextRecord() != null) {
// Intentionally empty: getNextRecord() advances the stream
}
lastRecordEnd = fsin.getCount();
}
}
}