WARCInputFormat.java example

Explorer

dkpro-bigdata-master
- dkpro-bigdata-collocations
  - src
    - main
      - java
        org
        dkpro
        bigdata
        collocations
        AssocReducer.java
        AssociationMetrics.java
        CollocCombiner.java
        CollocDriver.java
        CollocMapper.java
        CollocReducer.java
        Gram.java
        GramKey.java
        GramKeyGroupComparator.java
        GramKeyPartitioner.java
- dkpro-bigdata-examples
  - src
    - main
      - java
        org
        dkpro
        bigdata
        examples
        CasConsumerExample.java
        ExternalDataExample.java
        Text2CASExample.java
        UimaPipelineOnHadoop.java
- dkpro-bigdata-hadoop
  - src
    - main
      - java
        org
        dkpro
        bigdata
        hadoop
        AnalysisEngineUtil.java
        DkproHadoopDriver.java
        DkproMapper.java
        DkproReducer.java
        EngineFactory.java
        FeatureCountHadoopDriver.java
        UIMAMapReduceBase.java
        XMLDescriptorRunner.java
    - test
      - java
        org
        dkpro
        bigdata
        hadoop
        CasConsumerOutputTest.java
        DkproHadoopDriverTest.java
- dkpro-bigdata-io-hadoop
  - src
    - main
      - java
        org
        dkpro
        bigdata
        io
        hadoop
        ARCInputFormat.java
        BinCasWithTypeSystemWritable.java
        BinCasWritable.java
        CASWritable.java
        CASWritableSequenceFileWriter.java
        CollectionReaderWrapper.java
        CrawlerRecord.java
        DummyEncodingDetector.java
        EncodingDetector.java
        FormatConverterMapper.java
        GenericKeyValueLineRecordReader.java
        GenericMultiLineRecordReader.java
        HdfsResourceLoaderLocator.java
        LeipzigInputFormat.java
        MultiLineText2CASInputFormat.java
        Text2CASInputFormat.java
        WARCInputFormat.java
        XCASSequenceFileWriter.java
        XmiSequenceFileWriter.java
    - test
      - java
        org
        dkpro
        bigdata
        io
        hadoop
        ARCInputFormatTest.java
        BinCasWithTypeSystemWritableTest.java
        BinCasWritableTest.java
        CASWritableTest.java
        HdfsResourceLoaderLocatorTest.java
        InputFormatTest.java
        LeipzigInputFormatTest.java
        WARCInputFormatTest.java

/*******************************************************************************
 * Copyright 2013
 * TU Darmstadt, FG Sprachtechnologie
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

package org.dkpro.bigdata.io.hadoop;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CountingInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.jwat.warc.WarcConstants;
import org.jwat.warc.WarcHeader;
import org.jwat.warc.WarcReader;
import org.jwat.warc.WarcReaderFactory;
import org.jwat.warc.WarcRecord;

//import de.uni_leipzig.asv.encodingdetector.utils.EncodingDetector;

/**
 * Creates WARCRecordReader for Crawler archives in WARC format
 * 
 * @author Johannes Simon
 * 
 */
public class WARCInputFormat
    extends FileInputFormat<Text, CrawlerRecord>
{
    @Override
    public RecordReader<Text, CrawlerRecord> getRecordReader(InputSplit inputSplit,
            JobConf jobConf, Reporter reporter)
        throws IOException
    {
        return new WARCRecordReader((FileSplit) inputSplit, jobConf);
    }

    /**
     * Reads Crawler archives in WARC format
     * 
     * @author Johannes Simon
     * 
     */
    public static class WARCRecordReader
        implements RecordReader<Text, CrawlerRecord>
    {
        private final long start;
        private final long end;
        private final CountingInputStream fsin;
        WarcReader warcReader = null;
        long lastRecordEnd = -1;
        private JobConf conf = null;

        private final Set<String> contentTypeWhitelist = new HashSet<String>();

        /*
         * ======================== RecordReader Logic ============================
         */

        /**
         * Parses and sets configuration parameters according to a JobConf/Configuration instance
         */
        void configure(Configuration conf)
        {
            String contentTypeWhitelistStr = conf.get(
                    "dkpro.input.content-type-whitelist", "text/html");
            if (contentTypeWhitelistStr != null) {
                String[] contentTypes = contentTypeWhitelistStr.replace(" ", "").split(",");
                for (String ct : contentTypes)
                    contentTypeWhitelist.add(ct);
            }
        }

        /**
         * Creates an ARCRecordReader for the specified <code>split</code> of the input stream that
         * will start at the first valid ARC record header after <code>split.getStart()</code> and
         * continue until a record is read that goes past
         * <code>split.getStart() + split.getLength()</code>.
         */
        public WARCRecordReader(FileSplit split, JobConf jobConf)
            throws IOException
        {
            conf = jobConf;
            start = split.getStart();
            end = start + split.getLength();
            System.out.println("========== " + start + " " + end);

            configure(jobConf);

            // Open the file and seek to the start of the split
            Path file = split.getPath();
            FileSystem fs = file.getFileSystem(jobConf);
            fsin = new CountingInputStream(new BufferedInputStream(fs.open(split.getPath())));

            warcReader = WarcReaderFactory.getReader(fsin);
            // Start with the first valid record after offset "start"
            skipToNextRecord(start);
        }

        private void fillCrawlerRecord(WarcRecord record, CrawlerRecord crawlerRecord)
            throws IOException
        {
            // Fill CrawlerRecord
            WarcHeader header = record.header;
            // h.warcSegmentOriginIdUrl
            crawlerRecord.setURL(header.warcTargetUriStr);
            // Usually ARC records contain the original webpage, including markup
            crawlerRecord.setIsHTML(getContentType(record).contains("text/html"));
            crawlerRecord.setDate(header.warcDate);
            // Usually not present in ARC records
            crawlerRecord.setOriginalLanguage(null);

            // It is important to *not* try to decode anything before we know the encoding.
            // As soon as we put any of this record content into a string, it is explicitly
            // converted
            // from/to some encoding. So before that, we have to guess the correct encoding,
            // otherwise
            // we'll potentially trash our data before we process it.
            byte[] buffer = IOUtils.toByteArray(record.getPayloadContent());
            Class<?> encodingDetectorClass = conf.getClass("dkpro.input.encodingdetector", DummyEncodingDetector.class);
			try {
				EncodingDetector encodingDetector = (EncodingDetector)encodingDetectorClass.newInstance();
	            // This is the encoding we'll use to decode the bytes into text
	            String encoding = encodingDetector.getBestEncoding(buffer);
	            crawlerRecord.setOriginalEncoding(encoding);
	            crawlerRecord.setContent(new String(buffer, encoding));
			} catch (InstantiationException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			} catch (IllegalAccessException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			}
        }

        @Override
        public boolean next(Text key, CrawlerRecord value)
            throws IOException
        {
            WarcRecord arcRecord = null;
            boolean atEnd = false;
            long bufferMarkAtEnd = 0;

            while ((arcRecord = warcReader.getNextRecord()) != null) {
                // Check if arcReader has definitely read over end mark when considering that reader
                // is buffered
                // (meaning that (fsin.getCount() > end) is true before we've read the last record
                // before end)
                lastRecordEnd = fsin.getCount();
                if (!atEnd && lastRecordEnd >= end) {
                    atEnd = true;
                    bufferMarkAtEnd = lastRecordEnd;
                }
                else if (atEnd && lastRecordEnd > bufferMarkAtEnd) {
                    break;
                }
                try {
                    // Skip meta header (usually first record in archive)
                    if (WarcConstants.RT_WARCINFO.equals(arcRecord.header.warcTypeStr) || !arcRecord.hasPayload())
                        continue;

                    // Make sure only text content is read
                    if (conf.getBoolean("dkpro.input.filter-mimetypes",
                            false)) {

                    	// requests etc. are already filtered by mimetype but maybe in the future this needs to be revised
                    	// if (!(WarcConstants.RT_RESOURCE.equals(arcRecord.header.warcTypeStr) || WarcConstants.RT_RESPONSE.equals(arcRecord.header.warcTypeStr)))
                    	// continue;
                    	String contentTypeHeader = getContentType(arcRecord);

                        boolean skipContentType = true;
                        if (contentTypeHeader != null) {
                            for (String contentType : contentTypeWhitelist)
                                if (contentTypeHeader.contains(contentType)) {
                                    skipContentType = false;
                                    break;
                                }
                        }

                        if (skipContentType) {
                            continue;
                        }
                    }

                    fillCrawlerRecord(arcRecord, value);
                    key.set(value.getURL());

                    // System.out.println("READ");
                    return true;
                }
                catch (UnsupportedEncodingException e) {
                    // Skip unreadable records
                    System.err.println("WARNING: Skipping WARC record (byte offset "
                            + fsin.getCount()
                            + ") due to unsupported encoding. The record may contain binary data.");
                }
                catch (Exception e) {
                    // Skip any other records that produce exceptions
                    System.err
                            .println("WARNING: Skipping WARC record due to exception that occured while reading record:");
                    System.err.println(e.getMessage());
                    e.printStackTrace();
                }
            }

            return false;
        }
        
		private String getContentType(WarcRecord arcRecord) {
			String contentTypeHeader = arcRecord.header.contentTypeStr;
			if (arcRecord.getPayload().getPayloadHeaderWrapped() != null && arcRecord.getPayload().getPayloadHeaderWrapped().getHeader(WarcConstants.FN_CONTENT_TYPE) != null)
				contentTypeHeader = arcRecord.getPayload().getPayloadHeaderWrapped().getHeader(WarcConstants.FN_CONTENT_TYPE).value;
			return contentTypeHeader;
		}


        @Override
        public Text createKey()
        {
            return new Text();
        }

        @Override
        public CrawlerRecord createValue()
        {
            return new CrawlerRecord();
        }

        @Override
        public long getPos()
            throws IOException
        {
            return fsin.getCount();
        }

        @Override
        public void close()
            throws IOException
        {
            fsin.close();
        }

        @Override
        public float getProgress()
            throws IOException
        {
            return ((float) (fsin.getCount() - start)) / ((float) (end - start));
        }

        /*
         * ======================== ARC Logic ============================
         */

        /**
         * Skips InputStream to next record past <code>start</code> or not at all if start is
         * exactly the start of a record
         */
        private void skipToNextRecord(long start)
            throws IOException
        {
            // Skip record by record until (position in input stream) >= start
            while (fsin.getCount() < start && warcReader.getNextRecord() != null) {
            }
            ;

            lastRecordEnd = fsin.getCount();
        }
    }
}