GenericMultiLineRecordReader.java example

Explorer

dkpro-bigdata-master
- dkpro-bigdata-collocations
  - src
    - main
      - java
        org
        dkpro
        bigdata
        collocations
        AssocReducer.java
        AssociationMetrics.java
        CollocCombiner.java
        CollocDriver.java
        CollocMapper.java
        CollocReducer.java
        Gram.java
        GramKey.java
        GramKeyGroupComparator.java
        GramKeyPartitioner.java
- dkpro-bigdata-examples
  - src
    - main
      - java
        org
        dkpro
        bigdata
        examples
        CasConsumerExample.java
        ExternalDataExample.java
        Text2CASExample.java
        UimaPipelineOnHadoop.java
- dkpro-bigdata-hadoop
  - src
    - main
      - java
        org
        dkpro
        bigdata
        hadoop
        AnalysisEngineUtil.java
        DkproHadoopDriver.java
        DkproMapper.java
        DkproReducer.java
        EngineFactory.java
        FeatureCountHadoopDriver.java
        UIMAMapReduceBase.java
        XMLDescriptorRunner.java
    - test
      - java
        org
        dkpro
        bigdata
        hadoop
        CasConsumerOutputTest.java
        DkproHadoopDriverTest.java
- dkpro-bigdata-io-hadoop
  - src
    - main
      - java
        org
        dkpro
        bigdata
        io
        hadoop
        ARCInputFormat.java
        BinCasWithTypeSystemWritable.java
        BinCasWritable.java
        CASWritable.java
        CASWritableSequenceFileWriter.java
        CollectionReaderWrapper.java
        CrawlerRecord.java
        DummyEncodingDetector.java
        EncodingDetector.java
        FormatConverterMapper.java
        GenericKeyValueLineRecordReader.java
        GenericMultiLineRecordReader.java
        HdfsResourceLoaderLocator.java
        LeipzigInputFormat.java
        MultiLineText2CASInputFormat.java
        Text2CASInputFormat.java
        WARCInputFormat.java
        XCASSequenceFileWriter.java
        XmiSequenceFileWriter.java
    - test
      - java
        org
        dkpro
        bigdata
        io
        hadoop
        ARCInputFormatTest.java
        BinCasWithTypeSystemWritableTest.java
        BinCasWritableTest.java
        CASWritableTest.java
        HdfsResourceLoaderLocatorTest.java
        InputFormatTest.java
        LeipzigInputFormatTest.java
        WARCInputFormatTest.java

/*******************************************************************************
 * Copyright 2013
 * TU Darmstadt, FG Sprachtechnologie
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.dkpro.bigdata.io.hadoop;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.LineRecordReader;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

/**
 * Generic multi-line variant of LineRecordReader from the org.apache.hadoop.mapred package.
 * <p>
 * Use this class if you need to read plain-text lines and emit non-text
 * values based on the lines. Each call to {@link #next(Text, Object)} reads up to
 * <code>dkpro.input.maxlinesperrecord</code> lines (default: 1) from the input split and
 * aggregates them, each terminated by a newline, into one Text value. If that property
 * is set to zero or a negative number, the limit is never reached and all remaining
 * lines of the split end up in a single record.
 * <code>convertValue()</code> needs to be implemented by your subclass, as do
 * <code>createKey()</code> and <code>createValue()</code> from {@link RecordReader},
 * which this class does not implement.
 * </p>
 * <p>
 * Useful for e.g. generating UIMA CASes from plain text. In this case, the key would be
 * the string representation of the input split, and the value a CAS initialized with the
 * aggregated lines of one record.
 * </p>
 * 
 * @author Johannes Simon
 */
public abstract class GenericMultiLineRecordReader<V> implements RecordReader<Text, V> {

	/** Job configuration property holding the maximum number of lines aggregated per record. */
	private static final String MAX_LINES_PROPERTY = "dkpro.input.maxlinesperrecord";

	/** Number of lines per record used when {@link #MAX_LINES_PROPERTY} is not configured. */
	private static final int DEFAULT_MAX_LINES = 1;

	private final LineRecordReader lineReader;
	private final FileSplit split;
	private final int maxNumLinesPerSplit;
	
	/**
	 * Creates a reader over the given split.
	 * 
	 * @param split the file split to read lines from
	 * @param jobConf job configuration; also supplies <code>dkpro.input.maxlinesperrecord</code>
	 * @param reporter not used by this class
	 * @throws IOException if the underlying LineRecordReader cannot be created
	 */
	public GenericMultiLineRecordReader(FileSplit split, JobConf jobConf, Reporter reporter) throws IOException {
		lineReader = new LineRecordReader(jobConf, split);
		this.split = split;
		maxNumLinesPerSplit = jobConf.getInt(MAX_LINES_PROPERTY, DEFAULT_MAX_LINES);
	}
	
	/**
	 * Reads the next record: up to the configured number of lines, joined with newlines.
	 * 
	 * @param key receives the record key, which is the string representation of the split
	 * @param value receives the converted record, filled in by {@link #convertValue}
	 * @return true iff at least one line was read; false once the split is exhausted
	 * @throws IOException if reading from the underlying line reader fails
	 */
	@Override
	public boolean next(Text key, V value) throws IOException {
		LongWritable lineKey = lineReader.createKey();
		Text line = lineReader.createValue();
		StringBuilder doc = new StringBuilder();
		int lineCount = 0;
		while (lineReader.next(lineKey, line)) {
			doc.append(line.toString());
			doc.append("\n");
			lineCount++;
			// A non-positive limit never matches, so the rest of the split is consumed
			if (lineCount == maxNumLinesPerSplit) {
				break;
			}
		}
		
		// success == true iff. we read at least one line
		boolean success = lineCount > 0;
		if (success) {
			// The document key is derived from the split, not from the individual lines
			Text docKey = new Text(split.toString());
			Text docValue = new Text(doc.toString());
			// Populate the caller-supplied key as well; previously it was left untouched
			key.set(docKey);
			convertValue(docKey, docValue, value);
		}
		
		return success;
	}

	/**
	 * Fills <code>value</code> from the aggregated text of one record.
	 * 
	 * @param longKey key of the record (the string representation of the split)
	 * @param textValue the lines of the record, each followed by a newline
	 * @param value the output object to populate
	 */
	protected abstract void convertValue(Text longKey, Text textValue, V value);

	// The following methods only delegate functionality to lineReader

	@Override public void close() throws IOException { lineReader.close(); }
	@Override public long getPos() throws IOException { return lineReader.getPos(); }
	@Override public float getProgress() throws IOException { return lineReader.getProgress(); }
	
}