/******************************************************************************* * Copyright 2013 * TU Darmstadt, FG Sprachtechnologie * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package org.dkpro.bigdata.io.hadoop; import java.io.IOException; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.LineRecordReader; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; /** * Generic version of LineRecordReader from org.apache.hadoop.mapred package. * <p> * Use this class if you need to read plain-text lines and emit non-text * values based on the lines. This reader reads all lines in one input split in one * aggregated Text value. * <code>convertValue()</code> needs to be implemented by your subclass. * </p> * <p> * Useful for e.g. generating UIMA CASes from plain text. In this case, the key would be * the offset of the input split, and the value a CAS initialized with all lines in the input split. * </p> * * @author Johannes Simon */ public abstract class GenericMultiLineRecordReader<V> implements RecordReader<Text, V> { private final LineRecordReader lineReader; private final FileSplit split; private int maxNumLinesPerSplit; public GenericMultiLineRecordReader(FileSplit split, JobConf jobConf, Reporter reporter) throws IOException { lineReader = new LineRecordReader(jobConf, split); this.split = split; maxNumLinesPerSplit = jobConf.getInt("dkpro.input.maxlinesperrecord", 1); } @Override public boolean next(Text key, V value) throws IOException { LongWritable lineKey = lineReader.createKey(); Text docKey = new Text(split.toString()); StringBuilder doc = new StringBuilder(); Text line = lineReader.createValue(); int lineCount = 0; while (lineReader.next(lineKey, line)) { // Document key is key of first line (in this split) doc.append(line); doc.append("\n"); lineCount++; if (lineCount == maxNumLinesPerSplit) { break; } } // success == true iff. we read at least one line boolean success = doc.length() > 0; if (success) { Text docValue = new Text(doc.toString()); convertValue(docKey, docValue, value); } return success; } // The following methods only delegate functionality to lineReader protected abstract void convertValue(Text longKey, Text textValue, V value); @Override public void close() throws IOException { lineReader.close(); } @Override public long getPos() throws IOException { return lineReader.getPos(); } @Override public float getProgress() throws IOException { return lineReader.getProgress(); } }