WholeTextFileInputFormat.java example

Explorer

KOSHIK-master
- src
  - main
    - java
      - org
        apache
        avro
        mapreduce
        TextStats.java
        mahout
        text
        wikipedia
        XmlInputFormat.java
      - se
        lth
        cs
        koshik
        analysis
        ContentProcessor.java
        is2
        IS2Lemmatizer.java
        IS2POSTagger.java
        IS2SyntacticDependencyParser.java
        lth
        LTHSemanticRoleLabeler.java
        malt
        MaltParserProcessor.java
        opennlp
        SentenceDetectorProcessor.java
        stagger
        StaggerProcessor.java
        stanford
        StanfordTokenizer.java
        wikipedia
        TextConverter.java
        example
        AvroStats.java
        DocumentToText.java
        ExtractWikiSectionsLinks.java
        ReadDocument.java
        input
        TextFileImportMapper.java
        conll
        CoNLL2006FileImportMapper.java
        CoNLL2006Reader.java
        CoNLL2009FileImportMapper.java
        CoNLL2009Reader.java
        CoNLLFeature.java
        CoNLLReader.java
        wikipedia
        WikipediaImportMapper.java
        language
        EnglishWikipediaPage.java
        SwedishWikipediaPage.java
        WikipediaPage.java
        WikipediaPageFactory.java
        io
        hadoop
        WholeTextFileInputFormat.java
        model
        Annotation.java
        Document.java
        avro
        AvroAnnotation.java
        AvroDocument.java
        text
        RootToken.java
        Sentence.java
        Span.java
        Token.java
        wikipedia
        InternalLink.java
        Section.java
        util
        EnglishPipeline.java
        Import.java
        SwedishPipeline.java
  - src
    - main
      - java
        org
        apache
        avro
        mapreduce
        TextStats.java
        mahout
        text
        wikipedia
        XmlInputFormat.java
        se
        lth
        cs
        koshik
        analysis
        ContentProcessor.java
        is2
        IS2Lemmatizer.java
        IS2POSTagger.java
        IS2SyntacticDependencyParser.java
        lth
        CharacterMapper.java
        LTHSemanticRoleLabeler.java
        LTHSimpleChineseLemmatizer.java
        LTHStanfordChineseSegmenterWrapper.java
        SimpleSentenceDetector.java
        malt
        MaltParserProcessor.java
        opennlp
        SentenceDetectorProcessor.java
        stagger
        StaggerProcessor.java
        stanford
        StanfordTokenizer.java
        wikipedia
        TextConverter.java
        example
        AvroStats.java
        DocumentToCoNLL2009.java
        ExtractWikiSectionsLinks.java
        ReadDocument.java
        input
        TextFileImportMapper.java
        conll
        CoNLL2006FileImportMapper.java
        CoNLL2006Reader.java
        CoNLL2009FileImportMapper.java
        CoNLL2009Reader.java
        CoNLLFeature.java
        CoNLLReader.java
        wikipedia
        WikipediaImportMapper.java
        language
        ChineseWikipediaPage.java
        EnglishWikipediaPage.java
        SwedishWikipediaPage.java
        WikipediaPage.java
        WikipediaPageFactory.java
        io
        hadoop
        WholeTextFileInputFormat.java
        model
        Annotation.java
        Document.java
        avro
        AvroAnnotation.java
        AvroDocument.java
        text
        RootToken.java
        Sentence.java
        Span.java
        Token.java
        wikipedia
        InternalLink.java
        Section.java
        util
        ChinesePipeline.java
        EnglishPipeline.java
        Import.java
        SwedishPipeline.java

/**
 * KOSHIK is an NLP framework for large scale processing using Hadoop. 
 * Copyright © 2014 Peter Exner
 * 
 * This file is part of KOSHIK.
 *
 * KOSHIK is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * KOSHIK is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with KOSHIK.  If not, see <http://www.gnu.org/licenses/>.
 */

package se.lth.cs.koshik.io.hadoop;

import java.io.IOException;
import java.nio.charset.Charset;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.*;


/**
 * Input format that hands each input file to the mapper as one single record.
 *
 * <p>Files are never split. The record key is the last component of the file
 * path ({@link Path#getName()}) and the record value is the complete file
 * content, decoded with the charset returned by {@link #getCharset()} at the
 * time the record reader is created (UTF-8 unless changed).
 *
 * <p>NOTE(review): the charset is kept in a static field of this class. A value
 * set through {@link #setCharset(Charset)} in the job-submitting JVM is
 * presumably not visible inside task JVMs; confirm before relying on it in a
 * distributed run.
 */
public class WholeTextFileInputFormat extends FileInputFormat<Text, Text> {
	/** Charset used to decode file contents; shared by all readers created afterwards. */
	private static Charset charset = Charset.forName("UTF-8");

	/**
	 * Declares every file as unsplittable so that one file maps to exactly one split.
	 *
	 * @param context the job context (unused)
	 * @param file the file being considered (unused)
	 * @return always {@code false}
	 */
	@Override
	protected boolean isSplitable(JobContext context, Path file) {
		return false;
	}

	/**
	 * Creates a reader that returns the whole file behind {@code genericSplit} as one record.
	 *
	 * <p>The reader is already initialized when it is returned. Its
	 * {@code initialize} method only stores the split and the configuration, so
	 * a further call by the caller is harmless.
	 *
	 * @param genericSplit the split to read; must be a {@link FileSplit}
	 * @param context the task attempt context supplying the configuration
	 * @return an initialized record reader for the split
	 * @throws IOException declared by the overridden method
	 * @throws InterruptedException declared by the overridden method
	 */
	@Override
	public RecordReader<Text, Text> createRecordReader(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
		WholeTextFileRecordReader reader = new WholeTextFileRecordReader(charset);
		reader.initialize(genericSplit, context);
		return reader;
	}

	/**
	 * Returns the charset used to decode files for readers created from now on.
	 *
	 * @return the current charset
	 */
	public static Charset getCharset() {
		return charset;
	}

	/**
	 * Sets the charset used to decode files for readers created after this call.
	 *
	 * @param charset the charset to use; must not be {@code null}
	 * @throws IllegalArgumentException if {@code charset} is {@code null}
	 */
	public static void setCharset(Charset charset) {
		// Fail here rather than much later inside String decoding.
		if (charset == null) {
			throw new IllegalArgumentException("charset must not be null");
		}
		WholeTextFileInputFormat.charset = charset;
	}
	
	/**
	 * Record reader that produces exactly one record per file split: the file
	 * name as key and the decoded file content as value.
	 */
	public static class WholeTextFileRecordReader extends RecordReader<Text, Text> {
		private final Charset charset;
		private FileSplit fileSplit;
		private Configuration conf;
		private Text key = new Text();
		private Text value = new Text();
		/** Set to {@code true} once the single record has been read successfully. */
		private boolean processed = false;

		/**
		 * Creates a reader that decodes file bytes with the given charset.
		 *
		 * @param charset the charset used to decode the file; must not be {@code null}
		 * @throws IllegalArgumentException if {@code charset} is {@code null}
		 */
		public WholeTextFileRecordReader(Charset charset) {
			if (charset == null) {
				throw new IllegalArgumentException("charset must not be null");
			}
			this.charset = charset;
		}

		/**
		 * Stores the split and the job configuration for the later read.
		 *
		 * @param split the split to read; must be a {@link FileSplit}
		 * @param context the task attempt context supplying the configuration
		 */
		@Override
		public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
			this.fileSplit = (FileSplit) split;
			this.conf = context.getConfiguration();
		}

		/**
		 * Reads the whole file into the current key/value pair on the first call.
		 *
		 * @return {@code true} on the first successful call, {@code false} afterwards
		 * @throws IOException if the file is larger than {@link Integer#MAX_VALUE}
		 *         bytes or cannot be read completely
		 */
		@Override
		public boolean nextKeyValue() throws IOException, InterruptedException {
			if (processed) {
				return false;
			}

			Path file = fileSplit.getPath();
			long length = fileSplit.getLength();
			// A plain (int) cast would wrap around for large files and either
			// fail with NegativeArraySizeException or silently truncate the content.
			if (length > Integer.MAX_VALUE) {
				throw new IOException("File " + file + " is too large to be read as a single record: "
						+ length + " bytes");
			}

			key.set(file.getName());
			value.set(new String(readWholeFile(file, (int) length), charset));

			// Only mark as done after a successful read, as before.
			processed = true;
			return true;
		}

		/** Reads exactly {@code length} bytes from the start of {@code file} and closes the stream. */
		private byte[] readWholeFile(Path file, int length) throws IOException {
			byte[] contents = new byte[length];
			FileSystem fs = file.getFileSystem(conf);
			FSDataInputStream in = null;
			try {
				in = fs.open(file);
				IOUtils.readFully(in, contents, 0, contents.length);
			} finally {
				IOUtils.closeStream(in);
			}
			return contents;
		}

		/**
		 * @return the key of the current record (the file name once the record has been read)
		 */
		@Override
		public Text getCurrentKey() throws IOException, InterruptedException {
			return key;
		}

		/**
		 * @return the value of the current record (the decoded file content once the record has been read)
		 */
		@Override
		public Text getCurrentValue() throws IOException, InterruptedException {
			return value;
		}

		/**
		 * @return {@code 1.0f} once the single record has been read, {@code 0.0f} before
		 */
		@Override
		public float getProgress() throws IOException {
			return processed ? 1.0f : 0.0f;
		}

		/**
		 * Does nothing: the input stream is opened and closed inside {@link #nextKeyValue()}.
		 */
		@Override
		public void close() throws IOException {
		}
	}
}