package com.caseystella.util.common.hadoop.input.fixed;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * A RecordReader that treats its input as a sequence of fixed-width binary records:
 * keys are the byte offset of each record within the file, values are the raw record bytes.
 *
 * Created by cstella on 9/3/14.
 */
public class FixedWidthRecordReader extends RecordReader<LongWritable, BytesWritable> {
    private static final Log LOG = LogFactory.getLog(FixedWidthRecordReader.class);

    private long start;
    private long pos;
    private long end;
    private FixedWidthReader in;
    private FSDataInputStream fileIn;
    private Seekable filePosition;
    private LongWritable key;
    private BytesWritable value;
    private boolean isCompressedInput;
    private Decompressor decompressor;
    private int width;

    public FixedWidthRecordReader(int width) {
        this.width = width;
    }

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
        FileSplit split = (FileSplit) genericSplit;
        Configuration job = context.getConfiguration();
        start = split.getStart();
        end = start + split.getLength();
        final Path file = split.getPath();

        // open the file and seek to the start of the split
        final FileSystem fs = file.getFileSystem(job);
        fileIn = fs.open(file);

        CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
        if (null != codec) {
            isCompressedInput = true;
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                final SplitCompressionInputStream cIn =
                        ((SplittableCompressionCodec) codec).createInputStream(
                                fileIn, decompressor, start, end,
                                SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = new FixedWidthReader(cIn);
                start = cIn.getAdjustedStart();
                end = cIn.getAdjustedEnd();
                filePosition = cIn;
            } else {
                in = new FixedWidthReader(codec.createInputStream(fileIn, decompressor));
                filePosition = fileIn;
            }
        } else {
            fileIn.seek(start);
            in = new FixedWidthReader(fileIn);
            filePosition = fileIn;
        }
        // If this is not the first split, we always throw away the first record
        // because we always (except for the last split) read one extra record past
        // the split boundary in nextKeyValue().
        if (start != 0) {
            start += in.readLine(new BytesWritable(new byte[width]));
        }
        this.pos = start;
    }

    private long getFilePosition() throws IOException {
        long retVal;
        if (isCompressedInput && null != filePosition) {
            retVal = filePosition.getPos();
        } else {
            retVal = pos;
        }
        return retVal;
    }

    @Override
    public boolean nextKeyValue() throws IOException {
        if (key == null) {
            key = new LongWritable();
        }
        key.set(pos);
        if (value == null) {
            value = new BytesWritable(new byte[width]);
        }
        int newSize = 0;
        // Only read while the current position is within the split: the record that
        // straddles the boundary is read here, and the following split discards its
        // partial first record in initialize().
        if (getFilePosition() <= end) {
            newSize = in.readLine(value);
            pos += newSize;
        }
        if (newSize < width) {
            // A short (or empty) read means we ran off the end of the split or the file.
            key = null;
            value = null;
            return false;
        } else {
            return true;
        }
    }

    @Override
    public LongWritable getCurrentKey() {
        return key;
    }

    @Override
    public BytesWritable getCurrentValue() {
        return value;
    }

    /**
     * Get the progress within the split
     */
    @Override
    public float getProgress() throws IOException {
        if (start == end) {
            return 0.0f;
        } else {
            return Math.min(1.0f, (getFilePosition() - start) / (float) (end - start));
        }
    }

    @Override
    public synchronized void close() throws IOException {
        try {
            if (in != null) {
                in.close();
            }
        } finally {
            if (decompressor != null) {
                CodecPool.returnDecompressor(decompressor);
            }
        }
    }
}
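
/*
 * Usage sketch (illustrative only, not part of the original class): one way to wire this
 * record reader into a job is a FileInputFormat subclass that pulls the record width out
 * of the job configuration. The class name, the "fixedwidth.record.width" property, and
 * its default value are assumptions made for this example, not existing API.
 */
class ExampleFixedWidthInputFormat
        extends org.apache.hadoop.mapreduce.lib.input.FileInputFormat<LongWritable, BytesWritable> {

    // Hypothetical configuration key naming the fixed record width, in bytes.
    static final String WIDTH_KEY = "fixedwidth.record.width";

    @Override
    public RecordReader<LongWritable, BytesWritable> createRecordReader(InputSplit split,
                                                                        TaskAttemptContext context) {
        int width = context.getConfiguration().getInt(WIDTH_KEY, 80);
        return new FixedWidthRecordReader(width);
    }

    @Override
    protected boolean isSplitable(org.apache.hadoop.mapreduce.JobContext context, Path file) {
        // Mirror TextInputFormat: only split when the file is uncompressed or its codec is splittable.
        CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
        return codec == null || codec instanceof SplittableCompressionCodec;
    }
}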