LineDocRecordReader.java example

Explorer
yarn-comment-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.contrib.index.example;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
import org.apache.hadoop.contrib.index.mapred.DocumentID;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.RecordReader;

/**
 * A simple RecordReader for LineDoc for plain text files where each line is a
 * doc. Each line is as follows:
 * {@code documentID<SPACE>op<SPACE>content<EOL>},
 * where op can be "i", "ins" or "insert" for insert, "d", "del" or "delete"
 * for delete, or "u", "upd" or "update" for update. Any other op (including
 * a missing one) is treated as an insert.
 *
 * <p>A line ends at "\n", "\r\n", a lone "\r", or the end of the file. Fields
 * are never read across a line boundary: a line that ends before the op or
 * the content yields an empty op or empty content, blank lines are skipped,
 * and whatever follows the op on a delete line is discarded.
 */
public class LineDocRecordReader implements
    RecordReader<DocumentID, LineDocTextAndOp> {
  private static final char SPACE = ' ';
  private static final char EOL = '\n';
  /** Used to reset a Text to zero length. */
  private static final byte[] NO_BYTES = new byte[0];

  /** Offset of the first byte that belongs to this split's first record. */
  private final long start;
  /** Offset of the next unread byte. */
  private long pos;
  /** Offset one past the split; a line is read only if it begins before it. */
  private final long end;
  private final BufferedInputStream in;
  private final ByteArrayOutputStream buffer = new ByteArrayOutputStream(256);
  /** Scratch holder for the op field, reused for every record. */
  private final Text opText = new Text();
  /**
   * Set by every {@link #readData} call: true when the read stopped at an end
   * of line or at the end of the stream, false when it stopped at a
   * non-EOL field delimiter (so more of the same line is still unread).
   */
  private boolean lineEnded;

  /**
   * Provide a bridge to get the bytes from the ByteArrayOutputStream without
   * creating a new byte array.
   */
  private static class TextStuffer extends OutputStream {
    public Text target;

    @Override
    public void write(int b) {
      throw new UnsupportedOperationException("write(byte) not supported");
    }

    @Override
    public void write(byte[] data, int offset, int len) throws IOException {
      target.set(data, offset, len);
    }
  }

  private final TextStuffer bridge = new TextStuffer();

  /**
   * Opens the file of the given split and positions the reader on the first
   * line that begins at or after the split start.
   *
   * @param job configuration used to resolve the file system of the split
   * @param split the part of the file this reader is responsible for
   * @throws IOException if the file cannot be opened, positioned or read; the
   *         underlying stream is closed before the exception propagates
   */
  public LineDocRecordReader(Configuration job, FileSplit split)
      throws IOException {
    long splitStart = split.getStart();
    final long splitEnd = splitStart + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
    BufferedInputStream stream = null;
    boolean opened = false;
    try {
      final boolean skipFirstLine = splitStart != 0;
      if (skipFirstLine) {
        // Step back one byte before discarding up to the first line end: if
        // that byte is itself a line terminator, only it is discarded and the
        // line beginning exactly at the split start is kept.
        --splitStart;
        fileIn.seek(splitStart);
      }

      stream = new BufferedInputStream(fileIn);
      if (skipFirstLine) { // skip first line and re-establish the start
        splitStart += readData(stream, null, EOL);
      }
      opened = true;
    } finally {
      if (!opened) {
        // Do not leak the file handle when seeking or skipping fails.
        try {
          fileIn.close();
        } catch (IOException ignored) {
          // the original failure is the one worth reporting
        }
      }
    }

    this.in = stream;
    this.start = splitStart;
    this.pos = splitStart;
    this.end = splitEnd;
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.mapred.RecordReader#close()
   */
  public void close() throws IOException {
    in.close();
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.mapred.RecordReader#createKey()
   */
  public DocumentID createKey() {
    return new DocumentID();
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.mapred.RecordReader#createValue()
   */
  public LineDocTextAndOp createValue() {
    return new LineDocTextAndOp();
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.mapred.RecordReader#getPos()
   */
  // synchronized so that pos is read under the same lock next() writes it with
  public synchronized long getPos() throws IOException {
    return pos;
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.mapred.RecordReader#getProgress()
   */
  // synchronized so that pos is read under the same lock next() writes it with
  public synchronized float getProgress() throws IOException {
    if (start == end) {
      return 0.0f;
    } else {
      return Math.min(1.0f, (pos - start) / (float) (end - start));
    }
  }

  /**
   * Reads the next record of the split.
   *
   * <p>The key receives the document id (the bytes up to the first space),
   * the value receives the op and, for insert and update, the rest of the
   * line. For delete the rest of the line is discarded and the value text is
   * emptied. Blank lines are skipped.
   *
   * @return false when the next line begins at or beyond the end of the split
   *         or the stream is exhausted; true when a record was read
   * @see org.apache.hadoop.mapred.RecordReader#next(java.lang.Object, java.lang.Object)
   */
  public synchronized boolean next(DocumentID key, LineDocTextAndOp value)
      throws IOException {
    // key is document id, which are bytes until first space
    final Text keyText = key.getText();
    do {
      if (pos >= end) {
        return false;
      }
      if (!readInto(keyText, SPACE)) {
        return false;
      }
      // an empty key that ended its line is a blank line: move on
    } while (lineEnded && keyText.getLength() == 0);

    // read operation: i/d/u, or ins/del/upd, or insert/delete/update
    readRestOfLineField(opText, SPACE);
    final DocumentAndOp.Op op = parseOp(opText.toString());
    value.setOp(op);

    final Text content = value.getText();
    if (op == DocumentAndOp.Op.DELETE) {
      if (!lineEnded) {
        // Discard whatever follows the op so that the next record starts at
        // the beginning of the next line.
        pos += readData(in, null, EOL);
      }
      // in case the caller reuses the value object, do not expose old content
      content.set(NO_BYTES, 0, 0);
    } else {
      // read rest of the line
      readRestOfLineField(content, EOL);
    }
    return true;
  }

  /**
   * Maps the textual op to its enum value; anything that is not a delete or
   * update spelling is an insert.
   */
  private static DocumentAndOp.Op parseOp(String opStr) {
    if (opStr.equals("d") || opStr.equals("del") || opStr.equals("delete")) {
      return DocumentAndOp.Op.DELETE;
    }
    if (opStr.equals("u") || opStr.equals("upd") || opStr.equals("update")) {
      return DocumentAndOp.Op.UPDATE;
    }
    // "i", "ins", "insert" and the default
    return DocumentAndOp.Op.INSERT;
  }

  /**
   * Reads a field that may be absent: if the current line has already ended,
   * or nothing is left in the stream, the text is set to empty instead of
   * reading into the following line.
   */
  private void readRestOfLineField(Text text, char delimiter)
      throws IOException {
    if (lineEnded || !readInto(text, delimiter)) {
      text.set(NO_BYTES, 0, 0);
    }
  }

  /**
   * Reads bytes up to the delimiter or line end into the given text and
   * advances {@code pos} by the number of bytes consumed.
   *
   * @return false, leaving the text untouched, if the stream was already
   *         exhausted; true otherwise
   */
  private boolean readInto(Text text, char delimiter) throws IOException {
    buffer.reset();
    long bytesRead = readData(in, buffer, delimiter);
    if (bytesRead == 0) {
      return false;
    }
    pos += bytesRead;
    // hand the buffer's internal array straight to Text.set, avoiding a copy
    bridge.target = text;
    buffer.writeTo(bridge);
    return true;
  }

  /**
   * Consumes bytes from the stream until the delimiter, a line end ("\n",
   * "\r\n" or a lone "\r") or the end of the stream. The terminator is
   * consumed but not copied. Updates {@link #lineEnded}.
   *
   * @param stream source of bytes; must support mark/reset
   * @param out receives the bytes before the terminator; may be null to
   *        discard them
   * @param delimiter field delimiter that also stops the read
   * @return the number of bytes consumed, terminator included; 0 only when
   *         the stream was already exhausted
   */
  private long readData(InputStream stream, OutputStream out, char delimiter)
      throws IOException {
    long bytes = 0;
    // every way out of the loop ends the line, except hitting a delimiter
    // that is not the EOL character itself
    lineEnded = true;
    while (true) {

      int b = stream.read();
      if (b == -1) {
        break;
      }
      bytes += 1;

      byte c = (byte) b;
      if (c == EOL) {
        break;
      }
      if (c == delimiter) {
        lineEnded = false;
        break;
      }

      if (c == '\r') {
        // swallow the '\n' of a "\r\n" pair; anything else is pushed back
        stream.mark(1);
        if (stream.read() == EOL) {
          bytes += 1;
        } else {
          stream.reset();
        }
        break;
      }

      if (out != null) {
        out.write(c);
      }
    }
    return bytes;
  }
}