/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.hadoop.io;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.commoncrawl.util.ArcFileReader;

/**
 * A Hadoop {@link RecordReader} for reading {@link ARCSplit}s.
 *
 * @author Albert Chern
 */
public class ARCSplitReader implements RecordReader<Text, ArcFileItem> {

  private class IOThread extends Thread {

    @Override
    public void run() {
      for (int i = 0; i < readers.length; i++) {
        ARCResource resource = split.getResources()[i];
        ArcFileReader reader = readers[i];
        InputStream stream = null;
        long streamPosition = 0;
        int failures = 0;
        Throwable lastError = null;
        while (true) {
          try {
            stream = source.getStream(resource.getName(), streamPosition,
                lastError, failures);
            if (stream == null) {
              // The ARCSource is telling us to fail
              LOG.fatal("IOThread exiting", lastError);
              error = lastError;
              return;
            }
            ReadableByteChannel channel = Channels.newChannel(stream);
            while (true) {
              // Just keep passing buffers to the reader. The reader is
              // supposed to limit the number of buffers it will accept and
              // block when it is at the limit.
              ByteBuffer buffer = ByteBuffer.allocate(blockSize);
              int bytesRead = channel.read(buffer);
              if (bytesRead > 0) {
                streamPosition += bytesRead;
                totalBytesRead += bytesRead;
                buffer.flip();
                reader.available(buffer);
              } else if (bytesRead == -1) {
                // Create the next reader before closing this one so that we
                // don't get a race condition where the other thread tries to
                // access the next reader before it is instantiated.
                if (i + 1 < readers.length) {
                  readers[i + 1] = new ArcFileReader();
                }
                reader.finished();
                break;
              }
            }
            break;
          } catch (Throwable t1) {
            lastError = t1;
            failures++;
          } finally {
            try {
              if (stream != null) {
                stream.close();
              }
            } catch (Throwable t2) {
            }
            stream = null;
          }
        }
      }
    }
  }

  private static final Log LOG = LogFactory.getLog(ARCSplitReader.class);

  public static final String SPLIT_DETAILS = "arc.reader.split.details";

  private ARCSplit split;
  private ARCSource source;
  private int blockSize;
  private ArcFileReader[] readers;
  private int readerIndex;
  private long totalBytesRead;
  private Throwable error;

  /**
   * Creates a new <tt>ARCSplitReader</tt>.
   *
   * @param job
   *          the {@link JobConf} in which to record the split details
   * @param split
   *          the {@link ARCSplit} to read
   * @param source
   *          the {@link ARCSource} to open resources with
   * @param blockSize
   *          the number of bytes at a time to read from each input stream
   */
  public ARCSplitReader(JobConf job, ARCSplit split, ARCSource source,
      int blockSize) {
    this.split = split;
    // record split details in job config for debugging purposes ...
    job.set(SPLIT_DETAILS, split.toString());
    this.source = source;
    this.blockSize = blockSize;
    this.readers = new ArcFileReader[split.getResources().length];
    this.readers[0] = new ArcFileReader();
    this.readerIndex = 0;
    this.totalBytesRead = 0;
    this.error = null;
    new IOThread().start();
  }

  /**
   * @inheritDoc
   */
  public void close() throws IOException {
  }

  /**
   * @inheritDoc
   */
  public Text createKey() {
    return new Text();
  }

  /**
   * @inheritDoc
   */
  public ArcFileItem createValue() {
    return new ArcFileItem();
  }

  /**
   * @inheritDoc
   */
  public long getPos() throws IOException {
    return totalBytesRead;
  }

  /**
   * @inheritDoc
   */
  public float getProgress() throws IOException {
    return totalBytesRead / (float) split.getLength();
  }

  /**
   * Gets the next {@link ArcFileItem} from the split.
   *
   * @param item
   *          the {@link ArcFileItem} to populate
   *
   * @return <tt>true</tt> if a new item was read, or <tt>false</tt> if there
   *         are no more
   *
   * @throws IOException
   *           if an IO error or timeout occurs
   */
  public boolean next(ArcFileItem item) throws IOException {
    while (readerIndex < readers.length) {
      if (readers[readerIndex].hasMoreItems()) {
        // populate arc file path in item
        item.setArcFileName(split.getResources()[readerIndex].getName());
        try {
          // and then delegate to reader instance
          readers[readerIndex].getNextItem(item);
        } catch (IOException e) {
          LOG.error("IOException in ARCSplitReader.next().ArcFile:"
              + item.getArcFileName() + "\nException:"
              + StringUtils.stringifyException(e));
          throw e;
        } catch (Exception e) {
          LOG.error("Unknown Exception thrown in ARCSplitReader.next().ArcFile:"
              + item.getArcFileName() + "\nException:"
              + StringUtils.stringifyException(e));
          throw new RuntimeException(e);
        }
        return true;
      } else {
        readers[readerIndex++] = null;
      }
    }
    return false;
  }

  /**
   * @inheritDoc
   */
  public boolean next(Text key, ArcFileItem value) throws IOException {
    if (next(value)) {
      // set uri from given key ...
      key.set(value.getUri());
      // TODO: we are going to clear the uri field's dirty flag since it is
      // redundant; this means the uri will not be serialized with the rest
      // of the item's data...
      value.setFieldClean(ArcFileItem.Field_URI);
      return true;
    }
    return false;
  }
}
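
/*
 * Usage sketch (illustrative only): a RecordReader like this one is normally
 * constructed inside an InputFormat's getRecordReader(). The wiring below is
 * a hedged example, not the actual library code -- the ARCSource factory
 * method getARCSource() and the "arc.reader.block.size" property are
 * hypothetical names; only the ARCSplitReader constructor signature comes
 * from this file.
 *
 *   public RecordReader<Text, ArcFileItem> getRecordReader(
 *       InputSplit split, JobConf job, Reporter reporter) throws IOException {
 *     ARCSource source = getARCSource(job);                     // hypothetical factory
 *     int blockSize = job.getInt("arc.reader.block.size", 32 * 1024); // assumed default
 *     return new ARCSplitReader(job, (ARCSplit) split, source, blockSize);
 *   }
 */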