ContentSource.java example

Explorer
solrcene-master
package org.apache.lucene.benchmark.byTask.feeds;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.byTask.utils.Config;

/**
 * Represents content from a specified source, such as TREC, Reuters etc. A
 * {@link ContentSource} is responsible for creating {@link DocData} objects for
 * its documents to be consumed by {@link DocMaker}. It also keeps track
 * of various statistics, such as how many documents were generated, size in
 * bytes etc.
 * <p>
 * Supports the following configuration parameters:
 * <ul>
 * <li><b>content.source.forever</b> - specifies whether to generate documents
 * forever (<b>default=true</b>).
 * <li><b>content.source.verbose</b> - specifies whether messages should be
 * output by the content source (<b>default=false</b>).
 * <li><b>content.source.encoding</b> - specifies which encoding to use when
 * reading the files of that content source. Certain implementations may define
 * a default value if this parameter is not specified. (<b>default=null</b>).
 * <li><b>content.source.log.step</b> - specifies for how many documents a
 * message should be logged. If set to 0 it means no logging should occur.
 * <b>NOTE:</b> if verbose is set to false, logging should not occur even if
 * logStep is not 0 (<b>default=0</b>).
 * </ul>
 */
public abstract class ContentSource {
  
  /** Stream type for files that should be read through a bzip2 decompressor. */
  private static final int BZIP = 0;
  /** Stream type for every file whose extension is not registered below. */
  private static final int OTHER = 1;
  /** Maps a file extension (including the leading dot) to a stream type. */
  private static final Map<String,Integer> extensionToType = new HashMap<String,Integer>();
  static {
    extensionToType.put(".bz2", Integer.valueOf(BZIP));
    extensionToType.put(".bzip", Integer.valueOf(BZIP));
  }
  
  /** Buffer size used when wrapping file streams. */
  protected static final int BUFFER_SIZE = 1 << 16; // 64K

  /** Bytes generated since the last {@link #resetInputs()}. */
  private long bytesCount;
  /** Bytes generated overall; not cleared by {@link #resetInputs()}. */
  private long totalBytesCount;
  /** Documents generated since the last {@link #resetInputs()}. */
  private int docsCount;
  /** Documents generated overall; not cleared by {@link #resetInputs()}. */
  private int totalDocsCount;
  /** The configuration given to {@link #setConfig(Config)}. */
  private Config config;

  /** Value of <code>content.source.forever</code>. */
  protected boolean forever;
  /** Value of <code>content.source.log.step</code>. */
  protected int logStep;
  /** Value of <code>content.source.verbose</code>. */
  protected boolean verbose;
  /** Value of <code>content.source.encoding</code>, may be <code>null</code>. */
  protected String encoding;
  
  private CompressorStreamFactory csFactory = new CompressorStreamFactory();

  /**
   * Adds <code>numBytes</code> to both the since-last-reset and the total byte
   * counters.
   */
  protected final synchronized void addBytes(long numBytes) {
    bytesCount += numBytes;
    totalBytesCount += numBytes;
  }
  
  /**
   * Increments both the since-last-reset and the total document counters by
   * one.
   */
  protected final synchronized void addDoc() {
    ++docsCount;
    ++totalDocsCount;
  }

  /**
   * A convenience method for collecting all the files of a content source from
   * a given directory. The collected {@link File} instances are stored in the
   * given <code>files</code>. Sub-directories are traversed recursively and
   * entries are visited in sorted order. Unreadable files and directories are
   * skipped, and so is a <code>dir</code> whose content cannot be listed
   * (e.g. it is not a directory, or an I/O error occurred).
   * 
   * @param dir the directory to scan
   * @param files the list to which readable files are appended
   */
  protected final void collectFiles(File dir, ArrayList<File> files) {
    if (!dir.canRead()) {
      return;
    }
    
    File[] dirFiles = dir.listFiles();
    if (dirFiles == null) {
      // listFiles() returns null if dir is not a directory or if an I/O error
      // occurred; there is nothing to collect in either case.
      return;
    }
    Arrays.sort(dirFiles);
    for (File file : dirFiles) {
      if (file.isDirectory()) {
        collectFiles(file, files);
      } else if (file.canRead()) {
        files.add(file);
      }
    }
  }

  /**
   * Returns an {@link InputStream} over the requested file. This method
   * attempts to identify the appropriate {@link InputStream} instance to return
   * based on the file name (e.g., if it ends with .bz2 or .bzip, return a
   * 'bzip' {@link InputStream}). Any other file is returned as a plain
   * buffered stream.
   * 
   * @param file the file to open
   * @return a buffered stream over the file, wrapped with a decompressor if
   *         the extension calls for one
   * @throws IOException if the file cannot be opened, or if the decompressing
   *         stream cannot be created (the original
   *         {@link CompressorException} is set as the cause)
   */
  protected InputStream getInputStream(File file) throws IOException {
    // First, create a FileInputStream, as this will be required by all types.
    // Wrap with BufferedInputStream for better performance
    InputStream is = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE);
    
    String fileName = file.getName();
    int idx = fileName.lastIndexOf('.');
    int type = OTHER;
    if (idx != -1) {
      Integer typeInt = extensionToType.get(fileName.substring(idx));
      if (typeInt != null) {
        type = typeInt.intValue();
      }
    }
    switch (type) {
      case BZIP:
        try {
          // According to BZip2CompressorInputStream's code, it reads the first 
          // two file header chars ('B' and 'Z'). It is important to wrap the
          // underlying input stream with a buffered one since
          // Bzip2CompressorInputStream uses the read() method exclusively.
          is = csFactory.createCompressorInputStream("bzip2", is);
        } catch (CompressorException e) {
          // The file is already open at this point; release it before
          // propagating the failure so the file handle is not leaked.
          try {
            is.close();
          } catch (IOException ignored) {
            // the compressor failure is the error worth reporting
          }
          IOException ioe = new IOException(e.getMessage());
          ioe.initCause(e);
          throw ioe;
        }
        break;
      default: // Do nothing, stay with the buffered file stream
    }
    
    return is;
  }
  
  /**
   * Returns true if it's time to log a message (depending on verbose and
   * the number of documents generated): verbose must be set, logStep must be
   * positive and the number of documents generated since the last reset must
   * be a multiple of logStep.
   */
  protected final boolean shouldLog() {
    return verbose && logStep > 0 && docsCount % logStep == 0;
  }

  /** Called when reading from this content source is no longer required. */
  public abstract void close() throws IOException;
  
  // NOTE(review): the getters below and resetInputs() are not synchronized,
  // while addBytes()/addDoc() are - confirm whether readers may run
  // concurrently with the producing threads.

  /** Returns the number of bytes generated since last reset. */
  public final long getBytesCount() { return bytesCount; }

  /** Returns the number of generated documents since last reset. */
  public final int getDocsCount() { return docsCount; }
  
  /** Returns the {@link Config} that was set by {@link #setConfig(Config)}. */
  public final Config getConfig() { return config; }

  /** Returns the next {@link DocData} from the content source. */
  public abstract DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException;

  /** Returns the total number of bytes that were generated by this source. */ 
  public final long getTotalBytesCount() { return totalBytesCount; }

  /** Returns the total number of generated documents. */
  public final int getTotalDocsCount() { return totalDocsCount; }

  /**
   * Resets the input for this content source, so that the test would behave as
   * if it was just started, input-wise.
   * <p>
   * <b>NOTE:</b> the default implementation resets the number of bytes and
   * documents generated since the last reset, so it's important to call
   * super.resetInputs in case you override this method. The total counters
   * are left untouched.
   */
  public void resetInputs() throws IOException {
    bytesCount = 0;
    docsCount = 0;
  }

  /**
   * Sets the {@link Config} for this content source and reads the
   * <code>content.source.*</code> parameters from it. If you override this
   * method, you must call super.setConfig.
   */
  public void setConfig(Config config) {
    this.config = config;
    forever = config.get("content.source.forever", true);
    logStep = config.get("content.source.log.step", 0);
    verbose = config.get("content.source.verbose", false);
    encoding = config.get("content.source.encoding", null);
  }

}