package org.apache.lucene.benchmark.byTask.feeds;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.byTask.utils.Config;

/**
 * Represents content from a specified source, such as TREC, Reuters etc. A
 * {@link ContentSource} is responsible for creating {@link DocData} objects for
 * its documents to be consumed by {@link DocMaker}. It also keeps track
 * of various statistics, such as how many documents were generated, size in
 * bytes etc.
 * <p>
 * Supports the following configuration parameters:
 * <ul>
 * <li><b>content.source.forever</b> - specifies whether to generate documents
 * forever (<b>default=true</b>).
 * <li><b>content.source.verbose</b> - specifies whether messages should be
 * output by the content source (<b>default=false</b>).
 * <li><b>content.source.encoding</b> - specifies which encoding to use when
 * reading the files of that content source. Certain implementations may define
 * a default value if this parameter is not specified. (<b>default=null</b>).
 * <li><b>content.source.log.step</b> - specifies for how many documents a
 * message should be logged. If set to 0 it means no logging should occur.
 * <b>NOTE:</b> if verbose is set to false, logging should not occur even if
 * logStep is not 0 (<b>default=0</b>).
 * </ul>
 */
public abstract class ContentSource {

  /** Stream type: the file is bzip2-compressed. */
  private static final int BZIP = 0;
  /** Stream type: anything else, read as-is. */
  private static final int OTHER = 1;

  /** Maps a file extension (including the leading dot) to a stream type. */
  private static final Map<String,Integer> extensionToType = new HashMap<String,Integer>();
  static {
    extensionToType.put(".bz2", Integer.valueOf(BZIP));
    extensionToType.put(".bzip", Integer.valueOf(BZIP));
  }

  /** Buffer size used when wrapping file streams. */
  protected static final int BUFFER_SIZE = 1 << 16; // 64K

  // Counters since the last resetInputs() call, and since creation.
  private long bytesCount;
  private long totalBytesCount;
  private int docsCount;
  private int totalDocsCount;

  private Config config;

  // Values read from the Config in setConfig().
  protected boolean forever;
  protected int logStep;
  protected boolean verbose;
  protected String encoding;

  private final CompressorStreamFactory csFactory = new CompressorStreamFactory();

  /**
   * Adds <code>numBytes</code> to both the since-last-reset and the total byte
   * counters.
   */
  protected final synchronized void addBytes(long numBytes) {
    bytesCount += numBytes;
    totalBytesCount += numBytes;
  }

  /**
   * Increments both the since-last-reset and the total document counters by
   * one.
   */
  protected final synchronized void addDoc() {
    ++docsCount;
    ++totalDocsCount;
  }

  /**
   * A convenience method for collecting all the files of a content source from
   * a given directory. The collected {@link File} instances are stored in the
   * given <code>files</code>. Directories are traversed recursively, in the
   * sorted order of their entries; unreadable entries are skipped.
   * <p>
   * If <code>dir</code> cannot be read or cannot be listed (e.g. it is not a
   * directory), nothing is added.
   */
  protected final void collectFiles(File dir, ArrayList<File> files) {
    if (!dir.canRead()) {
      return;
    }

    File[] dirFiles = dir.listFiles();
    if (dirFiles == null) {
      // listFiles() returns null when 'dir' is not a directory or an I/O
      // error occurs; there is nothing to collect in that case.
      return;
    }

    Arrays.sort(dirFiles);
    for (File file : dirFiles) {
      if (file.isDirectory()) {
        collectFiles(file, files);
      } else if (file.canRead()) {
        files.add(file);
      }
    }
  }

  /**
   * Returns an {@link InputStream} over the requested file. This method
   * attempts to identify the appropriate {@link InputStream} instance to return
   * based on the file name (e.g., if it ends with .bz2 or .bzip, return a
   * 'bzip' {@link InputStream}).
   *
   * @throws IOException if the file cannot be opened, or if the compressor
   *         stream cannot be created (the underlying file stream is closed in
   *         that case).
   */
  protected InputStream getInputStream(File file) throws IOException {
    // First, create a FileInputStream, as this will be required by all types.
    // Wrap with BufferedInputStream for better performance
    InputStream is = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE);

    String fileName = file.getName();
    int idx = fileName.lastIndexOf('.');
    int type = OTHER;
    if (idx != -1) {
      Integer typeInt = extensionToType.get(fileName.substring(idx));
      if (typeInt != null) {
        type = typeInt.intValue();
      }
    }

    switch (type) {
      case BZIP:
        try {
          // According to BZip2CompressorInputStream's code, it reads the first
          // two file header chars ('B' and 'Z'). It is important to wrap the
          // underlying input stream with a buffered one since
          // Bzip2CompressorInputStream uses the read() method exclusively.
          is = csFactory.createCompressorInputStream("bzip2", is);
        } catch (CompressorException e) {
          // The file stream is already open at this point; release it so a
          // bad/corrupt archive does not leak a file handle.
          try {
            is.close();
          } catch (IOException ignored) {
            // Secondary failure; the CompressorException below is the one
            // worth reporting.
          }
          IOException ioe = new IOException(e.getMessage());
          ioe.initCause(e);
          throw ioe;
        }
        break;
      default: // Do nothing, stay with the buffered file stream
    }

    return is;
  }

  /**
   * Returns true if it's time to log a message (depending on verbose and
   * the number of documents generated).
   */
  protected final boolean shouldLog() {
    return verbose && logStep > 0 && docsCount % logStep == 0;
  }

  /** Called when reading from this content source is no longer required. */
  public abstract void close() throws IOException;

  /** Returns the number of bytes generated since last reset. */
  public final long getBytesCount() {
    return bytesCount;
  }

  /** Returns the number of generated documents since last reset. */
  public final int getDocsCount() {
    return docsCount;
  }

  /** Returns the {@link Config} that was given to {@link #setConfig(Config)}. */
  public final Config getConfig() {
    return config;
  }

  /** Returns the next {@link DocData} from the content source. */
  public abstract DocData getNextDocData(DocData docData)
      throws NoMoreDataException, IOException;

  /** Returns the total number of bytes that were generated by this source. */
  public final long getTotalBytesCount() {
    return totalBytesCount;
  }

  /** Returns the total number of generated documents. */
  public final int getTotalDocsCount() {
    return totalDocsCount;
  }

  /**
   * Resets the input for this content source, so that the test would behave as
   * if it was just started, input-wise.
   * <p>
   * <b>NOTE:</b> the default implementation resets the number of bytes and
   * documents generated since the last reset, so it's important to call
   * super.resetInputs in case you override this method. The total counters
   * are not affected.
   */
  public void resetInputs() throws IOException {
    bytesCount = 0;
    docsCount = 0;
  }

  /**
   * Sets the {@link Config} for this content source. If you override this
   * method, you must call super.setConfig.
   */
  public void setConfig(Config config) {
    this.config = config;
    forever = config.get("content.source.forever", true);
    logStep = config.get("content.source.log.step", 0);
    verbose = config.get("content.source.verbose", false);
    encoding = config.get("content.source.encoding", null);
  }

}