/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.classifier;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.List;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Flatten files into the format read by the Bayes M/R job.
 * <p/>
 * One document per line: the first token is the label, followed by a tab; the
 * rest of the line contains the document's terms.
 */
public final class BayesFileFormatter {

  private static final Logger log = LoggerFactory.getLogger(BayesFileFormatter.class);

  private BayesFileFormatter() { }

  /**
   * Collapse all the files in the inputDir into a single file in the proper Bayes format,
   * one document per line.
   *
   * @param label      the label to assign to every document
   * @param analyzer   the {@link Analyzer} to use
   * @param inputDir   the input directory
   * @param charset    the charset of the input files
   * @param outputFile the file to collapse to
   */
  public static void collapse(String label, Analyzer analyzer, File inputDir,
                              Charset charset, File outputFile) throws IOException {
    Writer writer = Files.newWriter(outputFile, charset);
    try {
      // listFiles() is used here only as a visitor mechanism: FileProcessor.accept()
      // does the actual work (including recursing into subdirectories) and always
      // returns false, so no file list is materialized.
      inputDir.listFiles(new FileProcessor(label, analyzer, charset, writer));
    } finally {
      Closeables.closeQuietly(writer);
    }
  }
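  /*
   * Usage sketch (illustrative, not part of the original class): collapsing a
   * directory of documents labeled "sports" into a single training file. The
   * paths and the label are hypothetical, chosen only for the example:
   *
   *   File docs = new File("/tmp/sports-docs");        // hypothetical input dir
   *   File out = new File("/tmp/bayes/sports.txt");    // hypothetical output file
   *   BayesFileFormatter.collapse("sports",
   *       new StandardAnalyzer(Version.LUCENE_31), docs, Charsets.UTF_8, out);
   */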
  /**
   * Write the input files to the output directory, one output file per input file.
   *
   * @param label    the label of the file(s)
   * @param analyzer the {@link Analyzer} to use
   * @param input    the input file or directory; may not be null
   * @param charset  the character set of the input files
   * @param outDir   the output directory; files will be written there with the
   *                 same name as the input file
   */
  public static void format(String label, Analyzer analyzer, File input,
                            Charset charset, File outDir) throws IOException {
    if (input.isDirectory()) {
      input.listFiles(new FileProcessor(label, analyzer, charset, outDir));
    } else {
      Writer writer = Files.newWriter(new File(outDir, input.getName()), charset);
      try {
        writeFile(label, analyzer, input, charset, writer);
      } finally {
        Closeables.closeQuietly(writer);
      }
    }
  }

  /**
   * Exploits the FileFilter mechanism as a visitor so that we don't materialize
   * the contents of large directories and don't have to iterate the listing twice.
   */
  private static final class FileProcessor implements FileFilter {

    private final String label;
    private final Analyzer analyzer;
    private File outputDir;
    private final Charset charset;
    private Writer writer;

    /**
     * Use this constructor when collapsing all files into a single file.
     *
     * @param writer must not be null and will not be closed by this class
     */
    private FileProcessor(String label, Analyzer analyzer, Charset charset, Writer writer) {
      this.label = label;
      this.analyzer = analyzer;
      this.charset = charset;
      this.writer = writer;
    }

    /**
     * Use this constructor when writing one output file per input file.
     *
     * @param outputDir must not be null
     */
    private FileProcessor(String label, Analyzer analyzer, Charset charset, File outputDir) {
      this.label = label;
      this.analyzer = analyzer;
      this.charset = charset;
      this.outputDir = outputDir;
    }

    @Override
    public boolean accept(File file) {
      if (file.isFile()) {
        Writer theWriter = null;
        try {
          if (writer == null) {
            theWriter = Files.newWriter(new File(outputDir, file.getName()), charset);
          } else {
            theWriter = writer;
          }
          writeFile(label, analyzer, file, charset, theWriter);
          if (writer != null) {
            // collapsing into a shared writer: separate documents with a newline
            theWriter.write('\n');
          }
        } catch (IOException e) {
          // TODO: report failed files instead of throwing an exception
          throw new IllegalStateException(e);
        } finally {
          if (writer == null) {
            Closeables.closeQuietly(theWriter);
          }
        }
      } else {
        // recurse into subdirectories
        file.listFiles(this);
      }
      return false;
    }
  }

  /**
   * Write the label and the tokens from the given file to the writer.
   *
   * @param label    the label
   * @param analyzer the {@link Analyzer} to use
   * @param inFile   the file to read; its contents are passed to the analyzer
   * @param charset  character encoding to assume when reading the input file
   * @param writer   the {@link Writer}; it is not closed by this method
   * @throws IOException if there was a problem with the reader
   */
  private static void writeFile(String label, Analyzer analyzer, File inFile,
                                Charset charset, Writer writer) throws IOException {
    Reader reader = Files.newReader(inFile, charset);
    try {
      TokenStream ts = analyzer.reusableTokenStream(label, reader);
      writer.write(label);
      // a tab separates the label from the terms so the output matches the
      // key/value convention of Hadoop's standard TextInputFormat
      writer.write('\t');
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        char[] termBuffer = termAtt.buffer();
        int termLen = termAtt.length();
        writer.write(termBuffer, 0, termLen);
        writer.write(' ');
      }
    } finally {
      Closeables.closeQuietly(reader);
    }
  }
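  /*
   * Example of the emitted line format (illustrative; the exact tokens depend on
   * the configured Analyzer). For label "sports" and an input file containing
   * "The game was great", writeFile() would emit the single line:
   *
   *   sports<TAB>game great
   *
   * assuming StandardAnalyzer, which lowercases terms and drops default English
   * stopwords such as "the" and "was". Each term is followed by a space.
   */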
  /**
   * Tokenize the contents of a Reader into an array of terms.
   *
   * @param analyzer the {@link Analyzer} to use
   * @param reader   the reader to feed to the analyzer
   * @return an array of the tokens produced by the analyzer (duplicates are not removed)
   */
  public static String[] readerToDocument(Analyzer analyzer, Reader reader) throws IOException {
    TokenStream ts = analyzer.reusableTokenStream("", reader);
    List<String> coll = Lists.newArrayList();
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      char[] termBuffer = termAtt.buffer();
      int termLen = termAtt.length();
      String val = new String(termBuffer, 0, termLen);
      coll.add(val);
    }
    return coll.toArray(new String[coll.size()]);
  }

  /**
   * Run the FileFormatter.
   *
   * @param args the input arguments; run with -h to see the help
   * @throws ClassNotFoundException if the Analyzer can't be found
   * @throws IllegalAccessException if the Analyzer can't be constructed
   * @throws InstantiationException if the Analyzer can't be constructed
   * @throws IOException            if the files can't be read or written properly
   */
  public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = DefaultOptionCreator.inputOption().create();
    Option outputOpt = DefaultOptionCreator.outputOption().create();

    Option labelOpt = obuilder.withLongName("label").withRequired(true).withArgument(
        abuilder.withName("label").withMinimum(1).withMaximum(1).create())
        .withDescription("The label of the file").withShortName("l").create();

    Option analyzerOpt = obuilder.withLongName("analyzer").withArgument(
        abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
        .withDescription("The fully qualified class name of the analyzer to use. "
            + "Must have a no-arg constructor. Default is the StandardAnalyzer")
        .withShortName("a").create();

    Option charsetOpt = obuilder.withLongName("charset").withArgument(
        abuilder.withName("charset").withMinimum(1).withMaximum(1).create())
        .withDescription("The character encoding of the input files").withShortName("c").create();

    Option collapseOpt = obuilder.withLongName("collapse").withRequired(true).withArgument(
        abuilder.withName("collapse").withMinimum(1).withMaximum(1).create())
        .withDescription("Collapse a whole directory to a single file, one doc per line")
        .withShortName("p").create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
        .withOption(labelOpt).withOption(analyzerOpt).withOption(charsetOpt)
        .withOption(collapseOpt).withOption(helpOpt).create();
    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {
        return;
      }
      File input = new File((String) cmdLine.getValue(inputOpt));
      File output = new File((String) cmdLine.getValue(outputOpt));
      String label = (String) cmdLine.getValue(labelOpt);
      Analyzer analyzer;
      if (cmdLine.hasOption(analyzerOpt)) {
        analyzer = ClassUtils.instantiateAs((String) cmdLine.getValue(analyzerOpt), Analyzer.class);
      } else {
        analyzer = new StandardAnalyzer(Version.LUCENE_31);
      }
      Charset charset = Charsets.UTF_8;
      if (cmdLine.hasOption(charsetOpt)) {
        charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
      }
      boolean collapse = cmdLine.hasOption(collapseOpt);

      if (collapse) {
        collapse(label, analyzer, input, charset, output);
      } else {
        format(label, analyzer, input, charset, output);
      }
    } catch (OptionException e) {
      log.error("Exception", e);
    }
  }
}
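/*
 * Command-line sketch (illustrative; the jar name and classpath are assumptions,
 * and Mahout normally launches drivers through its own scripts):
 *
 *   java -cp mahout-core.jar org.apache.mahout.classifier.BayesFileFormatter \
 *       --input /tmp/sports-docs --output /tmp/bayes \
 *       --label sports --charset UTF-8 --collapse true
 *
 * Because --collapse is declared required and main() only tests hasOption(), the
 * collapse() path is always taken when parsing succeeds.
 */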