/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.classifier;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.List;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.util.HelpFormatter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Flattens files into the format that can be read by the Bayes M/R job.
 * <p/>
 * One document per line: the first token is the label, followed by a tab; the rest of the line holds the terms.
 */
public final class BayesFileFormatter {
/** SLF4J logger shared by the static methods of this class. */
private static final Logger log = LoggerFactory.getLogger(BayesFileFormatter.class);
/** Private constructor: prevents instantiation from outside this class. */
private BayesFileFormatter() { }
/**
 * Collapses all the files found under {@code inputDir} into a single file in the Bayes format,
 * one document per line.
 * <p/>
 * Sub-directories are visited recursively by the {@code FileProcessor} filter. If {@code inputDir}
 * is not a directory, {@link File#listFiles(FileFilter)} returns {@code null}, nothing is visited
 * and the output file is left empty.
 *
 * @param label
 *          The label written at the start of every output line
 * @param analyzer
 *          The analyzer used to tokenize each input file
 * @param inputDir
 *          The input directory
 * @param charset
 *          The charset used to read the input files and to write the output file
 * @param outputFile
 *          The file to collapse to
 * @throws IOException
 *           if the output file cannot be opened, or if closing (and therefore flushing) it fails
 * @throws IllegalStateException
 *           if processing an individual input file fails; the underlying {@link IOException}
 *           is attached as the cause
 */
public static void collapse(String label, Analyzer analyzer, File inputDir,
                            Charset charset, File outputFile) throws IOException {
  Writer writer = Files.newWriter(outputFile, charset);
  // Assume failure until the traversal has completed, so that a close() error is only
  // swallowed when it would otherwise mask an exception that is already propagating.
  boolean threw = true;
  try {
    // listFiles() is called purely for its side effect: the filter processes every file it is
    // offered (recursing into directories) and rejects them all, so the result is not needed.
    inputDir.listFiles(new FileProcessor(label, analyzer, charset, writer));
    threw = false;
  } finally {
    // Closing a Writer flushes it first; a failure here means output was lost and must be
    // reported rather than silently ignored.
    Closeables.close(writer, threw);
  }
}
/**
 * Writes the input files to {@code outDir}, one output file per input file.
 * <p/>
 * When {@code input} is a directory it is walked recursively and every regular file found is
 * written to {@code outDir} under its own simple name. Otherwise {@code input} itself is written
 * to {@code outDir}.
 *
 * @param label
 *          The label written at the start of each output file
 * @param analyzer
 *          The analyzer used to tokenize the input
 * @param input
 *          The input file or directory. May not be null
 * @param charset
 *          The character set used to read the input and write the output
 * @param outDir
 *          The output directory. Files are written there with the same name as the input file.
 *          Nothing in this method creates the directory, so it presumably has to exist already
 * @throws IOException
 *           if a single input file cannot be read or its output cannot be written, flushed or closed
 * @throws IllegalStateException
 *           if {@code input} is a directory and processing one of its files fails; the underlying
 *           {@link IOException} is attached as the cause
 */
public static void format(String label, Analyzer analyzer, File input,
                          Charset charset, File outDir) throws IOException {
  if (input.isDirectory()) {
    // The filter does the actual work while the directory is being listed; the returned
    // array is irrelevant because the filter rejects every entry.
    input.listFiles(new FileProcessor(label, analyzer, charset, outDir));
    return;
  }
  Writer writer = Files.newWriter(new File(outDir, input.getName()), charset);
  // Only swallow a close() failure when another exception is already on its way out;
  // otherwise a failed flush would silently leave a truncated output file behind.
  boolean threw = true;
  try {
    writeFile(label, analyzer, input, charset, writer);
    threw = false;
  } finally {
    Closeables.close(writer, threw);
  }
}
/**
 * A {@link FileFilter} that is used only for its side effects: every regular file offered to
 * {@link #accept(File)} is tokenized and written out, and every other entry is listed with this
 * same filter, which makes the traversal recursive. The filter always answers {@code false}, so
 * the arrays returned by {@code listFiles} stay empty.
 * <p/>
 * This hack of the FileFilter mechanism avoids getting stuck on large directories and avoids
 * looping over the listing twice.
 * <p/>
 * Exactly one of {@code writer} (collapse mode) and {@code outputDir} (one output file per
 * input file) is non-null, depending on the constructor used.
 */
private static final class FileProcessor implements FileFilter {
  private final String label;
  private final Analyzer analyzer;
  private final Charset charset;
  /** Target directory in one-file-per-input mode; {@code null} in collapse mode. */
  private final File outputDir;
  /** Shared writer in collapse mode; {@code null} in one-file-per-input mode. */
  private final Writer writer;

  /**
   * Use this when you want to collapse all files to a single file.
   *
   * @param label
   *          The label
   * @param analyzer
   *          The analyzer used to tokenize each file
   * @param charset
   *          The charset used to read the input files
   * @param writer
   *          must not be null and will not be closed
   */
  private FileProcessor(String label, Analyzer analyzer, Charset charset, Writer writer) {
    this.label = label;
    this.analyzer = analyzer;
    this.charset = charset;
    this.writer = writer;
    this.outputDir = null;
  }

  /**
   * Use this when you want a writer per file.
   *
   * @param label
   *          The label
   * @param analyzer
   *          The analyzer used to tokenize each file
   * @param charset
   *          The charset used to read the input files and write the output files
   * @param outputDir
   *          must not be null.
   */
  private FileProcessor(String label, Analyzer analyzer, Charset charset, File outputDir) {
    this.label = label;
    this.analyzer = analyzer;
    this.charset = charset;
    this.outputDir = outputDir;
    this.writer = null;
  }

  /**
   * Processes {@code file} as a side effect of filtering.
   *
   * @param file
   *          the entry offered by {@code listFiles}
   * @return always {@code false}; nothing needs to be collected by the caller
   * @throws IllegalStateException
   *           wrapping any {@link IOException} raised while reading or writing
   */
  @Override
  public boolean accept(File file) {
    if (file.isFile()) {
      try {
        if (writer == null) {
          writeToOwnFile(file);
        } else {
          writeFile(label, analyzer, file, charset, writer);
          // In collapse mode each document occupies one line of the shared output.
          writer.write('\n');
        }
      } catch (IOException e) {
        // TODO: report failed files instead of throwing exception
        throw new IllegalStateException(e);
      }
    } else {
      // Not a regular file: list it with this same filter so that directories are walked
      // recursively. For anything that is not a directory listFiles() simply returns null.
      file.listFiles(this);
    }
    return false;
  }

  /** Writes {@code file} to a same-named file in {@code outputDir}, reporting close failures. */
  private void writeToOwnFile(File file) throws IOException {
    Writer perFileWriter = Files.newWriter(new File(outputDir, file.getName()), charset);
    // Swallow a close() failure only if another exception is already propagating; a failed
    // flush on the success path must surface instead of leaving a truncated file unnoticed.
    boolean threw = true;
    try {
      writeFile(label, analyzer, file, charset, perFileWriter);
      threw = false;
    } finally {
      Closeables.close(perFileWriter, threw);
    }
  }
}
/**
 * Writes the label, a tab, and then every token the analyzer produces for {@code inFile},
 * each token followed by a single space. No line terminator is written.
 *
 * @param label
 *          The label; it is also passed to the analyzer as the field name
 * @param analyzer
 *          The analyzer to use
 * @param inFile
 *          the file to read and whose contents are passed to the analyzer
 * @param charset
 *          character encoding to assume when reading the input file
 * @param writer
 *          The Writer, is not closed by this method
 * @throws java.io.IOException
 *           if the input file cannot be opened or read, or if writing to {@code writer} fails
 */
private static void writeFile(String label, Analyzer analyzer, File inFile,
                              Charset charset, Writer writer) throws IOException {
  Reader reader = Files.newReader(inFile, charset);
  try {
    TokenStream ts = analyzer.reusableTokenStream(label, reader);
    writer.write(label);
    // Tab between label and terms; per the original author's note this is meant to match
    // Hadoop's standard text input format.
    writer.write('\t');
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // Only the first length() chars of the attribute's buffer belong to the current term.
      writer.write(termAtt.buffer(), 0, termAtt.length());
      writer.write(' ');
    }
    // Complete the TokenStream consumer workflow: end() after the last incrementToken(),
    // then close() to release the stream's resources before the analyzer reuses it.
    ts.end();
    ts.close();
  } finally {
    // Read side only, so a failure while closing cannot lose data; also covers the error paths
    // on which the token stream was never closed.
    Closeables.closeQuietly(reader);
  }
}
/**
 * Tokenizes the contents of a Reader into an array of terms.
 * <p/>
 * The tokens are returned in the order the analyzer emits them. Duplicates are NOT removed:
 * a term that occurs several times appears several times in the result.
 * <p/>
 * The reader is not closed by this method. The token stream is deliberately not closed either,
 * because closing it would typically close the caller-owned reader as well.
 *
 * @param analyzer
 *          The Analyzer to use
 * @param reader
 *          The reader to feed to the Analyzer
 * @return An array of all tokens, in stream order (possibly empty, never null)
 * @throws IOException
 *           if the analyzer fails while reading from {@code reader}
 */
public static String[] readerToDocument(Analyzer analyzer, Reader reader) throws IOException {
  TokenStream ts = analyzer.reusableTokenStream("", reader);
  List<String> coll = Lists.newArrayList();
  CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    // Copy the term out of the shared buffer; the attribute is overwritten by the next token.
    coll.add(new String(termAtt.buffer(), 0, termAtt.length()));
  }
  // Signal end-of-stream as required by the TokenStream consumer workflow.
  ts.end();
  return coll.toArray(new String[coll.size()]);
}
/**
 * Runs the BayesFileFormatter from the command line.
 * <p/>
 * With {@code --collapse} present, everything under the input directory is collapsed into the
 * single output file. Without it, the output is treated as a directory and one output file is
 * written per input file. Usage help is printed when the help option is given and also when
 * the command line cannot be parsed.
 *
 * @param args
 *          The input args. Run with -h to see the help
 * @throws ClassNotFoundException
 *           if the Analyzer can't be found
 * @throws IllegalAccessException
 *           if the Analyzer can't be constructed
 * @throws InstantiationException
 *           if the Analyzer can't be constructed
 * @throws IOException
 *           if the files can't be dealt with properly
 */
public static void main(String[] args) throws Exception {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option inputOpt = DefaultOptionCreator.inputOption().create();
  Option outputOpt = DefaultOptionCreator.outputOption().create();

  Option labelOpt = obuilder.withLongName("label").withRequired(true).withArgument(
      abuilder.withName("label").withMinimum(1).withMaximum(1).create()).withDescription("The label of the file")
      .withShortName("l").create();

  Option analyzerOpt = obuilder.withLongName("analyzer").withArgument(
      abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create()).withDescription(
      "The fully qualified class name of the analyzer to use. "
          + "Must have a no-arg constructor. Default is the StandardAnalyzer").withShortName("a").create();

  Option charsetOpt = obuilder.withLongName("charset").withArgument(
      abuilder.withName("charset").withMinimum(1).withMaximum(1).create()).withDescription(
      "The character encoding of the input file").withShortName("c").create();

  // Deliberately not marked as required: the option selects between collapse() and format(),
  // and a required option would make the format() branch below unreachable. Command lines that
  // already pass --collapse keep working unchanged.
  Option collapseOpt = obuilder.withLongName("collapse").withArgument(
      abuilder.withName("collapse").withMinimum(1).withMaximum(1).create()).withDescription(
      "Collapse a whole directory to a single file, one doc per line").withShortName("p").create();

  Option helpOpt = DefaultOptionCreator.helpOption();

  Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(labelOpt)
      .withOption(analyzerOpt).withOption(charsetOpt).withOption(collapseOpt).withOption(helpOpt).create();

  try {
    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);

    if (cmdLine.hasOption(helpOpt)) {
      printHelp(group);
      return;
    }

    File input = new File((String) cmdLine.getValue(inputOpt));
    File output = new File((String) cmdLine.getValue(outputOpt));
    String label = (String) cmdLine.getValue(labelOpt);

    Analyzer analyzer;
    if (cmdLine.hasOption(analyzerOpt)) {
      analyzer = ClassUtils.instantiateAs((String) cmdLine.getValue(analyzerOpt), Analyzer.class);
    } else {
      analyzer = new StandardAnalyzer(Version.LUCENE_31);
    }

    // UTF-8 unless the caller names another charset.
    Charset charset = Charsets.UTF_8;
    if (cmdLine.hasOption(charsetOpt)) {
      charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
    }

    // Only the presence of the option matters; its argument value is not inspected.
    if (cmdLine.hasOption(collapseOpt)) {
      collapse(label, analyzer, input, charset, output);
    } else {
      format(label, analyzer, input, charset, output);
    }
  } catch (OptionException e) {
    log.error("Exception", e);
    // Show the expected options so the user can correct the command line.
    printHelp(group);
  }
}

/** Prints the usage/help text for the given option group to the formatter's default output. */
private static void printHelp(Group group) {
  HelpFormatter formatter = new HelpFormatter();
  formatter.setGroup(group);
  formatter.print();
}
}