/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.utils.vectors;
import com.google.common.base.Charsets;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.util.HelpFormatter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
/**
 * Command-line tool that reads one or more {@link SequenceFile}s containing {@link Vector}s
 * (wrapped in {@link VectorWritable}) and dumps them either to the console or to a file.
 *
 * <p>Each vector is rendered via {@link VectorHelper}: as CSV when {@code --csv} is given,
 * otherwise as JSON (optionally using a term dictionary). With {@code --sizeOnly} only the
 * vector's name (or a running index) and its size are written.</p>
 */
public final class VectorDumper {

  private static final Logger log = LoggerFactory.getLogger(VectorDumper.class);

  /** Utility class with a static entry point only; not instantiable. */
  private VectorDumper() {
  }

  /**
   * Parses the command line and dumps the vectors found in the files matching {@code --seqFile}.
   *
   * <p>If {@code --seqFile} is absent nothing is dumped. If the command line cannot be parsed
   * (or {@code --dictionaryType} is neither {@code text} nor {@code sequencefile}) the error is
   * logged and the usage help is printed.</p>
   *
   * @param args the command-line arguments
   * @throws IllegalArgumentException if {@code --seqFile} names a non-glob path that does not exist
   * @throws Exception on any other failure while reading the input or writing the output
   */
  public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
        abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
        "The Sequence File containing the Vectors").withShortName("s").create();
    Option vectorAsKeyOpt = obuilder.withLongName("useKey").withRequired(false).withDescription(
        "If the Key is a vector, then dump that instead").withShortName("u").create();
    Option printKeyOpt = obuilder.withLongName("printKey").withRequired(false).withDescription(
        "Print out the key as well, delimited by a tab (or the value if useKey is true)").withShortName("p")
        .create();
    Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
        "The output file. If not specified, dumps to the console").withShortName("o").create();
    Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument(
        abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription(
        "The dictionary file. ").withShortName("d").create();
    Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
        abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
        "The dictionary file type (text|sequencefile)").withShortName("dt").create();
    Option csvOpt = obuilder.withLongName("csv").withRequired(false).withDescription(
        "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries")
        .withShortName("c").create();
    // Short name is "nc" rather than "n": "-n" is already taken by numItems, and two options
    // sharing a trigger in the same group would shadow one another.
    Option namesAsCommentsOpt = obuilder.withLongName("namesAsComments").withRequired(false).withDescription(
        "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name")
        .withShortName("nc").create();
    Option sortVectorsOpt = obuilder.withLongName("sortVectors").withRequired(false).withDescription(
        "Sort output key/value pairs of the vector entries in abs magnitude descending order")
        .withShortName("sort").create();
    Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(false)
        .withDescription("Dump only the size of the vector").withShortName("sz").create();
    Option numItemsOpt = obuilder.withLongName("numItems").withRequired(false).withArgument(
        abuilder.withName("n").withMinimum(1).withMaximum(1).create())
        .withDescription("Output at most <n> vectors").withShortName("n").create();
    Option numIndexesPerVectorOpt = obuilder.withLongName("vectorSize").withShortName("vs")
        .withRequired(false).withArgument(abuilder.withName("vs").withMinimum(1)
            .withMaximum(1).create())
        .withDescription("Truncate vectors to <vs> length when dumping (most useful when in"
            + " conjunction with -sort)").create();
    Option filtersOpt = obuilder.withLongName("filter").withRequired(false).withArgument(
        abuilder.withName("filter").withMinimum(1).withMaximum(100).create())
        .withDescription("Only dump out those vectors whose name matches the filter."
            + " Multiple items may be specified by repeating the argument.").withShortName("fi").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
        .create();

    // Every option that is queried below must be registered here, otherwise the parser
    // rejects it and hasOption() can never return true for it.
    Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt)
        .withOption(dictTypeOpt).withOption(dictOpt).withOption(csvOpt)
        .withOption(namesAsCommentsOpt)
        .withOption(vectorAsKeyOpt).withOption(printKeyOpt).withOption(sortVectorsOpt)
        .withOption(filtersOpt).withOption(helpOpt).withOption(numItemsOpt)
        .withOption(sizeOpt).withOption(numIndexesPerVectorOpt).create();

    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelpWithGenericOptions(group);
        return;
      }

      if (cmdLine.hasOption(seqOpt)) {
        Configuration conf = new Configuration();
        Path pathPattern = new Path(cmdLine.getValue(seqOpt).toString());
        // Resolve the filesystem from the path itself so that fully qualified URIs on a
        // non-default filesystem work; unqualified paths still resolve to the default one.
        FileSystem fs = pathPattern.getFileSystem(conf);
        FileStatus[] inputPaths = fs.globStatus(pathPattern);
        if (inputPaths == null) {
          // globStatus yields null for a non-glob path that does not exist.
          throw new IllegalArgumentException("Input path does not exist: " + pathPattern);
        }
        if (inputPaths.length == 0) {
          log.warn("No input files match '{}'", pathPattern);
        }

        String dictionaryType = "text";
        if (cmdLine.hasOption(dictTypeOpt)) {
          dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
        }

        boolean sortVectors = cmdLine.hasOption(sortVectorsOpt);
        log.info("Sort? {}", sortVectors);

        String[] dictionary = null;
        if (cmdLine.hasOption(dictOpt)) {
          if ("text".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(new File(cmdLine.getValue(dictOpt).toString()));
          } else if ("sequencefile".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(conf, cmdLine.getValue(dictOpt).toString());
          } else {
            // Unknown dictionary type: handled by the catch below (log + print help).
            throw new OptionException(dictTypeOpt);
          }
        }

        // null means "no filtering"; otherwise only NamedVectors whose name is listed are dumped.
        Set<String> filters = null;
        if (cmdLine.hasOption(filtersOpt)) {
          filters = new HashSet<String>(cmdLine.getValues(filtersOpt));
        }

        boolean useCSV = cmdLine.hasOption(csvOpt);
        boolean sizeOnly = cmdLine.hasOption(sizeOpt);
        boolean namesAsComments = cmdLine.hasOption(namesAsCommentsOpt);
        boolean transposeKeyValue = cmdLine.hasOption(vectorAsKeyOpt);
        boolean printKey = cmdLine.hasOption(printKeyOpt);

        // Only a writer we opened ourselves (file output) is closed; System.out is left open.
        boolean shouldClose = cmdLine.hasOption(outputOpt);
        Writer writer;
        if (shouldClose) {
          writer = Files.newWriter(new File(cmdLine.getValue(outputOpt).toString()), Charsets.UTF_8);
        } else {
          // Same encoding as file output, independent of the platform default charset.
          writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
        }

        try {
          if (useCSV && dictionary != null) {
            // Comment line listing the dictionary terms, comma separated.
            writer.write("#");
            for (int j = 0; j < dictionary.length; j++) {
              writer.write(dictionary[j]);
              if (j < dictionary.length - 1) {
                writer.write(',');
              }
            }
            writer.write('\n');
          }

          long maxItems = Long.MAX_VALUE;
          if (cmdLine.hasOption(numItemsOpt)) {
            maxItems = Long.parseLong(cmdLine.getValue(numItemsOpt).toString());
            writer.append("#Max Items to dump: ").append(String.valueOf(maxItems)).append('\n');
          }

          int maxIndexesPerVector = cmdLine.hasOption(numIndexesPerVectorOpt)
              ? Integer.parseInt(cmdLine.getValue(numIndexesPerVectorOpt).toString())
              : Integer.MAX_VALUE;

          long itemCount = 0;
          int fileCount = 0;
          for (FileStatus stat : inputPaths) {
            if (itemCount >= maxItems) {
              break;
            }
            Path path = stat.getPath();
            log.info("Processing file '{}' ({}/{})",
                new Object[]{path, ++fileCount, inputPaths.length});
            SequenceFileIterable<Writable, Writable> iterable =
                new SequenceFileIterable<Writable, Writable>(path, true, conf);
            Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
            // Running index used to label unnamed vectors in sizeOnly mode; restarts per file.
            long i = 0;
            while (iterator.hasNext() && itemCount < maxItems) {
              Pair<Writable, Writable> record = iterator.next();
              Writable keyWritable = record.getFirst();
              Writable valueWritable = record.getSecond();

              VectorWritable vectorWritable =
                  (VectorWritable) (transposeKeyValue ? keyWritable : valueWritable);
              Vector vector = vectorWritable.get();

              // Filter before anything is written, so that a skipped record does not leave
              // a dangling "key<TAB>" prefix in the output.
              if (filters != null
                  && vector instanceof NamedVector
                  && !filters.contains(((NamedVector) vector).getName())) {
                continue;
              }

              if (printKey) {
                Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
                writer.write(notTheVectorWritable.toString());
                writer.write('\t');
              }

              if (sizeOnly) {
                if (vector instanceof NamedVector) {
                  writer.write(((NamedVector) vector).getName());
                } else {
                  writer.write(String.valueOf(i++));
                }
                writer.write(":");
                writer.write(String.valueOf(vector.size()));
                writer.write('\n');
              } else {
                String fmtStr = useCSV
                    ? VectorHelper.vectorToCSVString(vector, namesAsComments)
                    : VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector, sortVectors);
                writer.write(fmtStr);
                writer.write('\n');
              }
              itemCount++;
            }
          }
          writer.flush();
        } finally {
          if (shouldClose) {
            Closeables.closeQuietly(writer);
          }
        }
      }
    } catch (OptionException e) {
      log.error("Exception", e);
      printHelp(group);
    }
  }

  /**
   * Prints the usage help for the given option group using a default {@link HelpFormatter}.
   *
   * @param group the option group to describe
   */
  private static void printHelp(Group group) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.setGroup(group);
    formatter.print();
  }
}