/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.utils.vectors;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import org.apache.commons.io.Charsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Iterator;
import java.util.Set;
/**
 * Command-line job that reads one or more {@link org.apache.hadoop.io.SequenceFile}s containing
 * {@link Vector}s and writes a textual dump of them either to the console or to a file.
 *
 * <p>Each vector is rendered with {@code VectorHelper.vectorToJson} by default, or with
 * {@code VectorHelper.vectorToCSVString} when the {@code csv} option is given. The vector is taken
 * from the record's value unless the {@code useKey} option is set, in which case it is taken from
 * the record's key.</p>
 */
public final class VectorDumper extends AbstractJob {

  private static final Logger log = LoggerFactory.getLogger(VectorDumper.class);

  private VectorDumper() {
  }

  /**
   * Parses the command line, resolves the input files and dumps the selected vectors.
   *
   * @param args command-line arguments; see the {@code addOption} calls below for the supported options
   * @return {@code -1} if argument parsing did not yield a result, {@code 0} once the dump has been written
   * @throws Exception if the input cannot be read, the dictionary type is not recognised, or a record
   *                   holds neither a {@link VectorWritable} nor a {@link WeightedPropertyVectorWritable}
   */
  @Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector than dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
    addOption("csv", "c", "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
        + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude "
        + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vecors", false);
    addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
        + " conjunction with -sort", false);
    addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter."
        + "  Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null));

    if (parseArguments(args, false, true) == null) {
      return -1;
    }

    Configuration conf = new Configuration();
    Path[] pathArr = resolveInputPaths(FileSystem.get(conf), getInputPath());

    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
      log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
    if (hasOption("dictionary")) {
      String dictFile = getOption("dictionary");
      String dictionaryType = getOption("dictionaryType", "text");
      switch (dictionaryType) {
        case "text":
          dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
          break;
        // The option's help text advertises "seqfile"; "sequencefile" is kept for existing callers.
        case "seqfile":
        case "sequencefile":
          dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
          break;
        default:
          //TODO: support Lucene's FST as a dictionary type
          throw new IOException("Invalid dictionary type: " + dictionaryType);
      }
    }

    // A null filter set means "no filtering".
    Set<String> filters = hasOption("filter") ? Sets.newHashSet(getOptions("filter")) : null;

    boolean useCSV = hasOption("csv");
    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    // Must match the option name registered above ("useKey").
    boolean transposeKeyValue = hasOption("useKey");

    Writer writer;
    boolean shouldClose;
    File output = getOutputFile();
    if (output != null) {
      shouldClose = true;
      log.info("Output file: {}", output);
      Files.createParentDirs(output);
      writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
      // Writing to the console: System.out must stay open, so it is flushed but never closed.
      shouldClose = false;
      writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }

    try {
      boolean printKey = hasOption("printKey");

      if (useCSV && dictionary != null) {
        // Header comment line listing the dictionary terms in index order.
        writer.write("#");
        for (int j = 0; j < dictionary.length; j++) {
          writer.write(dictionary[j]);
          if (j < dictionary.length - 1) {
            writer.write(',');
          }
        }
        writer.write('\n');
      }

      long maxItems = Long.MAX_VALUE;
      if (hasOption("numItems")) {
        maxItems = Long.parseLong(getOption("numItems"));
        if (!quiet) {
          writer.append("#Max Items to dump: ").append(String.valueOf(maxItems)).append('\n');
        }
      }

      int maxIndexesPerVector = hasOption("vectorSize")
          ? Integer.parseInt(getOption("vectorSize"))
          : Integer.MAX_VALUE;

      long itemCount = 0;
      int fileCount = 0;
      for (Path path : pathArr) {
        if (itemCount >= maxItems) {
          break;
        }
        fileCount++;
        if (!quiet) {
          log.info("Processing file '{}' ({}/{})", path, fileCount, pathArr.length);
        }

        SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<>(path, true, conf);
        Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
        // Per-file running index, used as the label of unnamed vectors in sizeOnly mode.
        long i = 0;
        while (iterator.hasNext() && itemCount < maxItems) {
          Pair<Writable, Writable> record = iterator.next();
          Writable keyWritable = record.getFirst();
          Writable valueWritable = record.getSecond();
          Writable vectorWritable = transposeKeyValue ? keyWritable : valueWritable;
          Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;

          Vector vector = extractVector(vectorWritable);
          boolean named = vector instanceof NamedVector;

          // Filters only apply to named vectors; unnamed vectors always pass.
          boolean accepted = filters == null
              || !named
              || filters.contains(((NamedVector) vector).getName());
          if (!accepted) {
            continue;
          }

          // In nameOnly mode (without sizeOnly) an unnamed vector produces no output at all.
          boolean producesOutput = sizeOnly || !nameOnly || named;
          if (producesOutput) {
            // The key is written only together with its vector, so a skipped record never
            // leaves a dangling "key<TAB>" fragment on the following line.
            if (printKey) {
              writer.write(notTheVectorWritable.toString());
              writer.write('\t');
            }
            if (sizeOnly) {
              writer.write(named ? ((NamedVector) vector).getName() : String.valueOf(i++));
              writer.write(":");
              writer.write(String.valueOf(vector.size()));
            } else if (nameOnly) {
              writer.write(((NamedVector) vector).getName());
            } else if (useCSV) {
              writer.write(VectorHelper.vectorToCSVString(vector, namesAsComments));
            } else {
              writer.write(VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector, sortVectors));
            }
            writer.write('\n');
          }
          // Every record that passed the filter counts towards the numItems limit.
          itemCount++;
        }
      }
      writer.flush();
    } finally {
      if (shouldClose) {
        Closeables.close(writer, false);
      }
    }

    return 0;
  }

  /**
   * Expands the input location into the list of files to read: the filtered listing of a
   * directory, or otherwise the result of glob-matching the path.
   */
  private static Path[] resolveInputPaths(FileSystem fs, Path input) throws IOException {
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
      return FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
    }
    FileStatus[] matches = fs.globStatus(input);
    if (matches == null) {
      throw new IOException("No input found at: " + input);
    }
    return FileUtil.stat2Paths(matches);
  }

  /**
   * Unwraps the {@link Vector} carried by a record's key or value.
   *
   * @throws ClassCastException if the writable is neither a {@link VectorWritable} nor a
   *                            {@link WeightedPropertyVectorWritable}
   */
  private static Vector extractVector(Writable writable) {
    if (writable instanceof VectorWritable) {
      return ((VectorWritable) writable).get();
    }
    if (writable instanceof WeightedPropertyVectorWritable) {
      return ((WeightedPropertyVectorWritable) writable).getVector();
    }
    throw new ClassCastException("Expected a VectorWritable or WeightedPropertyVectorWritable but found: "
        + (writable == null ? "null" : writable.getClass().getName()));
  }

  /**
   * Command-line entry point; delegates to {@link ToolRunner} with a fresh {@link Configuration}.
   */
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Configuration(), new VectorDumper(), args);
  }

}