/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.mahout.utils.vectors;

import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import org.apache.commons.io.Charsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Iterator;
import java.util.Set;

/**
 * Reads a {@link org.apache.hadoop.io.SequenceFile} of {@link Vector}s and dumps
 * the contents, as JSON or CSV, to either the console or a file.
 */
public final class VectorDumper extends AbstractJob {

  private static final Logger log = LoggerFactory.getLogger(VectorDumper.class);

  private VectorDumper() {
  }

  @Override
  public int run(String[] args) throws Exception {
    /**
     Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
     abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
     "The Sequence File containing the Vectors").withShortName("s").create();
     Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
     abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
     .withDescription("The directory containing Sequence File of Vectors")
     .withShortName("d").create();
     */
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector then dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true)");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|sequencefile)", false);
    addOption("csv", "c", "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
        + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude "
        + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vectors", false);
    addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
        + " conjunction with -sort)", false);
    addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter."
        + " Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null));
    if (parseArguments(args, false, true) == null) {
      return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path input = getInputPath();
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
      pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
    } else {
      FileStatus[] inputPaths = fs.globStatus(input);
      pathArr = new Path[inputPaths.length];
      int i = 0;
      for (FileStatus fstatus : inputPaths) {
        pathArr[i++] = fstatus.getPath();
      }
    }

    String dictionaryType = getOption("dictionaryType", "text");

    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
      log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
    if (hasOption("dictionary")) {
      String dictFile = getOption("dictionary");
      switch (dictionaryType) {
        case "text":
          dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
          break;
        case "sequencefile":
          dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
          break;
        default:
          //TODO: support Lucene's FST as a dictionary type
          throw new IOException("Invalid dictionary type: " + dictionaryType);
      }
    }

    Set<String> filters;
    if (hasOption("filter")) {
      filters = Sets.newHashSet(getOptions("filter"));
    } else {
      filters = null;
    }

    boolean useCSV = hasOption("csv");
    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    // "useKey" means the key holds the vector, so key and value are swapped when dumping.
    boolean transposeKeyValue = hasOption("useKey");

    Writer writer;
    boolean shouldClose;
    File output = getOutputFile();
    if (output != null) {
      shouldClose = true;
      log.info("Output file: {}", output);
      Files.createParentDirs(output);
      writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
      shouldClose = false;
      writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
      boolean printKey = hasOption("printKey");
      if (useCSV && dictionary != null) {
        // Emit the dictionary terms as a comma-separated header comment line.
        writer.write("#");
        for (int j = 0; j < dictionary.length; j++) {
          writer.write(dictionary[j]);
          if (j < dictionary.length - 1) {
            writer.write(',');
          }
        }
        writer.write('\n');
      }
      Long numItems = null;
      if (hasOption("numItems")) {
        numItems = Long.parseLong(getOption("numItems"));
        // Skip this informational header line in quiet mode.
        if (!quiet) {
          writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
        }
      }
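      // Main dump loop: each SequenceFile in pathArr is scanned and one line is written per vector,
      // subject to the numItems limit, the name filter, and the sizeOnly/nameOnly/CSV output modes.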
      int maxIndexesPerVector = hasOption("vectorSize")
          ? Integer.parseInt(getOption("vectorSize"))
          : Integer.MAX_VALUE;
      long itemCount = 0;
      int fileCount = 0;
      for (Path path : pathArr) {
        if (numItems != null && numItems <= itemCount) {
          break;
        }
        if (!quiet) {
          log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
        }
        SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<>(path, true, conf);
        Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
        long i = 0;
        while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
          Pair<Writable, Writable> record = iterator.next();
          Writable keyWritable = record.getFirst();
          Writable valueWritable = record.getSecond();
          if (printKey) {
            Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
            writer.write(notTheVectorWritable.toString());
            writer.write('\t');
          }
          Vector vector;
          try {
            vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
          } catch (ClassCastException e) {
            // Fall back for values that are WeightedPropertyVectorWritable (e.g. cluster
            // classification output) rather than VectorWritable.
            if ((transposeKeyValue ? keyWritable : valueWritable)
                instanceof WeightedPropertyVectorWritable) {
              vector =
                  ((WeightedPropertyVectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).getVector();
            } else {
              throw e;
            }
          }
          if (filters == null
              || !(vector instanceof NamedVector)
              || filters.contains(((NamedVector) vector).getName())) {
            if (sizeOnly) {
              if (vector instanceof NamedVector) {
                writer.write(((NamedVector) vector).getName());
                writer.write(":");
              } else {
                writer.write(String.valueOf(i++));
                writer.write(":");
              }
              writer.write(String.valueOf(vector.size()));
              writer.write('\n');
            } else if (nameOnly) {
              if (vector instanceof NamedVector) {
                writer.write(((NamedVector) vector).getName());
                writer.write('\n');
              }
            } else {
              String fmtStr;
              if (useCSV) {
                fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
              } else {
                fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector, sortVectors);
              }
              writer.write(fmtStr);
              writer.write('\n');
            }
            itemCount++;
          }
        }
      }
      writer.flush();
    } finally {
      if (shouldClose) {
        Closeables.close(writer, false);
      }
    }

    return 0;
  }

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Configuration(), new VectorDumper(), args);
  }
}
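
/*
 * Example invocation (a sketch, not part of the class above): all paths below are hypothetical
 * and would need to point at a real SequenceFile of vectors, a readable dictionary, and a
 * writable local output file. The flags correspond to the options registered in run().
 *
 *   String[] dumpArgs = {
 *       "--input", "/tmp/mahout/clusteredPoints",
 *       "--output", "/tmp/mahout/vectors.txt",
 *       "--dictionary", "/tmp/mahout/dictionary.file-0",
 *       "--dictionaryType", "sequencefile",
 *       "--printKey",
 *       "--sortVectors"
 *   };
 *   ToolRunner.run(new Configuration(), new VectorDumper(), dumpArgs);
 */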