/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.vectorizer;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.junit.Test;
public class SparseVectorsFromSequenceFilesTest extends MahoutTestCase {
private static final int NUM_DOCS = 100;
private Configuration conf;
private Path inputPath;
private void setupDocs() throws IOException {
conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
inputPath = getTestTempFilePath("documents/docs.file");
SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPath, Text.class, Text.class);
RandomDocumentGenerator gen = new RandomDocumentGenerator();
try {
for (int i = 0; i < NUM_DOCS; i++) {
writer.append(new Text("Document::ID::" + i), new Text(gen.getRandomDocument()));
}
} finally {
Closeables.closeQuietly(writer);
}
}
@Test
public void testCreateTermFrequencyVectors() throws Exception {
setupDocs();
runTest(false, false, -1, NUM_DOCS);
}
@Test
public void testCreateTermFrequencyVectorsNam() throws Exception {
setupDocs();
runTest(false, true, -1, NUM_DOCS);
}
@Test
public void testCreateTermFrequencyVectorsSeq() throws Exception {
setupDocs();
runTest(true, false, -1, NUM_DOCS);
}
@Test
public void testCreateTermFrequencyVectorsSeqNam() throws Exception {
setupDocs();
runTest(true, true, -1, NUM_DOCS);
}
@Test
public void testPruning() throws Exception {
conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
inputPath = getTestTempFilePath("documents/docs.file");
SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPath, Text.class, Text.class);
String [] docs = {"a b c", "a a a a a b", "a a a a a c"};
try {
for (int i = 0; i < docs.length; i++) {
writer.append(new Text("Document::ID::" + i), new Text(docs[i]));
}
} finally {
Closeables.closeQuietly(writer);
}
Path outPath = runTest(false, false, 2, docs.length);
Path tfidfVectors = new Path(outPath, "tfidf-vectors");
int count = 0;
Vector [] res = new Vector[docs.length];
for (VectorWritable value :
new SequenceFileDirValueIterable<VectorWritable>(
tfidfVectors, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
Vector v = value.get();
System.out.println(v);
assertEquals(2, v.size());
res[count] = v;
count++;
}
assertEquals(docs.length, count);
//the first doc should have two values, the second and third should have 1, since the a gets removed
assertEquals(2, res[0].getNumNondefaultElements());
assertEquals(1, res[1].getNumNondefaultElements());
assertEquals(1, res[2].getNumNondefaultElements());
}
private Path runTest(boolean sequential, boolean named, double maxDFSigma, int numDocs) throws Exception {
Path outputPath = getTestTempFilePath("output");
List<String> argList = new LinkedList<String>();
argList.add("-i");
argList.add(inputPath.toString());
argList.add("-o");
argList.add(outputPath.toString());
if (sequential) {
argList.add("-seq");
}
if (named) {
argList.add("-nv");
}
if (maxDFSigma >= 0){
argList.add("--maxDFSigma");
argList.add(String.valueOf(maxDFSigma));
}
String[] args = argList.toArray(new String[argList.size()]);
SparseVectorsFromSequenceFiles.main(args);
Path tfVectors = new Path(outputPath, "tf-vectors");
Path tfidfVectors = new Path(outputPath, "tfidf-vectors");
DictionaryVectorizerTest.validateVectors(conf, numDocs, tfVectors, sequential, named);
DictionaryVectorizerTest.validateVectors(conf, numDocs, tfidfVectors, sequential, named);
return outputPath;
}
}