/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.vectorizer;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.VectorWritable;
import org.junit.Before;
import org.junit.Test;
import java.util.LinkedList;
import java.util.List;
public class EncodedVectorsFromSequenceFilesTest extends MahoutTestCase {
private static final int NUM_DOCS = 100;
private Configuration conf;
private Path inputPath;
@Override
@Before
public void setUp() throws Exception {
super.setUp();
conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
inputPath = getTestTempFilePath("documents/docs.file");
SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPath, Text.class, Text.class);
RandomDocumentGenerator gen = new RandomDocumentGenerator();
try {
for (int i = 0; i < NUM_DOCS; i++) {
writer.append(new Text("Document::ID::" + i), new Text(gen.getRandomDocument()));
}
} finally {
Closeables.closeQuietly(writer);
}
}
@Test
public void testCreate() throws Exception {
runTest(false, false);
}
@Test
public void testCreateNamed() throws Exception {
runTest(false, true);
}
@Test
public void testCreateSeq() throws Exception {
runTest(true, false);
}
@Test
public void testCreateSeqNamed() throws Exception {
runTest(true, true);
}
private void runTest(boolean sequential, boolean named) throws Exception {
Path tmpPath = getTestTempDirPath();
Path outputPath = new Path(tmpPath, "output");
List<String> argList = new LinkedList<String>();
argList.add("-i");
argList.add(inputPath.toString());
argList.add("-o");
argList.add(outputPath.toString());
if (sequential) {
argList.add("-seq");
}
if (named) {
argList.add("-nv");
}
String[] args = argList.toArray(new String[argList.size()]);
EncodedVectorsFromSequenceFiles.main(args);
SequenceFileDirIterator<Text, VectorWritable> iter = new SequenceFileDirIterator<Text, VectorWritable>(outputPath, PathType.LIST, PathFilters.partFilter(), null, true, conf);
int seen = 0;
while (iter.hasNext()) {
Pair<Text, VectorWritable> next = iter.next();
if (sequential && !named){
assertTrue(next.getSecond().get() instanceof SequentialAccessSparseVector);
} else if (named){
assertTrue(next.getSecond().get() instanceof NamedVector);
}
seen++;
}
assertEquals("Missed some vectors", NUM_DOCS, seen);
}
}