/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.vectorizer;

import java.util.Arrays;

import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.junit.Test;

/**
* Tests tokenizing of {@link SequenceFile}s containing document ID and text (both as {@link Text})
* by the {@link DocumentProcessor} into {@link SequenceFile}s of document ID and tokens (as
* {@link StringTuple}).
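 * <p>
 * For reference, a minimal driver sketch for the same API (the paths below are
 * hypothetical placeholders, not taken from this test):
 * <pre>{@code
 * Configuration conf = new Configuration();
 * Path rawDocs = new Path("raw-docs");           // SequenceFile of (Text docId, Text text)
 * Path tokenizedDocs = new Path("tokenized");    // SequenceFile of (Text docId, StringTuple tokens)
 * DocumentProcessor.tokenizeDocuments(rawDocs, DefaultAnalyzer.class, tokenizedDocs, conf);
 * }</pre>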
*/
public class DocumentProcessorTest extends MahoutTestCase {

  @Test
  public void testTokenizeDocuments() throws Exception {
    Configuration configuration = new Configuration();
    FileSystem fs = FileSystem.get(configuration);
    Path input = new Path(getTestTempDirPath(), "inputDir");
    Path output = new Path(getTestTempDirPath(), "outputDir");

    String documentId1 = "123";
    String documentId2 = "456";
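
    // Stage the input corpus: a SequenceFile of (document ID, raw text) pairs,
    // the format DocumentProcessor consumes.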
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, configuration, input, Text.class, Text.class);
    try {
      String text1 = "A test for the document processor";
      writer.append(new Text(documentId1), new Text(text1));
      String text2 = "and another one";
      writer.append(new Text(documentId2), new Text(text2));
    } finally {
      Closeables.closeQuietly(writer);
    }
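
    // Tokenize the corpus. DefaultAnalyzer lower-cases tokens and removes common
    // English stop words, which is why "A", "for", "the", and "and" are absent
    // from the expected token lists below.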
    DocumentProcessor.tokenizeDocuments(input, DefaultAnalyzer.class, output, configuration);
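
    // Expect a single part file in the output (log and .crc files are filtered out).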
    FileStatus[] statuses = fs.listStatus(output, PathFilters.logsCRCFilter());
    assertEquals(1, statuses.length);
    Path filePath = statuses[0].getPath();
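
    // Read the tokenized records back and check both documents in input order.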
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, filePath, configuration);
    Text key = ClassUtils.instantiateAs((Class<? extends Text>) reader.getKeyClass(), Text.class);
    StringTuple value =
        ClassUtils.instantiateAs((Class<? extends StringTuple>) reader.getValueClass(), StringTuple.class);
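
    // The first record corresponds to document "123", the second to "456";
    // only the content-bearing tokens should remain after analysis.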
    assertTrue(reader.next(key, value));
    assertEquals(documentId1, key.toString());
    assertEquals(Arrays.asList("test", "document", "processor"), value.getEntries());

    assertTrue(reader.next(key, value));
    assertEquals(documentId2, key.toString());
    assertEquals(Arrays.asList("another", "one"), value.getEntries());

    Closeables.closeQuietly(reader);
  }
}