/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.vectorizer;

import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.junit.Test;

import java.util.Arrays;

/**
 * Tests tokenizing of {@link SequenceFile}s containing document ID and text (both as {@link Text})
 * by the {@link DocumentProcessor} into {@link SequenceFile}s of document ID and tokens (as
 * {@link StringTuple}).
 */
public class DocumentProcessorTest extends MahoutTestCase {

  @Test
  public void testTokenizeDocuments() throws Exception {
    Configuration configuration = new Configuration();
    FileSystem fs = FileSystem.get(configuration);
    Path input = new Path(getTestTempDirPath(), "inputDir");
    Path output = new Path(getTestTempDirPath(), "outputDir");

    String documentId1 = "123";
    String documentId2 = "456";

    // Write two small documents, keyed by document ID, into the input SequenceFile.
    SequenceFile.Writer writer =
        new SequenceFile.Writer(fs, configuration, input, Text.class, Text.class);
    try {
      String text1 = "A test for the document processor";
      writer.append(new Text(documentId1), new Text(text1));
      String text2 = "and another one";
      writer.append(new Text(documentId2), new Text(text2));
    } finally {
      Closeables.closeQuietly(writer);
    }

    DocumentProcessor.tokenizeDocuments(input, DefaultAnalyzer.class, output, configuration);

    // Exactly one part file is expected; the CRC filter hides Hadoop's checksum files.
    FileStatus[] statuses = fs.listStatus(output, PathFilters.logsCRCFilter());
    assertEquals(1, statuses.length);
    Path filePath = statuses[0].getPath();

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, filePath, configuration);
    try {
      Text key = ClassUtils.instantiateAs((Class<? extends Text>) reader.getKeyClass(), Text.class);
      StringTuple value =
          ClassUtils.instantiateAs((Class<? extends StringTuple>) reader.getValueClass(), StringTuple.class);

      // DefaultAnalyzer (backed by Lucene's StandardAnalyzer) lowercases tokens and drops
      // English stop words, so "A", "for", "the" and "and" do not appear in the output.
      reader.next(key, value);
      assertEquals(documentId1, key.toString());
      assertEquals(Arrays.asList("test", "document", "processor"), value.getEntries());

      reader.next(key, value);
      assertEquals(documentId2, key.toString());
      assertEquals(Arrays.asList("another", "one"), value.getEntries());
    } finally {
      Closeables.closeQuietly(reader);
    }
  }
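
  /**
   * A minimal companion sketch, not part of the original test: it runs the same
   * pipeline on a single hypothetical document ("789" / "the quick brown fox")
   * and reads the tokenized output back through Mahout's SequenceFileIterable,
   * which hides the raw reader bookkeeping shown above. The document text, paths,
   * and expected tokens here are illustrative assumptions; the iterable and Pair
   * types are fully qualified to keep the sketch self-contained without touching
   * the import list.
   */
  @Test
  public void testTokenizeDocumentsViaIterable() throws Exception {
    Configuration configuration = new Configuration();
    FileSystem fs = FileSystem.get(configuration);
    Path input = new Path(getTestTempDirPath(), "iterableInputDir");
    Path output = new Path(getTestTempDirPath(), "iterableOutputDir");

    SequenceFile.Writer writer =
        new SequenceFile.Writer(fs, configuration, input, Text.class, Text.class);
    try {
      // One document; the analyzer should drop the stop word "the" and keep the rest.
      writer.append(new Text("789"), new Text("the quick brown fox"));
    } finally {
      Closeables.closeQuietly(writer);
    }

    DocumentProcessor.tokenizeDocuments(input, DefaultAnalyzer.class, output, configuration);

    FileStatus[] statuses = fs.listStatus(output, PathFilters.logsCRCFilter());
    assertEquals(1, statuses.length);

    // SequenceFileIterable yields one Pair<key, value> per record in the part file.
    int records = 0;
    for (org.apache.mahout.common.Pair<Text, StringTuple> record
        : new org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable<Text, StringTuple>(
            statuses[0].getPath(), configuration)) {
      assertEquals("789", record.getFirst().toString());
      assertEquals(Arrays.asList("quick", "brown", "fox"), record.getSecond().getEntries());
      records++;
    }
    assertEquals(1, records);
  }
}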