// ItemSimilarityJobTest.java example
//
// Explorer
// mahout-rbmClassifier-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.cf.taste.hadoop.similarity.item;

import java.io.BufferedReader;
import java.io.File;
import java.io.FilenameFilter;
import java.util.Arrays;
import java.util.regex.Pattern;

import com.google.common.base.Charsets;
import com.google.common.io.Files;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
import org.apache.mahout.cf.taste.impl.TasteTestCase;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.CosineSimilarity;
import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.TanimotoCoefficientSimilarity;
import org.apache.mahout.math.map.OpenIntLongHashMap;
import org.easymock.EasyMock;
import org.junit.Test;

/**
 * Unit tests for the mappers and reducers in org.apache.mahout.cf.taste.hadoop.similarity.item,
 * followed by integration tests that run the complete {@link ItemSimilarityJob} on tiny data sets.
 */
public final class ItemSimilarityJobTest extends TasteTestCase {

  /** Splits one line of job output into its tab-separated columns. */
  private static final Pattern TAB = Pattern.compile("\t");

  /**
   * Tests {@link ItemSimilarityJob.MostSimilarItemPairsMapper}
   *
   * The mapper is fed the row of item 34, which holds similarities to item 12 (0.2) and
   * item 56 (0.9). With maxSimilarItemsPerItem set to 1, the mock only accepts a single
   * write of the pair (34, 56) with similarity 0.9.
   */
  @Test
  public void testMostSimilarItemsPairsMapper() throws Exception {

    // identity mapping from vector index to item ID
    OpenIntLongHashMap indexItemIDMap = new OpenIntLongHashMap();
    indexItemIDMap.put(12, 12L);
    indexItemIDMap.put(34, 34L);
    indexItemIDMap.put(56, 56L);

    Mapper<IntWritable,VectorWritable,EntityEntityWritable,DoubleWritable>.Context context =
      EasyMock.createMock(Mapper.Context.class);

    // record the only write the mapper is allowed to perform
    context.write(new EntityEntityWritable(34L, 56L), new DoubleWritable(0.9));

    EasyMock.replay(context);

    Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE);
    vector.set(12, 0.2);
    vector.set(56, 0.9);

    // inject the state directly into the mapper's fields instead of running its setup
    ItemSimilarityJob.MostSimilarItemPairsMapper mapper = new ItemSimilarityJob.MostSimilarItemPairsMapper();
    setField(mapper, "indexItemIDMap", indexItemIDMap);
    setField(mapper, "maxSimilarItemsPerItem", 1);

    mapper.map(new IntWritable(34), new VectorWritable(vector), context);

    EasyMock.verify(context);
  }

  /**
   * Tests {@link ItemSimilarityJob.MostSimilarItemPairsReducer}
   *
   * The same pair (123, 456) arrives twice with similarity 0.5; the mock only accepts
   * a single write of that pair with similarity 0.5.
   */
  @Test
  public void testMostSimilarItemPairsReducer() throws Exception {
    Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable>.Context context =
      EasyMock.createMock(Reducer.Context.class);

    // record the only write the reducer is allowed to perform
    context.write(new EntityEntityWritable(123L, 456L), new DoubleWritable(0.5));

    EasyMock.replay(context);

    new ItemSimilarityJob.MostSimilarItemPairsReducer().reduce(new EntityEntityWritable(123L, 456L),
        Arrays.asList(new DoubleWritable(0.5), new DoubleWritable(0.5)), context);

    EasyMock.verify(context);
  }

  /**
   * Integration test with a tiny data set
   *
   * <pre>
   * user-item-matrix
   *
   *        Game   Mouse   PC    Disk
   * Jane    -       1      2      -
   * Paul    1       -      1      -
   * Fred    -       -      -      1
   * </pre>
   *
   * The input lines are "userID,itemID,preference" with Jane=1, Paul=2, Fred=3 and
   * Game=1, Mouse=2, PC=3, Disk=4. Exactly two cosine similarities are expected in the
   * output: (1,3) at about 0.45 and (2,3) at about 0.89.
   */
  @Test
  public void testCompleteJob() throws Exception {

    File inputFile = getTestTempFile("prefs.txt");
    File outputDir = getTestTempDir("output");
    // NOTE(review): presumably removed because the job must create the output directory itself - confirm
    outputDir.delete();
    File tmpDir = getTestTempDir("tmp");

    writeLines(inputFile,
        "2,1,1",
        "1,2,1",
        "3,4,1",
        "1,3,2",
        "2,3,1");

    ItemSimilarityJob similarityJob = new ItemSimilarityJob();

    Configuration conf = new Configuration();
    conf.set("mapred.input.dir", inputFile.getAbsolutePath());
    conf.set("mapred.output.dir", outputDir.getAbsolutePath());
    conf.setBoolean("mapred.output.compress", false);

    similarityJob.setConf(conf);

    similarityJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
       CosineSimilarity.class.getName() });

    File outPart = findPartFile(outputDir);
    BufferedReader reader = Files.newReader(outPart, Charsets.UTF_8);

    int currentLine = 1;
    try {
      String line;
      while ((line = reader.readLine()) != null) {

        // each output line is "itemA<TAB>itemB<TAB>similarity"
        String[] tokens = TAB.split(line);

        long itemAID = Long.parseLong(tokens[0]);
        long itemBID = Long.parseLong(tokens[1]);
        double similarity = Double.parseDouble(tokens[2]);

        if (currentLine == 1) {
          assertEquals(1L, itemAID);
          assertEquals(3L, itemBID);
          assertEquals(0.45, similarity, 0.01);
        }

        if (currentLine == 2) {
          assertEquals(2L, itemAID);
          assertEquals(3L, itemBID);
          assertEquals(0.89, similarity, 0.01);
        }

        currentLine++;
      }
    } finally {
      // release the file handle even when an assertion above fails
      reader.close();
    }

    int linesWritten = currentLine - 1;
    assertEquals(2, linesWritten);
  }

  /**
   * integration test for the limitation of the number of computed similarities
   *
   * <pre>
   * user-item-matrix
   *
   *        i1  i2  i3
   *    u1   1   0   1
   *    u2   0   1   1
   *    u3   1   1   0
   *    u4   1   1   1
   *    u5   0   1   0
   *    u6   1   1   0
   *
   *    tanimoto(i1,i2) = 0.5
   *    tanimoto(i2,i3) = 0.333
   *    tanimoto(i3,i1) = 0.4
   *
   *    When we set maxSimilaritiesPerItem to 1 the following pairs should be found:
   *
   *    i1 --> i2
   *    i2 --> i1
   *    i3 --> i1
   * </pre>
   *
   * The output is expected to hold exactly two lines: (1,2) with 0.5 and (1,3) with 0.4.
   */
  @Test
  public void testMaxSimilaritiesPerItem() throws Exception {

    File inputFile = getTestTempFile("prefsForMaxSimilarities.txt");
    File outputDir = getTestTempDir("output");
    // NOTE(review): presumably removed because the job must create the output directory itself - confirm
    outputDir.delete();
    File tmpDir = getTestTempDir("tmp");

    writeLines(inputFile,
        "1,1,1",
        "1,3,1",
        "2,2,1",
        "2,3,1",
        "3,1,1",
        "3,2,1",
        "4,1,1",
        "4,2,1",
        "4,3,1",
        "5,2,1",
        "6,1,1",
        "6,2,1");

    ItemSimilarityJob similarityJob = new ItemSimilarityJob();

    Configuration conf = new Configuration();
    conf.set("mapred.input.dir", inputFile.getAbsolutePath());
    conf.set("mapred.output.dir", outputDir.getAbsolutePath());
    conf.setBoolean("mapred.output.compress", false);

    similarityJob.setConf(conf);

    similarityJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
        TanimotoCoefficientSimilarity.class.getName(), "--maxSimilaritiesPerItem", "1" });

    File outPart = findPartFile(outputDir);
    BufferedReader reader = Files.newReader(outPart, Charsets.UTF_8);

    int currentLine = 1;
    try {
      String line;
      while ((line = reader.readLine()) != null) {

        // each output line is "itemA<TAB>itemB<TAB>similarity"
        String[] tokens = TAB.split(line);

        long itemAID = Long.parseLong(tokens[0]);
        long itemBID = Long.parseLong(tokens[1]);
        double similarity = Double.parseDouble(tokens[2]);

        if (currentLine == 1) {
          assertEquals(1L, itemAID);
          assertEquals(2L, itemBID);
          assertEquals(0.5, similarity, 0.0001);
        }

        if (currentLine == 2) {
          assertEquals(1L, itemAID);
          assertEquals(3L, itemBID);
          assertEquals(0.4, similarity, 0.0001);
        }

        currentLine++;
      }
    } finally {
      // release the file handle even when an assertion above fails
      reader.close();
    }

    int linesWritten = currentLine - 1;
    assertEquals(2, linesWritten);
  }

  /**
   * Returns the first file in the given directory whose name starts with "part-".
   *
   * Fails the calling test with a descriptive message, rather than a bare
   * NullPointerException or ArrayIndexOutOfBoundsException, when the directory cannot be
   * listed or holds no such file.
   *
   * @param outputDir directory the job was configured to write its output to
   * @return the first matching file as returned by {@link File#listFiles(FilenameFilter)}
   */
  private static File findPartFile(File outputDir) {
    File[] partFiles = outputDir.listFiles(new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        return name.startsWith("part-");
      }
    });
    // listFiles() yields null when the path is not a directory or cannot be read
    assertNotNull("Could not list output directory " + outputDir, partFiles);
    assertTrue("No part- file found in output directory " + outputDir, partFiles.length > 0);
    return partFiles[0];
  }

}