/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.cf.taste.hadoop.similarity.item;
import java.io.BufferedReader;
import java.io.File;
import java.io.FilenameFilter;
import java.util.Arrays;
import java.util.regex.Pattern;
import com.google.common.base.Charsets;
import com.google.common.io.Files;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
import org.apache.mahout.cf.taste.impl.TasteTestCase;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.CosineSimilarity;
import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.TanimotoCoefficientSimilarity;
import org.apache.mahout.math.map.OpenIntLongHashMap;
import org.easymock.EasyMock;
import org.junit.Test;
/**
* Unit tests for the mappers and reducers in org.apache.mahout.cf.taste.hadoop.similarity.item
* some integration tests with tiny data sets at the end
*/
public final class ItemSimilarityJobTest extends TasteTestCase {

  private static final Pattern TAB = Pattern.compile("\t");

  /**
   * Tests {@link ItemSimilarityJob.MostSimilarItemPairsMapper}.
   *
   * <p>With {@code maxSimilarItemsPerItem == 1}, only the single most similar pair
   * (34, 56) with similarity 0.9 may be written; the weaker pair (12, 34) must be dropped.
   */
  @Test
  public void testMostSimilarItemsPairsMapper() throws Exception {

    OpenIntLongHashMap indexItemIDMap = new OpenIntLongHashMap();
    indexItemIDMap.put(12, 12L);
    indexItemIDMap.put(34, 34L);
    indexItemIDMap.put(56, 56L);

    Mapper<IntWritable,VectorWritable,EntityEntityWritable,DoubleWritable>.Context context =
        EasyMock.createMock(Mapper.Context.class);

    // The mock is strict: exactly one write with exactly this pair is expected.
    context.write(new EntityEntityWritable(34L, 56L), new DoubleWritable(0.9));

    EasyMock.replay(context);

    Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE);
    vector.set(12, 0.2);
    vector.set(56, 0.9);

    ItemSimilarityJob.MostSimilarItemPairsMapper mapper = new ItemSimilarityJob.MostSimilarItemPairsMapper();
    // The mapper normally reads these from the job configuration in setup();
    // inject them directly via reflection for an isolated unit test.
    setField(mapper, "indexItemIDMap", indexItemIDMap);
    setField(mapper, "maxSimilarItemsPerItem", 1);

    mapper.map(new IntWritable(34), new VectorWritable(vector), context);

    EasyMock.verify(context);
  }

  /**
   * Tests {@link ItemSimilarityJob.MostSimilarItemPairsReducer}.
   *
   * <p>Duplicate similarity values for the same pair must be collapsed to a single output record.
   */
  @Test
  public void testMostSimilarItemPairsReducer() throws Exception {
    Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable>.Context context =
        EasyMock.createMock(Reducer.Context.class);

    context.write(new EntityEntityWritable(123L, 456L), new DoubleWritable(0.5));

    EasyMock.replay(context);

    new ItemSimilarityJob.MostSimilarItemPairsReducer().reduce(new EntityEntityWritable(123L, 456L),
        Arrays.asList(new DoubleWritable(0.5), new DoubleWritable(0.5)), context);

    EasyMock.verify(context);
  }

  /**
   * Integration test with a tiny data set
   *
   * <pre>
   * user-item-matrix
   *
   *        Game Mouse PC   Disk
   * Jane    -     1     2    -
   * Paul    1     -     1    -
   * Fred    -     -     -    1
   * </pre>
   */
  @Test
  public void testCompleteJob() throws Exception {

    File inputFile = getTestTempFile("prefs.txt");
    File outputDir = getTestTempDir("output");
    outputDir.delete();
    File tmpDir = getTestTempDir("tmp");

    // Preference triples: userID,itemID,value
    writeLines(inputFile,
        "2,1,1",
        "1,2,1",
        "3,4,1",
        "1,3,2",
        "2,3,1");

    ItemSimilarityJob similarityJob = new ItemSimilarityJob();

    Configuration conf = new Configuration();
    conf.set("mapred.input.dir", inputFile.getAbsolutePath());
    conf.set("mapred.output.dir", outputDir.getAbsolutePath());
    // Uncompressed output so the part file can be read back as plain text below.
    conf.setBoolean("mapred.output.compress", false);

    similarityJob.setConf(conf);

    similarityJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
        CosineSimilarity.class.getName() });

    File outPart = findFirstPartFile(outputDir);

    int linesWritten = 0;
    // try-with-resources guarantees the reader is closed even if an assertion fails
    try (BufferedReader reader = Files.newReader(outPart, Charsets.UTF_8)) {
      String line;
      int currentLine = 1;
      while ((line = reader.readLine()) != null) {

        // Each output line is: itemAID <TAB> itemBID <TAB> similarity
        String[] tokens = TAB.split(line);

        long itemAID = Long.parseLong(tokens[0]);
        long itemBID = Long.parseLong(tokens[1]);
        double similarity = Double.parseDouble(tokens[2]);

        if (currentLine == 1) {
          assertEquals(1L, itemAID);
          assertEquals(3L, itemBID);
          assertEquals(0.45, similarity, 0.01);
        }

        if (currentLine == 2) {
          assertEquals(2L, itemAID);
          assertEquals(3L, itemBID);
          assertEquals(0.89, similarity, 0.01);
        }

        currentLine++;
      }
      linesWritten = currentLine - 1;
    }

    assertEquals(2, linesWritten);
  }

  /**
   * integration test for the limitation of the number of computed similarities
   *
   * <pre>
   * user-item-matrix
   *
   *        i1  i2  i3
   *    u1   1   0   1
   *    u2   0   1   1
   *    u3   1   1   0
   *    u4   1   1   1
   *    u5   0   1   0
   *    u6   1   1   0
   *
   *    tanimoto(i1,i2) = 0.5
   *    tanimoto(i2,i3) = 0.333
   *    tanimoto(i3,i1) = 0.4
   *
   *    When we set maxSimilaritiesPerItem to 1 the following pairs should be found:
   *
   *    i1 --> i2
   *    i2 --> i1
   *    i3 --> i1
   * </pre>
   */
  @Test
  public void testMaxSimilaritiesPerItem() throws Exception {

    File inputFile = getTestTempFile("prefsForMaxSimilarities.txt");
    File outputDir = getTestTempDir("output");
    outputDir.delete();
    File tmpDir = getTestTempDir("tmp");

    // Preference triples: userID,itemID,value (binary 0/1 matrix from the javadoc above)
    writeLines(inputFile,
        "1,1,1",
        "1,3,1",
        "2,2,1",
        "2,3,1",
        "3,1,1",
        "3,2,1",
        "4,1,1",
        "4,2,1",
        "4,3,1",
        "5,2,1",
        "6,1,1",
        "6,2,1");

    ItemSimilarityJob similarityJob = new ItemSimilarityJob();

    Configuration conf = new Configuration();
    conf.set("mapred.input.dir", inputFile.getAbsolutePath());
    conf.set("mapred.output.dir", outputDir.getAbsolutePath());
    // Uncompressed output so the part file can be read back as plain text below.
    conf.setBoolean("mapred.output.compress", false);

    similarityJob.setConf(conf);

    similarityJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
        TanimotoCoefficientSimilarity.class.getName(), "--maxSimilaritiesPerItem", "1" });

    File outPart = findFirstPartFile(outputDir);

    int linesWritten = 0;
    // try-with-resources guarantees the reader is closed even if an assertion fails
    try (BufferedReader reader = Files.newReader(outPart, Charsets.UTF_8)) {
      String line;
      int currentLine = 1;
      while ((line = reader.readLine()) != null) {

        // Each output line is: itemAID <TAB> itemBID <TAB> similarity
        String[] tokens = TAB.split(line);

        long itemAID = Long.parseLong(tokens[0]);
        long itemBID = Long.parseLong(tokens[1]);
        double similarity = Double.parseDouble(tokens[2]);

        if (currentLine == 1) {
          assertEquals(1L, itemAID);
          assertEquals(2L, itemBID);
          assertEquals(0.5, similarity, 0.0001);
        }

        if (currentLine == 2) {
          assertEquals(1L, itemAID);
          assertEquals(3L, itemBID);
          assertEquals(0.4, similarity, 0.0001);
        }

        currentLine++;
      }
      linesWritten = currentLine - 1;
    }

    assertEquals(2, linesWritten);
  }

  /**
   * Returns the first {@code part-*} file in the given job output directory.
   *
   * <p>Fails the test with a clear message (instead of an opaque NPE or
   * ArrayIndexOutOfBoundsException) if the job produced no output.
   */
  private static File findFirstPartFile(File outputDir) {
    File[] partFiles = outputDir.listFiles(new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        return name.startsWith("part-");
      }
    });
    assertNotNull("output directory could not be listed: " + outputDir, partFiles);
    assertTrue("no part- file found in " + outputDir, partFiles.length > 0);
    return partFiles[0];
  }

}