/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.hadoop.similarity.cooccurrence;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.hadoop.MathHelper;
import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.TanimotoCoefficientSimilarity;
import org.junit.Test;
import java.io.File;
/**
 * Integration tests that run {@link RowSimilarityJob} end-to-end on tiny, hand-checkable matrices using
 * {@link TanimotoCoefficientSimilarity} and compare every cell of the resulting similarity matrix.
 */
public class RowSimilarityJobTest extends MahoutTestCase {

  /**
   * integration test with a tiny data set
   *
   * <pre>
   *
   * input matrix:
   *
   * 1, 0, 1, 1, 0
   * 0, 0, 1, 1, 0
   * 0, 0, 0, 0, 1
   *
   * similarity matrix (via tanimoto):
   *
   * 1,     0.666, 0
   * 0.666, 1,     0
   * 0,     0,     1
   * </pre>
   */
  @Test
  public void toyIntegration() throws Exception {
    Matrix similarityMatrix = runRowSimilarityJob(new double[][] {
        new double[] { 1, 0, 1, 1, 0 },
        new double[] { 0, 0, 1, 1, 0 },
        new double[] { 0, 0, 0, 0, 1 } },
        5);

    assertSimilarities(new double[][] {
        new double[] { 1.0, 0.666666, 0.0 },
        new double[] { 0.666666, 1.0, 0.0 },
        new double[] { 0.0, 0.0, 1.0 } },
        similarityMatrix);
  }

  /**
   * integration test with a tiny data set, keeping at most one similarity per row and
   * excluding self similarity
   *
   * <pre>
   *
   * input matrix:
   *
   * 1, 0, 1, 1, 0, 1
   * 0, 1, 1, 1, 1, 1
   * 1, 1, 0, 1, 0, 0
   *
   * pairwise tanimoto coefficients (intersection / union of the non-zero columns):
   *
   * rows 0 and 1: 3 / 6 = 0.5
   * rows 0 and 2: 2 / 5 = 0.4
   * rows 1 and 2: 2 / 6 = 0.333
   *
   * expected output (only the largest entry of each row is retained):
   *
   * 0,   0.5, 0
   * 0.5, 0,   0
   * 0.4, 0,   0
   * </pre>
   */
  @Test
  public void toyIntegrationMaxSimilaritiesPerRow() throws Exception {
    Matrix similarityMatrix = runRowSimilarityJob(new double[][] {
        new double[] { 1, 0, 1, 1, 0, 1 },
        new double[] { 0, 1, 1, 1, 1, 1 },
        new double[] { 1, 1, 0, 1, 0, 0 } },
        6,
        "--maxSimilaritiesPerRow", String.valueOf(1), "--excludeSelfSimilarity", String.valueOf(true));

    assertSimilarities(new double[][] {
        new double[] { 0.0, 0.5, 0.0 },
        new double[] { 0.5, 0.0, 0.0 },
        new double[] { 0.4, 0.0, 0.0 } },
        similarityMatrix);
  }

  /**
   * integration test with a tiny data set, using a similarity threshold of 0.5 and
   * excluding self similarity
   *
   * <pre>
   *
   * input matrix:
   *
   * 1, 0, 1, 1, 0, 1
   * 0, 1, 1, 1, 1, 1
   * 1, 1, 0, 1, 0, 0
   *
   * pairwise tanimoto coefficients (intersection / union of the non-zero columns):
   *
   * rows 0 and 1: 3 / 6 = 0.5
   * rows 0 and 2: 2 / 5 = 0.4
   * rows 1 and 2: 2 / 6 = 0.333
   *
   * expected output (only the 0.5 pair survives the threshold):
   *
   * 0,   0.5, 0
   * 0.5, 0,   0
   * 0,   0,   0
   * </pre>
   */
  @Test
  public void toyIntegrationWithThreshold() throws Exception {
    Matrix similarityMatrix = runRowSimilarityJob(new double[][] {
        new double[] { 1, 0, 1, 1, 0, 1 },
        new double[] { 0, 1, 1, 1, 1, 1 },
        new double[] { 1, 1, 0, 1, 0, 0 } },
        6,
        "--excludeSelfSimilarity", String.valueOf(true), "--threshold", String.valueOf(0.5));

    assertSimilarities(new double[][] {
        new double[] { 0.0, 0.5, 0.0 },
        new double[] { 0.5, 0.0, 0.0 },
        new double[] { 0.0, 0.0, 0.0 } },
        similarityMatrix);
  }

  /**
   * Writes {@code rows} as the job input, runs {@link RowSimilarityJob} with the Tanimoto coefficient and
   * reads back the first reducer output as a square matrix with one row and one column per input row.
   *
   * @param rows            dense input matrix, one array per row
   * @param numberOfColumns value passed to the job as {@code --numberOfColumns}
   * @param extraArgs       additional command line arguments, inserted between the common options and
   *                        {@code --tempDir}
   * @return the similarity matrix read from {@code part-r-00000} in the job's output directory
   * @throws Exception if writing the input, running the job or reading the output fails
   */
  private Matrix runRowSimilarityJob(double[][] rows, int numberOfColumns, String... extraArgs)
    throws Exception {
    File inputFile = getTestTempFile("rows");
    File outputDir = getTestTempDir("output");
    // only the path is needed; presumably the job insists on creating its output directory itself
    outputDir.delete();
    File tmpDir = getTestTempDir("tmp");

    Configuration conf = new Configuration();
    Path inputPath = new Path(inputFile.getAbsolutePath());
    FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
    MathHelper.writeDistributedRowMatrix(rows, fs, conf, inputPath);

    String[] commonArgs = { "--input", inputFile.getAbsolutePath(), "--output", outputDir.getAbsolutePath(),
        "--numberOfColumns", String.valueOf(numberOfColumns),
        "--similarityClassname", TanimotoCoefficientSimilarity.class.getName() };
    String[] tempDirArgs = { "--tempDir", tmpDir.getAbsolutePath() };

    // common options first, then the test specific ones, then --tempDir
    String[] args = new String[commonArgs.length + extraArgs.length + tempDirArgs.length];
    System.arraycopy(commonArgs, 0, args, 0, commonArgs.length);
    System.arraycopy(extraArgs, 0, args, commonArgs.length, extraArgs.length);
    System.arraycopy(tempDirArgs, 0, args, commonArgs.length + extraArgs.length, tempDirArgs.length);

    RowSimilarityJob rowSimilarityJob = new RowSimilarityJob();
    rowSimilarityJob.setConf(conf);
    // fail right here if the job reports an error instead of failing later while reading the output;
    // assumes the usual Hadoop Tool convention that 0 means success
    assertEquals("RowSimilarityJob returned a non-zero exit code", 0, rowSimilarityJob.run(args));

    return MathHelper.readMatrix(conf, new Path(outputDir.getAbsolutePath(), "part-r-00000"),
        rows.length, rows.length);
  }

  /**
   * Asserts that {@code actual} has the dimensions of {@code expected} and that every cell matches within
   * {@code EPSILON}.
   *
   * @param expected expected similarities, indexed as {@code [row][column]}; must have at least one row
   * @param actual   matrix produced by the job
   */
  private void assertSimilarities(double[][] expected, Matrix actual) {
    assertNotNull(actual);
    assertEquals(expected[0].length, actual.numCols());
    assertEquals(expected.length, actual.numRows());
    for (int row = 0; row < expected.length; row++) {
      for (int col = 0; col < expected[row].length; col++) {
        assertEquals("similarity at (" + row + ", " + col + ')',
            expected[row][col], actual.get(row, col), EPSILON);
      }
    }
  }
}