/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.hadoop;
import com.google.common.base.Function;
import com.google.common.collect.Iterators;
import com.google.common.collect.Maps;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.MatrixSlice;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorIterable;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.decomposer.SolverTest;
import org.junit.Test;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
public final class TestDistributedRowMatrix extends MahoutTestCase {
public static final String TEST_PROPERTY_KEY = "test.property.key";
public static final String TEST_PROPERTY_VALUE = "test.property.value";
/**
 * Asserts that two row-iterable matrices contain the same rows, within a tolerance.
 *
 * <p>Both inputs are fully drained into maps keyed by row index, so the comparison does not
 * depend on the order in which either side yields its rows, nor on both sides yielding the
 * same number of rows. A row that is missing (or null) on one side is accepted only if the
 * row on the other side is missing, null, or has a zero 2-norm.
 *
 * @param m the first matrix
 * @param mtt the second matrix
 * @param errorTolerance strict upper bound on the squared distance between matching rows
 */
private static void assertEquals(VectorIterable m, VectorIterable mtt, double errorTolerance) {
  Map<Integer, Vector> mMap = collectRows(m.iterateAll());
  Map<Integer, Vector> mttMap = collectRows(mtt.iterateAll());
  for (Map.Entry<Integer, Vector> entry : mMap.entrySet()) {
    Integer key = entry.getKey();
    assertRowsMatch(key, entry.getValue(), mttMap.get(key), errorTolerance);
  }
  // Rows that only exist in mtt were not visited by the loop above; they must be empty.
  for (Map.Entry<Integer, Vector> entry : mttMap.entrySet()) {
    Integer key = entry.getKey();
    if (!mMap.containsKey(key)) {
      assertRowsMatch(key, null, entry.getValue(), errorTolerance);
    }
  }
}

/** Drains the iterator into a map from row index to row vector. */
private static Map<Integer, Vector> collectRows(Iterator<MatrixSlice> slices) {
  Map<Integer, Vector> rows = Maps.newHashMap();
  while (slices.hasNext()) {
    MatrixSlice slice = slices.next();
    rows.put(slice.index(), slice.vector());
  }
  return rows;
}

/** Compares one row from each side; a null row only matches a null or zero-norm row. */
private static void assertRowsMatch(Integer rowIndex, Vector left, Vector right, double errorTolerance) {
  if (left == null || right == null) {
    assertTrue("row " + rowIndex + " is absent on one side but non-zero on the first",
        left == null || left.norm(2) == 0);
    assertTrue("row " + rowIndex + " is absent on one side but non-zero on the second",
        right == null || right.norm(2) == 0);
  } else {
    assertTrue("row " + rowIndex + " differs by more than the tolerance",
        left.getDistanceSquared(right) < errorTolerance);
  }
}
/**
 * Transposes a random distributed matrix twice and checks that the result matches the
 * original matrix within EPSILON.
 */
@Test
public void testTranspose() throws Exception {
  DistributedRowMatrix m = randomDistributedMatrix(10, 9, 5, 4, 1.0, false);
  DistributedRowMatrix mt = m.transpose();
  Path tmpPath = getTestTempDirPath();
  m.setOutputTempPathString(tmpPath.toString());
  // The child must be relative: a child starting with '/' is an absolute path and would
  // resolve to the filesystem root instead of a directory underneath tmpPath.
  mt.setOutputTempPathString(new Path(tmpPath, "tmpOutTranspose").toString());
  DistributedRowMatrix mtt = mt.transpose();
  assertEquals(m, mtt, EPSILON);
}
/**
 * Multiplies an in-memory matrix and a distributed matrix by the same all-ones vector and
 * checks that the squared distance between the two products is zero within EPSILON.
 *
 * <p>NOTE(review): both matrices are generated independently with identical parameters, so
 * this presumably relies on the random generator being deterministic under test - confirm.
 */
@Test
public void testMatrixTimesVector() throws Exception {
  Vector ones = new RandomAccessSparseVector(50);
  ones.assign(1.0);
  Matrix inMemory = SolverTest.randomSequentialAccessSparseMatrix(100, 90, 50, 20, 1.0);
  DistributedRowMatrix distributed = randomDistributedMatrix(100, 90, 50, 20, 1.0, false);
  double squaredError = inMemory.times(ones).getDistanceSquared(distributed.times(ones));
  assertEquals(0.0, squaredError, EPSILON);
}
/**
 * Applies timesSquared with an all-ones vector to an in-memory matrix and to a distributed
 * matrix, and checks that the squared distance between the results is zero within 1.0e-9.
 *
 * <p>NOTE(review): both matrices are generated independently with identical parameters, so
 * this presumably relies on the random generator being deterministic under test - confirm.
 */
@Test
public void testMatrixTimesSquaredVector() throws Exception {
  Vector ones = new RandomAccessSparseVector(50);
  ones.assign(1.0);
  Matrix inMemory = SolverTest.randomSequentialAccessSparseMatrix(100, 90, 50, 20, 1.0);
  DistributedRowMatrix distributed = randomDistributedMatrix(100, 90, 50, 20, 1.0, false);
  double squaredError =
      inMemory.timesSquared(ones).getDistanceSquared(distributed.timesSquared(ones));
  assertEquals(0.0, squaredError, 1.0e-9);
}
/**
 * Checks that the distributed product distA.times(distB) matches the in-memory product
 * transpose(A).times(B) within EPSILON, where the distributed matrices are generated with the
 * same parameters as their in-memory counterparts.
 */
@Test
public void testMatrixTimesMatrix() throws Exception {
  Matrix localA = SolverTest.randomSequentialAccessSparseMatrix(20, 19, 15, 5, 10.0);
  Matrix localB = SolverTest.randomSequentialAccessSparseMatrix(20, 13, 25, 10, 5.0);
  Matrix localProduct = localA.transpose().times(localB);

  DistributedRowMatrix remoteA = randomDistributedMatrix(20, 19, 15, 5, 10.0, false, "distA");
  DistributedRowMatrix remoteB = randomDistributedMatrix(20, 13, 25, 10, 5.0, false, "distB");
  DistributedRowMatrix remoteProduct = remoteA.times(remoteB);

  assertEquals(localProduct, remoteProduct, EPSILON);
}
/**
 * Checks that the matrix-multiplication job configuration carries the test property only
 * when it is built from an initial configuration that already contains it.
 */
@Test
public void testMatrixMultiplactionJobConfBuilder() throws Exception {
  Configuration seedConf = createInitialConf();
  Path root = getTestTempDirPath("testpaths");
  Path left = new Path(root, "a");
  Path right = new Path(root, "b");
  Path output = new Path(root, "out");

  // Built without a seed configuration: the property must be absent.
  Configuration plainConf =
      MatrixMultiplicationJob.createMatrixMultiplyJobConf(left, right, output, 10);
  // Built on top of the seed configuration: the property must be carried over.
  Configuration seededConf =
      MatrixMultiplicationJob.createMatrixMultiplyJobConf(seedConf, left, right, output, 10);

  assertNull(plainConf.get(TEST_PROPERTY_KEY));
  assertEquals(TEST_PROPERTY_VALUE, seededConf.get(TEST_PROPERTY_KEY));
}
/**
 * Checks that the transpose job configuration carries the test property only when it is
 * built from an initial configuration that already contains it.
 */
@Test
public void testTransposeJobConfBuilder() throws Exception {
  Configuration seedConf = createInitialConf();
  Path root = getTestTempDirPath("testpaths");
  Path input = new Path(root, "input");
  Path output = new Path(root, "output");

  Configuration plainConf = TransposeJob.buildTransposeJobConf(input, output, 10);
  Configuration seededConf = TransposeJob.buildTransposeJobConf(seedConf, input, output, 10);

  assertNull(plainConf.get(TEST_PROPERTY_KEY));
  assertEquals(TEST_PROPERTY_VALUE, seededConf.get(TEST_PROPERTY_KEY));
}
/**
 * Exercises each TimesSquaredJob configuration factory in two flavours, without and with an
 * initial configuration, and checks that the test property appears only in the latter.
 */
@Test
public void testTimesSquaredJobConfBuilders() throws Exception {
  Configuration seedConf = createInitialConf();
  Path root = getTestTempDirPath("testpaths");
  Path input = new Path(root, "input");
  Path output = new Path(root, "output");
  Vector ones = new RandomAccessSparseVector(50);
  ones.assign(1.0);

  // createTimesSquaredJobConf(vector, input, output)
  Configuration plainSquared = TimesSquaredJob.createTimesSquaredJobConf(ones, input, output);
  Configuration seededSquared =
      TimesSquaredJob.createTimesSquaredJobConf(seedConf, ones, input, output);
  assertNull(plainSquared.get(TEST_PROPERTY_KEY));
  assertEquals(TEST_PROPERTY_VALUE, seededSquared.get(TEST_PROPERTY_KEY));

  // createTimesJobConf(vector, 50, input, output)
  Configuration plainTimes = TimesSquaredJob.createTimesJobConf(ones, 50, input, output);
  Configuration seededTimes =
      TimesSquaredJob.createTimesJobConf(seedConf, ones, 50, input, output);
  assertNull(plainTimes.get(TEST_PROPERTY_KEY));
  assertEquals(TEST_PROPERTY_VALUE, seededTimes.get(TEST_PROPERTY_KEY));

  // createTimesSquaredJobConf(vector, input, output, mapper, reducer)
  Configuration plainWithClasses = TimesSquaredJob.createTimesSquaredJobConf(
      ones,
      input,
      output,
      TimesSquaredJob.TimesSquaredMapper.class,
      TimesSquaredJob.VectorSummingReducer.class);
  Configuration seededWithClasses = TimesSquaredJob.createTimesSquaredJobConf(
      seedConf,
      ones,
      input,
      output,
      TimesSquaredJob.TimesSquaredMapper.class,
      TimesSquaredJob.VectorSummingReducer.class);
  assertNull(plainWithClasses.get(TEST_PROPERTY_KEY));
  assertEquals(TEST_PROPERTY_VALUE, seededWithClasses.get(TEST_PROPERTY_KEY));

  // createTimesSquaredJobConf(vector, 50, input, output, mapper, reducer)
  Configuration plainWithSizeAndClasses = TimesSquaredJob.createTimesSquaredJobConf(
      ones,
      50,
      input,
      output,
      TimesSquaredJob.TimesSquaredMapper.class,
      TimesSquaredJob.VectorSummingReducer.class);
  Configuration seededWithSizeAndClasses = TimesSquaredJob.createTimesSquaredJobConf(
      seedConf,
      ones,
      50,
      input,
      output,
      TimesSquaredJob.TimesSquaredMapper.class,
      TimesSquaredJob.VectorSummingReducer.class);
  assertNull(plainWithSizeAndClasses.get(TEST_PROPERTY_KEY));
  assertEquals(TEST_PROPERTY_VALUE, seededWithSizeAndClasses.get(TEST_PROPERTY_KEY));
}
/**
 * Checks temp-file handling of times(): by default nothing is left under the matrix's output
 * temp path, while with KEEP_TEMP_FILES set exactly one entry remains, holding one listed
 * entry each under the input-vector and output-vector paths. Both runs must produce the same
 * vector within EPSILON.
 */
@Test
public void testTimesVectorTempDirDeletion() throws Exception {
  Configuration configuration = new Configuration();
  Vector ones = new RandomAccessSparseVector(50);
  ones.assign(1.0);
  DistributedRowMatrix matrix = randomDistributedMatrix(100, 90, 50, 20, 1.0, false);
  Path tempRoot = matrix.getOutputTempPath();
  FileSystem fileSystem = tempRoot.getFileSystem(configuration);

  // Start from an empty temp directory.
  deleteContentsOfPath(configuration, tempRoot);
  assertEquals(0, fileSystem.listStatus(tempRoot).length);

  // Default behaviour: temp files are gone once the product has been computed.
  Vector productWithoutKeep = matrix.times(ones);
  assertEquals(0, fileSystem.listStatus(tempRoot).length);

  deleteContentsOfPath(configuration, tempRoot);
  assertEquals(0, fileSystem.listStatus(tempRoot).length);

  // With KEEP_TEMP_FILES enabled the temp files must still be there afterwards.
  configuration.setBoolean(DistributedRowMatrix.KEEP_TEMP_FILES, true);
  matrix.setConf(configuration);
  Vector productWithKeep = matrix.times(ones);

  FileStatus[] keptEntries = fileSystem.listStatus(tempRoot);
  assertEquals(1, keptEntries.length);
  Path keptDir = keptEntries[0].getPath();
  Path keptInputVector = new Path(keptDir, TimesSquaredJob.INPUT_VECTOR);
  Path keptOutputVector = new Path(keptDir, TimesSquaredJob.OUTPUT_VECTOR_FILENAME);
  assertEquals(1, fileSystem.listStatus(keptInputVector, PathFilters.logsCRCFilter()).length);
  assertEquals(1, fileSystem.listStatus(keptOutputVector, PathFilters.logsCRCFilter()).length);

  assertEquals(0.0, productWithoutKeep.getDistanceSquared(productWithKeep), EPSILON);
}
/**
 * Checks temp-file handling of timesSquared(): by default nothing is left under the matrix's
 * output temp path, while with KEEP_TEMP_FILES set exactly one entry remains, holding one
 * listed entry each under the input-vector and output-vector paths. Both runs must produce
 * the same vector within EPSILON.
 */
@Test
public void testTimesSquaredVectorTempDirDeletion() throws Exception {
  Configuration configuration = new Configuration();
  Vector ones = new RandomAccessSparseVector(50);
  ones.assign(1.0);
  DistributedRowMatrix matrix = randomDistributedMatrix(100, 90, 50, 20, 1.0, false);
  Path tempRoot = matrix.getOutputTempPath();
  FileSystem fileSystem = tempRoot.getFileSystem(configuration);

  // Start from an empty temp directory.
  deleteContentsOfPath(configuration, tempRoot);
  assertEquals(0, fileSystem.listStatus(tempRoot).length);

  // Default behaviour: temp files are gone once the result has been computed.
  Vector resultWithoutKeep = matrix.timesSquared(ones);
  assertEquals(0, fileSystem.listStatus(tempRoot).length);

  deleteContentsOfPath(configuration, tempRoot);
  assertEquals(0, fileSystem.listStatus(tempRoot).length);

  // With KEEP_TEMP_FILES enabled the temp files must still be there afterwards.
  configuration.setBoolean(DistributedRowMatrix.KEEP_TEMP_FILES, true);
  matrix.setConf(configuration);
  Vector resultWithKeep = matrix.timesSquared(ones);

  FileStatus[] keptEntries = fileSystem.listStatus(tempRoot);
  assertEquals(1, keptEntries.length);
  Path keptDir = keptEntries[0].getPath();
  Path keptInputVector = new Path(keptDir, TimesSquaredJob.INPUT_VECTOR);
  Path keptOutputVector = new Path(keptDir, TimesSquaredJob.OUTPUT_VECTOR_FILENAME);
  assertEquals(1, fileSystem.listStatus(keptInputVector, PathFilters.logsCRCFilter()).length);
  assertEquals(1, fileSystem.listStatus(keptOutputVector, PathFilters.logsCRCFilter()).length);

  assertEquals(0.0, resultWithoutKeep.getDistanceSquared(resultWithKeep), EPSILON);
}
/**
 * Builds a fresh Configuration with TEST_PROPERTY_KEY set to TEST_PROPERTY_VALUE.
 *
 * @return the new configuration carrying the marker property
 */
public static Configuration createInitialConf() {
  Configuration seeded = new Configuration();
  seeded.set(TEST_PROPERTY_KEY, TEST_PROPERTY_VALUE);
  return seeded;
}
/**
 * Recursively deletes every entry directly under the given path, leaving the path itself in
 * place. Does nothing when the path does not exist.
 *
 * @param conf configuration used to resolve the path's FileSystem
 * @param path directory whose children are removed
 * @throws Exception if listing or deleting fails
 */
private static void deleteContentsOfPath(Configuration conf, Path path) throws Exception {
  FileSystem fs = path.getFileSystem(conf);
  // Depending on the Hadoop version, listStatus on a missing path either throws or returns
  // null; a missing directory simply has no contents to delete.
  if (!fs.exists(path)) {
    return;
  }
  FileStatus[] statuses = fs.listStatus(path);
  if (statuses == null) {
    return;
  }
  for (FileStatus status : statuses) {
    fs.delete(status.getPath(), true);
  }
}
/**
 * Convenience overload that forwards every argument unchanged to the seven-argument
 * randomDistributedMatrix, using "testdata" as the temp directory suffix.
 *
 * @return the distributed matrix produced by the seven-argument overload
 * @throws IOException if the delegate fails
 */
public DistributedRowMatrix randomDistributedMatrix(int numRows,
                                                    int nonNullRows,
                                                    int numCols,
                                                    int entriesPerRow,
                                                    double entryMean,
                                                    boolean isSymmetric) throws IOException {
  String defaultDirSuffix = "testdata";
  return randomDistributedMatrix(
      numRows, nonNullRows, numCols, entriesPerRow, entryMean, isSymmetric, defaultDirSuffix);
}
/**
 * Generates a matrix with SolverTest.randomHierarchicalMatrix and stores it as a
 * DistributedRowMatrix under the test temp directory obtained for the given suffix.
 *
 * @param numRows forwarded to the generator
 * @param numCols forwarded to the generator
 * @param isSymmetric forwarded to the generator
 * @param baseTmpDirSuffix suffix passed to getTestTempDirPath
 * @return the distributed matrix returned by saveToFs
 * @throws IOException if the temp directory or the matrix file cannot be written
 */
public DistributedRowMatrix randomDenseHierarchicalDistributedMatrix(int numRows,
                                                                     int numCols,
                                                                     boolean isSymmetric,
                                                                     String baseTmpDirSuffix)
  throws IOException {
  Path targetDir = getTestTempDirPath(baseTmpDirSuffix);
  Matrix generated = SolverTest.randomHierarchicalMatrix(numRows, numCols, isSymmetric);
  return saveToFs(generated, targetDir);
}
/**
 * Generates a matrix with SolverTest.randomSequentialAccessSparseMatrix and stores it as a
 * DistributedRowMatrix under the test temp directory obtained for the given suffix. When
 * isSymmetric is true, the generated matrix c is replaced by c.times(c.transpose()) before
 * it is stored.
 *
 * @param numRows forwarded to the generator
 * @param nonNullRows forwarded to the generator
 * @param numCols forwarded to the generator
 * @param entriesPerRow forwarded to the generator
 * @param entryMean forwarded to the generator
 * @param isSymmetric whether to store c times its transpose instead of c
 * @param baseTmpDirSuffix suffix passed to getTestTempDirPath
 * @return the distributed matrix returned by saveToFs
 * @throws IOException if the temp directory or the matrix file cannot be written
 */
public DistributedRowMatrix randomDistributedMatrix(int numRows,
                                                    int nonNullRows,
                                                    int numCols,
                                                    int entriesPerRow,
                                                    double entryMean,
                                                    boolean isSymmetric,
                                                    String baseTmpDirSuffix) throws IOException {
  Path targetDir = getTestTempDirPath(baseTmpDirSuffix);
  Matrix generated =
      SolverTest.randomSequentialAccessSparseMatrix(numRows, nonNullRows, numCols, entriesPerRow, entryMean);
  Matrix toStore = isSymmetric ? generated.times(generated.transpose()) : generated;
  return saveToFs(toStore, targetDir);
}
/**
 * Writes the rows of an in-memory matrix to "distMatrix/part-00000" under the given base
 * directory and returns a DistributedRowMatrix that reads from "distMatrix" and uses
 * "tmpOut" (also under the base directory) as its output temp path.
 *
 * @param m the matrix whose rows are written, each wrapped in a VectorWritable
 * @param baseTmpDirPath directory under which the matrix file and temp path live
 * @return a DistributedRowMatrix with the dimensions of m and its own copy of the configuration
 * @throws IOException if the matrix file cannot be written
 */
private static DistributedRowMatrix saveToFs(final Matrix m, Path baseTmpDirPath) throws IOException {
  Configuration conf = new Configuration();
  // Resolve the FileSystem from the target path rather than taking the default one, so the
  // rows are written to whichever file system actually owns baseTmpDirPath.
  FileSystem fs = baseTmpDirPath.getFileSystem(conf);
  Path matrixDir = new Path(baseTmpDirPath, "distMatrix");

  // Lazily adapts the matrix's row slices to the VectorWritable values the writer expects.
  Iterable<VectorWritable> rows = new Iterable<VectorWritable>() {
    @Override
    public Iterator<VectorWritable> iterator() {
      return Iterators.transform(m.iterator(), new Function<MatrixSlice,VectorWritable>() {
        @Override
        public VectorWritable apply(MatrixSlice input) {
          return new VectorWritable(input.vector());
        }
      });
    }
  };
  // NOTE(review): the meaning of the boolean flag is defined by ClusteringTestUtils and is
  // not visible here - presumably it selects the key type; confirm.
  ClusteringTestUtils.writePointsToFile(rows, true, new Path(matrixDir, "part-00000"), fs, conf);

  DistributedRowMatrix distMatrix = new DistributedRowMatrix(matrixDir,
                                                             new Path(baseTmpDirPath, "tmpOut"),
                                                             m.numRows(),
                                                             m.numCols());
  // Hand the matrix its own copy so later changes to one configuration do not leak into the other.
  distMatrix.setConf(new Configuration(conf));
  return distMatrix;
}
}