/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.math.hadoop.decomposer;

import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.apache.mahout.math.hadoop.TestDistributedRowMatrix;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collection;
import java.util.Arrays;

/**
 * Exercises {@link DistributedLanczosSolver} through its command-line job entry point,
 * once writing raw eigenvectors and once with the {@code --cleansvd} option enabled.
 */
public final class TestDistributedLanczosSolverCLI extends MahoutTestCase {

  private static final Logger log = LoggerFactory.getLogger(TestDistributedLanczosSolverCLI.class);

  /**
   * Runs the solver job at rank 6 and then at rank 7 against the same input and the same
   * working directory, and checks that the second run's raw output holds 7 eigenvectors.
   */
  @Test
  public void testDistributedLanczosSolverCLI() throws Exception {
    Path testData = getTestTempDirPath("testdata");
    DistributedRowMatrix corpus = new TestDistributedRowMatrix()
        .randomDenseHierarchicalDistributedMatrix(10, 9, false, testData.toString());
    corpus.setConf(new Configuration());

    // First pass: rank 6.
    Path firstOutput = getTestTempDirPath("output");
    Path firstTmp = getTestTempDirPath("tmp");
    Path workingDir = getTestTempDirPath("working");
    runSolver(solverArgs(testData, firstOutput, firstTmp, "6", "--workingDir", workingDir.toString()));

    // Second pass: rank 7, fresh output/temp directories but the same working directory.
    Path secondOutput = getTestTempDirPath("output2");
    Path secondTmp = getTestTempDirPath("tmp2");
    runSolver(solverArgs(testData, secondOutput, secondTmp, "7", "--workingDir", workingDir.toString()));

    Path rawEigenvectors = new Path(secondOutput, DistributedLanczosSolver.RAW_EIGENVECTORS);
    Matrix eigenVectors = new DenseMatrix(7, corpus.numCols());
    Configuration conf = new Configuration();

    int vectorsRead = 0;
    for (VectorWritable writable : new SequenceFileValueIterable<VectorWritable>(rawEigenvectors, conf)) {
      eigenVectors.assignRow(vectorsRead, writable.get());
      vectorsRead++;
    }
    assertEquals("number of eigenvectors", 7, vectorsRead);
  }

  /**
   * Runs the solver job with {@code --cleansvd true} at rank 6 and at rank 7 (separate output
   * directories), then compares the two sets of clean eigenvectors and their eigenvalues.
   */
  @Test
  public void testDistributedLanczosSolverEVJCLI() throws Exception {
    Path testData = getTestTempDirPath("testdata");
    DistributedRowMatrix corpus = new TestDistributedRowMatrix()
        .randomDenseHierarchicalDistributedMatrix(10, 9, false, testData.toString());
    corpus.setConf(new Configuration());

    // Rank 6 run.
    Path firstOutput = getTestTempDirPath("output");
    Path firstTmp = getTestTempDirPath("tmp");
    runSolver(solverArgs(testData, firstOutput, firstTmp, "6", "--cleansvd", "true"));
    Path cleanEigenvectors = new Path(firstOutput, EigenVerificationJob.CLEAN_EIGENVECTORS);
    Matrix eigenVectors = new DenseMatrix(6, corpus.numCols());
    Collection<Double> eigenvalues = Lists.newArrayList();

    // Rank 7 run.
    Path secondOutput = getTestTempDirPath("output2");
    Path secondTmp = getTestTempDirPath("tmp2");
    runSolver(solverArgs(testData, secondOutput, secondTmp, "7", "--cleansvd", "true"));
    Path cleanEigenvectors2 = new Path(secondOutput, EigenVerificationJob.CLEAN_EIGENVECTORS);
    Matrix eigenVectors2 = new DenseMatrix(7, corpus.numCols());
    Configuration conf = new Configuration();
    Collection<Double> newEigenValues = Lists.newArrayList();

    // Load the rank-6 clean eigenvectors; only eigenvalues whose cos-angle error (as parsed
    // from the vector name) is below 1.0e-3 are remembered for the comparison further down.
    int count = 0;
    for (VectorWritable writable : new SequenceFileValueIterable<VectorWritable>(cleanEigenvectors, conf)) {
      NamedVector named = (NamedVector) writable.get();
      eigenVectors.assignRow(count, named);
      String name = named.getName();
      log.info(name);
      if (EigenVector.getCosAngleError(name) < 1.0e-3) {
        eigenvalues.add(EigenVector.getEigenValue(name));
      }
      count++;
    }
    assertEquals("number of clean eigenvectors", 3, count);

    // Load the rank-7 clean eigenvectors, keeping every eigenvalue.
    count = 0;
    for (VectorWritable writable : new SequenceFileValueIterable<VectorWritable>(cleanEigenvectors2, conf)) {
      NamedVector named = (NamedVector) writable.get();
      String name = named.getName();
      log.info(name);
      eigenVectors2.assignRow(count, named);
      newEigenValues.add(EigenVector.getEigenValue(name));
      count++;
    }

    // Record each rank-6 row that has some rank-7 row with dot product above 0.9.
    Collection<Integer> oldEigensFound = Lists.newArrayList();
    for (int oldRow = 0; oldRow < eigenVectors.numRows(); oldRow++) {
      Vector oldEigen = eigenVectors.viewRow(oldRow);
      if (oldEigen == null) {
        break;
      }
      for (int newRow = 0; newRow < eigenVectors2.numRows(); newRow++) {
        Vector newEigen = eigenVectors2.viewRow(newRow);
        if (newEigen != null && oldEigen.dot(newEigen) > 0.9) {
          oldEigensFound.add(oldRow);
          break;
        }
      }
    }
    assertEquals("the number of new eigenvectors", 5, count);

    // Every remembered rank-6 eigenvalue should have a rank-7 eigenvalue within 10% of it.
    Collection<Double> oldEigenValuesNotFound = Lists.newArrayList();
    for (double oldValue : eigenvalues) {
      if (!hasValueWithinTenPercent(oldValue, newEigenValues)) {
        oldEigenValuesNotFound.add(oldValue);
      }
    }
    assertEquals("number of old eigenvalues not found: "
            + Arrays.toString(oldEigenValuesNotFound.toArray(new Double[oldEigenValuesNotFound.size()])),
        0, oldEigenValuesNotFound.size());
    assertEquals("did not find enough old eigenvectors", 3, oldEigensFound.size());
  }

  /**
   * Builds the solver command line shared by both tests: a 10x9 non-symmetric input read from
   * the {@code distMatrix} child of {@code testData}, followed by one trailing option/value pair.
   */
  private static String[] solverArgs(Path testData,
                                     Path output,
                                     Path tmp,
                                     String rank,
                                     String extraOption,
                                     String extraValue) {
    return new String[] {
        "-i", new Path(testData, "distMatrix").toString(),
        "-o", output.toString(),
        "--tempDir", tmp.toString(),
        "--numRows", "10",
        "--numCols", "9",
        "--rank", rank,
        "--symmetric", "false",
        extraOption, extraValue
    };
  }

  /** Launches a fresh solver job with the given command-line arguments. */
  private static void runSolver(String[] args) throws Exception {
    new DistributedLanczosSolver().new DistributedLanczosSolverJob().run(args);
  }

  /** Returns true when some candidate differs from {@code target} by less than 10% of {@code target}. */
  private static boolean hasValueWithinTenPercent(double target, Collection<Double> candidates) {
    for (double candidate : candidates) {
      if (Math.abs((target - candidate) / target) < 0.1) {
        return true;
      }
    }
    return false;
  }
}