/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.utils.vectors; import java.util.Random; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.mahout.common.MahoutTestCase; import org.apache.mahout.common.RandomUtils; import org.apache.mahout.math.SequentialAccessSparseVector; import org.apache.mahout.math.Vector; import org.junit.Before; import org.junit.Test; public final class VectorHelperTest extends MahoutTestCase { private static final int NUM_DOCS = 100; private Path inputPathOne; private Path inputPathTwo; private Configuration conf; @Override @Before public void setUp() throws Exception { super.setUp(); conf = getConfiguration(); inputPathOne = getTestTempFilePath("documents/docs-one.file"); FileSystem fs = FileSystem.get(inputPathOne.toUri(), conf); try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPathOne, Text.class, IntWritable.class)) { Random rd = RandomUtils.getRandom(); for (int i = 0; i < NUM_DOCS; i++) { // Make all indices higher than dictionary size writer.append(new Text("Document::ID::" + i), new IntWritable(NUM_DOCS + rd.nextInt(NUM_DOCS))); } } inputPathTwo = getTestTempFilePath("documents/docs-two.file"); fs = FileSystem.get(inputPathTwo.toUri(), conf); try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPathTwo, Text.class, IntWritable.class)) { Random rd = RandomUtils.getRandom(); for (int i = 0; i < NUM_DOCS; i++) { // Keep indices within number of documents writer.append(new Text("Document::ID::" + i), new IntWritable(rd.nextInt(NUM_DOCS))); } } } @Test public void testJsonFormatting() throws Exception { Vector v = new SequentialAccessSparseVector(10); v.set(2, 3.1); v.set(4, 1.0); v.set(6, 8.1); v.set(7, -100); v.set(9, 12.2); String UNUSED = "UNUSED"; String[] dictionary = { UNUSED, UNUSED, "two", UNUSED, "four", UNUSED, "six", "seven", UNUSED, "nine" }; assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1}", VectorHelper.vectorToJson(v, dictionary, 3, true)); assertEquals("unsorted form incorrect: ", "{two:3.1,four:1.0}", VectorHelper.vectorToJson(v, dictionary, 2, false)); assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1,four:1.0}", VectorHelper.vectorToJson(v, dictionary, 4, true)); assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1,two:3.1,four:1.0,seven:-100.0}", VectorHelper.vectorToJson(v, dictionary, 5, true)); assertEquals("sorted json form incorrect: ", "{nine:12.2,six:8.1}", VectorHelper.vectorToJson(v, dictionary, 2, true)); assertEquals("unsorted form incorrect: ", "{two:3.1,four:1.0}", VectorHelper.vectorToJson(v, dictionary, 2, false)); } @Test public void testTopEntries() throws Exception { Vector v = new SequentialAccessSparseVector(10); v.set(2, 3.1); v.set(4, 1.0); v.set(6, 8.1); v.set(7, -100); v.set(9, 12.2); v.set(1, 0.0); v.set(3, 0.0); v.set(8, 2.7); // check if sizeOFNonZeroElementsInVector = maxEntries assertEquals(6, VectorHelper.topEntries(v, 6).size()); // check if sizeOfNonZeroElementsInVector < maxEntries assertTrue(VectorHelper.topEntries(v, 9).size() < 9); // check if sizeOfNonZeroElementsInVector > maxEntries assertTrue(VectorHelper.topEntries(v, 5).size() < v.getNumNonZeroElements()); } @Test public void testTopEntriesWhenAllZeros() throws Exception { Vector v = new SequentialAccessSparseVector(10); v.set(2, 0.0); v.set(4, 0.0); v.set(6, 0.0); v.set(7, 0); v.set(9, 0.0); v.set(1, 0.0); v.set(3, 0.0); v.set(8, 0.0); assertEquals(0, VectorHelper.topEntries(v, 6).size()); } @Test public void testLoadTermDictionary() throws Exception { // With indices higher than dictionary size VectorHelper.loadTermDictionary(conf, inputPathOne.toString()); // With dictionary size higher than indices VectorHelper.loadTermDictionary(conf, inputPathTwo.toString()); } }