/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package hivemall.knn.similarity;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.junit.Assert;
import org.junit.Test;
/**
 * Unit tests for {@code CosineSimilarityUDF.cosineSimilarity}.
 *
 * <p>Feature vectors are passed as lists of strings, written either as bare feature names
 * (for example {@code "a"}) or as {@code "name:value"} pairs (for example {@code "a:1.0"}).
 * The expected values below are consistent with a bare name carrying an implicit weight of 1.
 */
public class CosineSimilarityUDFTest {

    /** Delta for assertions whose result must match the expected value exactly. */
    private static final double EXACT = 0.0;

    /** Delta for assertions whose result is only compared approximately. */
    private static final float EPSILON = 0.00001f;

    /**
     * Checks the similarity of identical, disjoint, partially overlapping and opposite
     * vectors, for both bare-name and {@code name:value} feature notation.
     *
     * @throws IOException declared so that the call to the method under test compiles
     *         regardless of whether it declares this checked exception
     */
    @Test
    public void testEvaluate() throws IOException {
        // A vector mixing weighted and unweighted features compared with itself.
        List<String> mixed = Arrays.asList("bbb:1.4", "aaa:0.9", "ccc");
        Assert.assertEquals(1.f, CosineSimilarityUDF.cosineSimilarity(mixed, mixed), EXACT);

        // No feature in common.
        Assert.assertEquals(0.f,
            CosineSimilarityUDF.cosineSimilarity(Arrays.asList("a", "b", "c"),
                Arrays.asList("d", "e")),
            EXACT);

        // Same features, held in two distinct list instances.
        Assert.assertEquals(1.f,
            CosineSimilarityUDF.cosineSimilarity(Arrays.asList("a", "b"),
                Arrays.asList("a", "b")),
            EXACT);

        // One of two features shared: 1 / (sqrt(2) * sqrt(2)).
        Assert.assertEquals(0.5f,
            CosineSimilarityUDF.cosineSimilarity(Arrays.asList("a", "b"),
                Arrays.asList("a", "c")),
            EXACT);

        // Same single feature with weights of opposite sign.
        Assert.assertEquals(-1.f,
            CosineSimilarityUDF.cosineSimilarity(Arrays.asList("a:1.0"),
                Arrays.asList("a:-1.0")),
            EXACT);

        // Partial overlap yields a strictly positive similarity.
        List<String> appleOrange = Arrays.asList("apple", "orange");
        Assert.assertTrue(CosineSimilarityUDF.cosineSimilarity(appleOrange,
            Arrays.asList("banana", "apple")) > 0.f);

        // Sharing both features must score higher than sharing only one of them.
        Assert.assertTrue(
            CosineSimilarityUDF.cosineSimilarity(appleOrange,
                Arrays.asList("banana", "orange", "apple"))
                > CosineSimilarityUDF.cosineSimilarity(appleOrange,
                    Arrays.asList("banana", "orange")));

        // Two independently tokenized copies of the same sentence.
        String sentence = "This is a sentence with seven tokens";
        List<String> sentenceTokens = Arrays.asList(sentence.split(" "));
        List<String> sentenceTokensCopy = Arrays.asList(sentence.split(" "));
        Assert.assertEquals(1.0f,
            CosineSimilarityUDF.cosineSimilarity(sentenceTokens, sentenceTokensCopy), EXACT);

        // Explicit 0/1 weights over feature indices 1..11.
        List<String> weighted1 =
                Arrays.asList("1:1,2:1,3:1,4:1,5:0,6:1,7:1,8:1,9:0,10:1,11:1".split(","));
        List<String> weighted2 =
                Arrays.asList("1:1,2:1,3:0,4:1,5:1,6:1,7:1,8:0,9:1,10:1,11:1".split(","));
        Assert.assertEquals(0.77777f,
            CosineSimilarityUDF.cosineSimilarity(weighted1, weighted2), EPSILON);

        // Bare names listing the indices that had weight 1 in the weighted vectors above.
        List<String> present1 = Arrays.asList("1 2 3 4 6 7 8 10 11".split("\\s+"));
        List<String> present2 = Arrays.asList("1 2 4 5 6 7 9 10 11".split("\\s+"));
        // One term per index 1..11: 1 where the index occurs in both lists, 0 otherwise.
        double dotp = 1 + 1 + 0 + 1 + 0 + 1 + 1 + 0 + 0 + 1 + 1;
        double norm = Math.sqrt(present1.size()) * Math.sqrt(present2.size());
        Assert.assertEquals(dotp / norm,
            CosineSimilarityUDF.cosineSimilarity(present1, present2), EPSILON);

        // The same vectors written as literal lists instead of split strings.
        Assert.assertEquals(dotp / norm,
            CosineSimilarityUDF.cosineSimilarity(
                Arrays.asList("1", "2", "3", "4", "6", "7", "8", "10", "11"),
                Arrays.asList("1", "2", "4", "5", "6", "7", "9", "10", "11")),
            EPSILON);

        // Numeric-looking bare names with no feature in common.
        Assert.assertEquals(0.f,
            CosineSimilarityUDF.cosineSimilarity(Arrays.asList("1", "2", "3"),
                Arrays.asList("4", "5")),
            EXACT);

        // Numeric-looking bare names, identical features.
        Assert.assertEquals(1.f,
            CosineSimilarityUDF.cosineSimilarity(Arrays.asList("1", "2"),
                Arrays.asList("1", "2")),
            EXACT);
    }
}