/**
* Copyright (C) 2013 Isabel Drost-Fromm
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.isabeldrostfromm.sof.naive;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.mahout.math.Vector;
import org.junit.Test;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import com.carrotsearch.randomizedtesting.annotations.Repeat;
import com.google.common.collect.Sets;
import de.isabeldrostfromm.sof.naive.Document;
import de.isabeldrostfromm.sof.naive.Vectoriser;
/**
 * Tests for {@link Vectoriser}: checks that documents are turned into vectors
 * deterministically, that differing bodies yield differing vectors, and that
 * the second argument of {@code Document.of} (referred to as the "state" field
 * by the assertions below) does not influence the resulting vector.
 *
 * <p>Randomised tests draw their input from the {@link RandomizedTest}
 * generators and are repeated several times per run.</p>
 */
public class VectoriserTest extends RandomizedTest {

    /**
     * A document with a single body term is expected to produce a vector with
     * exactly two non-default elements.
     */
    @Test
    public void testBodyVectorisation() {
        Vectoriser vectorise = new Vectoriser();
        Document doc = bodyOnlyDocument("first");
        Vector vec = vectorise.vectorise(doc);
        assertEquals("Adding one term should result in two dimensions set to one.",
                2,
                vec.getNumNondefaultElements());
    }

    /** Vectorising the very same document twice must give equal vectors. */
    @Test
    public void testBodySingleWord() {
        Vectoriser vectorise = new Vectoriser();
        Document doc = bodyOnlyDocument("first");
        Vector first = vectorise.vectorise(doc);
        Vector second = vectorise.vectorise(doc);
        assertEquals("Adding docs with same content should result in same vector.",
                first,
                second);
    }

    /** Two documents whose single body term differs must give different vectors. */
    @Test
    public void testBodySingleDifferentWord() {
        Vectoriser vectorise = new Vectoriser();
        Document firstDoc = bodyOnlyDocument("first");
        Document secondDoc = bodyOnlyDocument("second");
        Vector first = vectorise.vectorise(firstDoc);
        Vector second = vectorise.vectorise(secondDoc);
        assertNotEquals("Adding docs with different content should result in different vectors.",
                first,
                second);
    }

    /**
     * A document with two distinct body terms is expected to produce a vector
     * with exactly four non-default elements.
     */
    @Test
    public void testBodyVectorisation2Terms() {
        Vectoriser vectorise = new Vectoriser();
        Document doc = bodyOnlyDocument("first second");
        Vector vec = vectorise.vectorise(doc);
        assertEquals("Adding two terms should result in four non-default dimensions.",
                4,
                vec.getNumNondefaultElements());
    }

    /**
     * Documents that share title, reputation and tags but have different random
     * bodies must not be mapped to equal vectors.
     */
    @Repeat(iterations = 10)
    @Test
    public void testBodyUsage() throws IOException {
        String firstBody = randomText(10, 2000, 2, 100);
        String secondBody = randomText(10, 2000, 2, 100);
        // Re-draw until the two bodies really differ as strings.
        while (firstBody.equals(secondBody)) {
            secondBody = randomText(10, 2000, 2, 100);
        }
        String title = randomText(10, 2000, 2, 10);
        Set<String> tags = Sets.newHashSet(randomText(10, 2000, 1, 1));
        double reputation = randomDouble();

        Document firstDoc = Document.of(firstBody, "", title, reputation, tags);
        Document secondDoc = Document.of(secondBody, "", title, reputation, tags);

        Vectoriser vectorise = new Vectoriser();
        Vector first = vectorise.vectorise(firstDoc);
        Vector second = vectorise.vectorise(secondDoc);
        assertNotEquals("Documents with different body should have different vectors.",
                first,
                second);
    }

    /**
     * Documents that differ only in the second {@code Document.of} argument
     * (the state) must be mapped to equal vectors.
     */
    @Repeat(iterations = 10)
    @Test
    public void testNoTargetLeakage() {
        String body = randomText(10, 2000, 2, 100);
        String title = randomText(10, 2000, 2, 100);
        Set<String> tags = Sets.newHashSet(randomText(10, 2000, 1, 1));
        double reputation = randomDouble();

        Document firstDoc = Document.of(body, "first", title, reputation, tags);
        Document secondDoc = Document.of(body, "second", title, reputation, tags);

        Vectoriser vectorise = new Vectoriser();
        Vector first = vectorise.vectorise(firstDoc);
        Vector second = vectorise.vectorise(secondDoc);
        assertEquals("The state field should not be taken into consideration when creating vectors.",
                first,
                second);
    }

    /**
     * Creates a document that carries only the given body: the two other string
     * arguments are empty, the numeric argument is {@code 0.0} and the tag set
     * is a fresh empty set.
     *
     * @param body the text passed as first argument to {@code Document.of}
     * @return the new document
     */
    private static Document bodyOnlyDocument(String body) {
        return Document.of(body, "", "", 0.0, new HashSet<String>());
    }

    /**
     * Builds a random text consisting of random ASCII tokens, each followed by
     * a single space (so the result ends with a trailing space).
     *
     * @param minTokenLength minimum length of each token
     * @param maxTokenLength maximum length of each token
     * @param minTokens lower bound passed to {@code randomIntBetween} for the token count
     * @param maxTokens upper bound passed to {@code randomIntBetween} for the token count
     * @return the generated text
     */
    private String randomText(int minTokenLength, int maxTokenLength, int minTokens, int maxTokens) {
        // Local, single-threaded accumulator: StringBuilder is sufficient.
        StringBuilder result = new StringBuilder();
        int tokens = randomIntBetween(minTokens, maxTokens);
        for (int i = 0; i < tokens; i++) {
            result.append(randomAsciiOfLengthBetween(minTokenLength, maxTokenLength));
            result.append(" ");
        }
        return result.toString();
    }
}