package com.cyc.tool.conceptfinder;
/*
* #%L
* ConceptFinder
* %%
* Copyright (C) 2015 Cycorp, Inc
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.cyc.tool.distributedrepresentations.GoogleNewsW2VSpace;
import com.cyc.tool.distributedrepresentations.Word2VecSpace;
import com.cyc.tool.owltools.OpenCycOwl;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.junit.AfterClass;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import org.junit.BeforeClass;
import org.junit.Test;
import org.semanticweb.owlapi.model.OWLOntologyCreationException;
/**
 * Integration tests for {@link MissingConceptFinder}.
 *
 * <p>The shared fixtures are built once per class in {@link #setUpClass()} from
 * {@code GoogleNewsW2VSpace.get()} and a fresh {@code OpenCycOwl}, and released in
 * {@link #tearDownClass()}. Several tests are smoke tests: they only exercise a call path and
 * print what they find, so they pass as long as no exception escapes.
 */
public class MissingConceptFinderIT {

  /** Number of nearest neighbours requested from the concept space in every query. */
  private static final int NEIGHBOUR_COUNT = 40;

  static ConceptSpace cSpace;
  /** Query terms shared by the {@code findNearbyTerms1..4} tests. */
  static List<String> cr = Arrays.asList("Chinese", "river");
  static MissingConceptFinder mcf;
  static Word2VecSpace mySpace;
  static OpenCycOwl ocyc;
  static List<String> pelagicBird = Arrays.asList("pelagic", "bird");

  public MissingConceptFinderIT() {
  }

  /**
   * Builds the word2vec space, the concept space over it, the OpenCyc ontology and the finder
   * under test.
   *
   * @throws IOException propagated from fixture construction
   * @throws OWLOntologyCreationException propagated from fixture construction
   */
  @BeforeClass
  public static void setUpClass() throws IOException, OWLOntologyCreationException {
    mySpace = GoogleNewsW2VSpace.get();
    cSpace = new ConceptSpace(mySpace);
    ocyc = new OpenCycOwl();
    mcf = new MissingConceptFinderDefault(mySpace, ocyc, cSpace);
  }

  /**
   * Closes the ontology (if it was created) and drops every fixture reference.
   *
   * <p>The null guard matters when {@link #setUpClass()} fails part-way: without it the
   * tear-down would throw a {@code NullPointerException} that hides the real setup failure.
   */
  @AfterClass
  public static void tearDownClass() {
    try {
      if (ocyc != null) {
        ocyc.close();
      }
    } finally {
      mcf = null;
      cSpace = null;
      ocyc = null;
      mySpace = null;
    }
  }

  /**
   * Renders the missing terms of the given concept indices for log output.
   *
   * @param s indices into {@code mcf.getMissingTerms()}
   * @return the terms of each index joined with {@code ","}, indices separated by {@code ";"};
   *     the empty string when more than ten indices are given, to keep the log line short
   */
  private static String set2String(Set<Integer> s) {
    if (s.size() > 10) {
      return "";
    }
    return s.stream()
        .map(index -> String.join(",", mcf.getMissingTerms().get(index)))
        .collect(Collectors.joining(";"));
  }

  /** Returns the missing terms stored under {@code index} as a list view. */
  private static List<String> missingTermsAt(int index) {
    return Arrays.asList(mcf.getMissingTerms().get(index));
  }

  /** Prints each match on its own line, prefixed with its position in the list. */
  private static void printMatches(List<ConceptMatch> matches) {
    for (int i = 0; i < matches.size(); i++) {
      System.out.println(i + " " + matches.get(i).toString());
    }
  }

  /**
   * Runs the shared "Chinese river" query restricted to OpenCyc, printing the label, every
   * match and the elapsed time.
   *
   * @param label tag printed before the query so the output of each test can be told apart
   * @return the matches returned by the concept space
   * @throws Word2VecSpace.NoWordToVecVectorForTerm propagated from the concept space query
   */
  private static List<ConceptMatch> nearestForChineseRiver(String label)
      throws Word2VecSpace.NoWordToVecVectorForTerm {
    long start = System.currentTimeMillis();
    System.out.println(label);
    List<ConceptMatch> matches = cSpace.findNearestNForIn(cr, NEIGHBOUR_COUNT, ocyc);
    printMatches(matches);
    System.out.println("Took " + (System.currentTimeMillis() - start) + "ms");
    return matches;
  }

  /** Expects that no missing concept has an associated KB term. */
  @Test
  public void conceptsWithTermsTest() {
    List<String> res = mcf.conceptsWithTerms();
    System.out.println("There are " + res.size() + " missing concepts with associated KB terms: " + res);
    assertTrue(res.size() + " elements, expected none", res.isEmpty());
    // assertTrue(res.containsAll(Arrays.asList("start", "rust", "blueberry")));
  }

  /** The best match for the shared query is expected to be "Chinese". */
  @Test
  public void findNearbyTerms1() {
    try {
      List<ConceptMatch> matches = nearestForChineseRiver("FNT1");
      assertEquals("Chinese", matches.get(0).term);
    } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
      fail("took unexpected exception:" + ex);
    }
  }

  /** Pins the similarity score of the match at position 13 for the shared query. */
  @Test
  public void findNearbyTerms2() {
    try {
      List<ConceptMatch> matches = nearestForChineseRiver("FNT2");
      assertEquals(0.5539201713461387, matches.get(13).similarity, 0.000001);
    } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
      fail("took unexpected exception:" + ex);
    }
  }

  /** The match at position 7 for the shared query is expected to be "creek". */
  @Test
  public void findNearbyTerms3() {
    try {
      List<ConceptMatch> matches = nearestForChineseRiver("FNT3");
      assertEquals("creek", matches.get(7).term);
    } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
      fail("took unexpected exception:" + ex);
    }
  }

  /** The match at position 12 for the shared query is expected to be "riverbank". */
  @Test
  public void findNearbyTerms4() {
    try {
      List<ConceptMatch> matches = nearestForChineseRiver("FNT4");
      assertEquals("riverbank", matches.get(12).term);
    } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
      fail("took unexpected exception:" + ex);
    }
  }

  /** Smoke test: runs the graph search for every missing term of concepts 3 through 6. */
  @Test
  public void findNearbyTermsWithGraphListTest() {
    System.out.println("FNT WG 3");
    for (int ti = 3; ti <= 6; ti++) {
      for (String ss : missingTermsAt(ti)) {
        mcf.findNearbyTermsWithGraphCore(ss, ti);
      }
    }
    assertTrue(true);
  }

  /** Smoke test: graph search for a single phrase. */
  @Test
  public void findNearbyTermsWithGraphTest1() {
    System.out.println("FNT WG 1");
    mcf.findNearbyTermsWithGraphCore("pelagic bird");
    assertTrue(true);
  }

  /** Smoke test: graph search for a single phrase. */
  @Test
  public void findNearbyTermsWithGraphTest2() {
    System.out.println("FNT WG 2");
    mcf.findNearbyTermsWithGraphCore("tobacco shop");
    assertTrue(true);
  }

  /** Smoke test: several graph searches in a row on the same finder. */
  @Test
  public void findNearbyTermsWithGraphTest3() {
    System.out.println("FNT WG 3");
    String[] phrases = {
      "pelagic bird", "tobacco shop", "net melon", "glowworm", "tightrope walking",
      "Adelie penguin"
    };
    for (String phrase : phrases) {
      mcf.findNearbyTermsWithGraphCore(phrase);
    }
    assertTrue(true);
  }

  /** Expects exactly one attachment hypothesis for "Adelie penguin". */
  @Test
  public void findNearbyTermsWithGraphTest4() {
    System.out.println("FNT WG 4");
    Set<AttachmentHypothesis> hyp = mcf.findNearbyTermsWithGraphCore("Adelie penguin");
    System.out.println("HYP" + hyp);
    assertEquals(1, hyp.size());
  }

  /** Smoke test: OpenCyc-restricted lookup for the missing terms of concepts 0 through 3. */
  @Test
  public void findSomeMissingTerms1() {
    for (int ti = 0; ti <= 3; ti++) {
      for (String ss : missingTermsAt(ti)) {
        lookItUpWithOcyc(ss);
      }
    }
    assertTrue(true);
  }

  /** Smoke test: OpenCyc-restricted lookup for the missing terms of concepts 1, 5 and 7. */
  @Test
  public void findSomeMissingTerms2() {
    //See https://docs.google.com/a/cyc.com/document/d/1Lwi21-yxcC0DGKJMcc4GFN3M_DBzEDAcSNjYufCRIfE/edit
    for (int ti : new int[] {1, 5, 7}) {
      for (String ss : missingTermsAt(ti)) {
        lookItUpWithOcyc(ss);
      }
    }
    assertTrue(true);
  }

  /** Smoke test: unrestricted lookup for the missing terms of concepts 2, 3 and 6. */
  @Test
  public void findSomeMissingTerms3() {
    //See https://docs.google.com/a/cyc.com/document/d/1Lwi21-yxcC0DGKJMcc4GFN3M_DBzEDAcSNjYufCRIfE/edit
    for (int ti : new int[] {2, 3, 6}) {
      for (String ss : missingTermsAt(ti)) {
        lookItUpAllW2V(ss);
      }
    }
    assertTrue(true);
  }

  /**
   * Classifies every missing concept by whether its terms are known to the word2vec space.
   *
   * <p>A concept index can land in both {@code found} and {@code unfound} when only some of
   * its terms are known; the sets are per-concept, not per-term.
   */
  @Test
  public void howManyMissingTermsInW2V() throws IOException {
    final Set<Integer> found = new HashSet<>();
    final Set<Integer> foundSpace = new HashSet<>();
    final Set<Integer> unfound = new HashSet<>();
    for (Integer index : mcf.getMissingTerms().keySet()) {
      for (String ss : missingTermsAt(index)) {
        if (!mySpace.knownTerm(ss)) {
          unfound.add(index);
          continue;
        }
        found.add(index);
        if (ss.contains(" ")) {
          foundSpace.add(index);
        }
      }
    }
    System.out.println("Found directly in W2V : " + found.size() + " " + set2String(found));
    System.out.println("Found directly in W2V with space: " + foundSpace.size() + " " + set2String(foundSpace));
    System.out.println("Not found in W2V : " + unfound.size() + " " + set2String(unfound));
    assertEquals(2, foundSpace.size());
    assertEquals(8, unfound.size());
  }

  /** Smoke test: prints the missing terms of concepts 0 through 8. */
  @Test
  public void listSomeTest() {
    for (int i = 0; i <= 8; i++) {
      System.out.println(i + ":\t" + String.join(", ", missingTermsAt(i)));
    }
    assertTrue(true);
  }

  // @Test
  // public void namesInW2VTest() {
  // List<String> res;
  // res = mcf.namesInW2V();
  // assertEquals(12343, res.size());
  // }

  /** Expects the finder to report nine missing concepts. */
  @Test
  public void missingConceptCountTest() {
    assertEquals(9, mcf.missingConceptCount());
  }

  /**
   * Prints the nearest neighbours of {@code ss} in the whole concept space. For neighbours that
   * OpenCyc knows, the {@code "---"} placeholder in the match text is replaced by the OpenCyc
   * concepts for that term.
   *
   * <p>A term without a word2vec vector is reported on stdout rather than failing the test.
   *
   * @param ss whitespace-separated phrase to look up
   */
  private void lookItUpAllW2V(String ss) {
    try {
      System.out.println("=======[" + ss + "]=======");
      long start = System.currentTimeMillis();
      List<ConceptMatch> matches
          = cSpace.findNearestNFor(Arrays.asList(ss.split("\\s+")), NEIGHBOUR_COUNT);
      System.out.println("Matches:" + (matches == null ? "null" : matches.size()));
      // The line above already allows for a null result, so do not dereference it below.
      if (matches != null) {
        for (int i = 0; i < matches.size(); i++) {
          String matchTerm = matches.get(i).term;
          String description = matches.get(i).toString();
          if (ocyc.knownTerm(matchTerm)) {
            description = description.replace("---",
                String.join(" | ", ocyc.conceptsFor(matchTerm)));
          }
          System.out.println(i + " " + description);
        }
      }
      System.out.println("Took " + (System.currentTimeMillis() - start) + "ms");
    } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
      System.out.println("--- position not known in word to vec space:[" + ss + "]");
    }
  }

  /**
   * Prints the nearest neighbours of {@code ss}, restricted to OpenCyc via
   * {@code findNearestNForIn}.
   *
   * <p>A term without a word2vec vector is reported on stdout rather than failing the test.
   *
   * @param ss whitespace-separated phrase to look up
   */
  private void lookItUpWithOcyc(String ss) {
    try {
      System.out.println("=======[" + ss + "]=======");
      long start = System.currentTimeMillis();
      List<ConceptMatch> matches
          = cSpace.findNearestNForIn(Arrays.asList(ss.split("\\s+")), NEIGHBOUR_COUNT, ocyc);
      System.out.println("Matches:" + (matches == null ? "null" : matches.size()));
      // The line above already allows for a null result, so do not dereference it below.
      if (matches != null) {
        printMatches(matches);
      }
      System.out.println("Took " + (System.currentTimeMillis() - start) + "ms");
    } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
      System.out.println("--- position not known in word to vec space:[" + ss + "]");
    }
  }
}