package com.cyc.tool.conceptfinder;
/*
* #%L
* ConceptFinder
* %%
* Copyright (C) 2015 Cycorp, Inc
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.cyc.tool.distributedrepresentations.Word2VecSpace;
import com.cyc.tool.owltools.OpenCycOwl;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.mapdb.DB;
import org.mapdb.DBMaker;
import org.semanticweb.owlapi.model.OWLOntologyCreationException;
/**
 * Methods for finding missing concepts with a ConceptSpace, a Word2VecSpace, and OpenCyc.
 *
 * <p>
 * Each instance opens a MapDB file database at the location given by
 * {@code ConceptFinderConfig.getMissingConceptDBFile()}. The collections exposed through the
 * getters and setters are not populated by this class itself; they are expected to be set by
 * subclasses or by other classes in this package.
 */
public abstract class MissingConceptFinder {

  /**
   * Number of nearest neighbours requested from the concept space for each term.
   */
  private static final int NEAREST_MATCHES_PER_TERM = 40;

  private final ConceptSpace cSpace;
  private final OpenCycOwl ocyc;
  private final Word2VecSpace w2vs;
  ConcurrentNavigableMap<Integer, List<ConceptMatch>> conceptsForMissingTerms;
  DB db;
  List<String[]> missingConceptNames;
  List<String[]> missingMappingNames;
  ConcurrentNavigableMap<Integer, String[]> missingTerms;

  /**
   * Creates a finder without a concept space.
   *
   * <p>
   * A finder built this way cannot look up nearest concepts:
   * {@link #findNearbyTermsWithGraphCore(List, int)} throws an {@link IllegalStateException}
   * when it is given at least one term.
   *
   * @param w2v the Word2Vec space used to test whether terms are known
   * @param oco the OpenCyc OWL ontology used for type and label lookups
   * @throws IOException declared for callers; see the three-argument constructor
   * @throws OWLOntologyCreationException declared for callers; see the three-argument constructor
   */
  public MissingConceptFinder(Word2VecSpace w2v, OpenCycOwl oco) throws IOException, OWLOntologyCreationException {
    this(w2v, oco, null);
  }

  /**
   * Creates a finder and opens the missing-concept database file.
   *
   * @param w2v the Word2Vec space used to test whether terms are known
   * @param oco the OpenCyc OWL ontology used for type and label lookups
   * @param cSpace the concept space searched for nearest matches; may be {@code null}, in which
   * case nearest-match searches are unavailable
   * @throws IOException declared on the signature; not thrown directly by the statements in this
   * constructor
   * @throws OWLOntologyCreationException declared on the signature; not thrown directly by the
   * statements in this constructor
   */
  public MissingConceptFinder(Word2VecSpace w2v, OpenCycOwl oco, ConceptSpace cSpace) throws IOException, OWLOntologyCreationException {
    w2vs = w2v;
    ocyc = oco;
    this.cSpace = cSpace;
    db = DBMaker.newFileDB(new File(ConceptFinderConfig.getMissingConceptDBFile()))
            .closeOnJvmShutdown()
            .make();
    // To reset the stored data, clear the map and commit:
    //   missingTerms.clear(); db.commit();
  }

  /**
   * Builds one string per key of {@link #getConceptsForMissingTerms()}, in the key order of that
   * map. Each string is the entry of {@link #getMissingTerms()} stored under the same key, with
   * its elements joined by {@code "|"}.
   *
   * <p>
   * Both maps must be set, and every key of the concept map must also be present in the
   * missing-terms map; otherwise a {@link NullPointerException} is thrown.
   *
   * @return a List of Strings, one per key, each holding the pipe-separated terms for that key
   */
  public List<String> conceptsWithTerms() {
    return getConceptsForMissingTerms().keySet().stream()
            .map(i -> String.join("|", getMissingTerms().get(i)))
            .collect(Collectors.toList());
  }

  /**
   * @return the conceptsForMissingTerms
   */
  public ConcurrentNavigableMap<Integer, List<ConceptMatch>> getConceptsForMissingTerms() {
    return conceptsForMissingTerms;
  }

  /**
   * @param conceptsForMissingTerms the conceptsForMissingTerms to set
   */
  public void setConceptsForMissingTerms(ConcurrentNavigableMap<Integer, List<ConceptMatch>> conceptsForMissingTerms) {
    this.conceptsForMissingTerms = conceptsForMissingTerms;
  }

  /**
   * @return the db opened by the constructor
   */
  public DB getDb() {
    return db;
  }

  /**
   * @return the missingConceptNames
   */
  public List<String[]> getMissingConceptNames() {
    return missingConceptNames;
  }

  /**
   * @param missingConceptNames the missingConceptNames to set
   */
  public void setMissingConceptNames(List<String[]> missingConceptNames) {
    this.missingConceptNames = missingConceptNames;
  }

  /**
   * @return the missingMappingNames
   */
  public List<String[]> getMissingMappingNames() {
    return missingMappingNames;
  }

  /**
   * @param missingMappingNames the missingMappingNames to set
   */
  public void setMissingMappingNames(List<String[]> missingMappingNames) {
    this.missingMappingNames = missingMappingNames;
  }

  /**
   * @return the missingTerms
   */
  public ConcurrentNavigableMap<Integer, String[]> getMissingTerms() {
    return missingTerms;
  }

  /**
   * @param missingTerms the missingTerms to set
   */
  public void setMissingTerms(ConcurrentNavigableMap<Integer, String[]> missingTerms) {
    this.missingTerms = missingTerms;
  }

  /**
   * Counts the entries of {@link #getMissingConceptNames()}, which must have been set.
   *
   * @return the number of missing concepts
   */
  public int missingConceptCount() {
    return getMissingConceptNames().size();
  }

  /**
   * Runs the graph-core search for a single term, passing {@code -1} as the index so that no
   * index is printed in the progress header.
   *
   * @param testCase the term to search for
   * @return a Set of AttachmentHypotheses
   */
  protected Set<AttachmentHypothesis> findNearbyTermsWithGraphCore(String testCase) {
    return findNearbyTermsWithGraphCore(testCase, -1);
  }

  /**
   * Proposes attachment points for a group of term strings.
   *
   * <p>
   * The nearest concept matches of every term are collected, and the similarities of matches
   * that name the same concept are summed. Each matched concept then adds its summed similarity
   * to the evidence of its parent types. One hypothesis is produced for every parent whose
   * evidence equals the maximum, so ties yield more than one hypothesis.
   *
   * <p>
   * Progress and results are printed to standard output.
   *
   * @param termStrings the alternative strings for the term being attached
   * @param n an index passed through to each hypothesis; it is printed in the header only when
   * it is not negative
   * @return a Set of AttachmentHypotheses, empty when none of the terms produced a match or no
   * parent type was found
   * @throws IllegalStateException if terms are supplied but this finder has no concept space
   */
  protected Set<AttachmentHypothesis>
          findNearbyTermsWithGraphCore(List<String> termStrings, int n) {
    long t1 = System.currentTimeMillis();
    Set<AttachmentHypothesis> hypotheses = new HashSet<>();
    System.out.print("====" + String.join("/", termStrings) + "====" + (n < 0 ? "" : " " + n) + " \t");

    List<ConceptMatch> matches = collectNearestMatches(termStrings);
    if (matches.isEmpty()) {
      System.out.println("Terms [" + termStrings + "] have no words in Word2Vec");
      return hypotheses; // which is empty at this point
    }

    Map<String, Double> typeWeights = sumSimilarityByConcept(matches);
    Map<String, Double> conceptEvidence = accumulateParentEvidence(typeWeights);

    final double max = conceptEvidence.values().stream()
            .mapToDouble(Double::doubleValue).max().orElse(0);
    // Exact comparison is intended: max is itself one of the stored values.
    Set<String> maxc = conceptEvidence.entrySet().stream()
            .filter(e -> e.getValue() == max)
            .map(Map.Entry::getKey)
            .collect(Collectors.toSet());

    System.out.println("Maximum parent count:" + max);
    System.out.println("Maximal parents:"
            + maxc.stream().map(s -> ocyc.labelsForConcept(s) + ": " + s)
                    .collect(Collectors.joining("\n\t")));
    maxc.forEach(c -> hypotheses.add(new AttachmentHypothesis(n, termStrings,
            c, max, ocyc.labelsForConcept(c))));
    System.out.println("-----" + (System.currentTimeMillis() - t1) + "ms -----");
    return hypotheses;
  }

  /**
   * Gathers the nearest concept matches of every term, skipping terms without a vector.
   */
  private List<ConceptMatch> collectNearestMatches(List<String> termStrings) {
    if (cSpace == null && !termStrings.isEmpty()) {
      throw new IllegalStateException(
              "No ConceptSpace was supplied to this MissingConceptFinder; cannot search for nearest concepts");
    }
    List<ConceptMatch> matches = new ArrayList<>();
    for (String term : termStrings) {
      try {
        matches.addAll(cSpace.findNearestNForIn(term, NEAREST_MATCHES_PER_TERM, ocyc));
      } catch (Word2VecSpace.NoWordToVecVectorForTerm ignored) {
        // Best effort: a term without a vector contributes no matches. The caller reports the
        // case where no term at all produced a match.
      }
    }
    return matches;
  }

  /**
   * Sums the similarity of all matches that carry the same non-null concept.
   */
  private Map<String, Double> sumSimilarityByConcept(List<ConceptMatch> matches) {
    Map<String, Double> typeWeights = new HashMap<>();
    for (ConceptMatch m : matches) {
      if (m.concept != null) {
        double similarity = m.similarity;
        typeWeights.merge(m.concept, similarity, Double::sum);
      }
    }
    return typeWeights;
  }

  /**
   * Adds each matched concept's weight to the evidence of its parent types.
   */
  private Map<String, Double> accumulateParentEvidence(Map<String, Double> typeWeights) {
    Map<String, Double> conceptEvidence = new HashMap<>();
    typeWeights.forEach((concept, weight) -> {
      Set<String> transTypes = ocyc.getTypesTransitiveURL(concept);
      Set<String> immedTypes = ocyc.getTypesURL(concept);
      // Transitive types count only when they are matched concepts themselves; immediate types
      // always count. Collecting into a set ensures each parent is credited once per concept.
      Set<String> parents = Stream.concat(
              transTypes.stream().filter(typeWeights::containsKey),
              immedTypes.stream())
              .collect(Collectors.toSet());
      parents.forEach(parent -> conceptEvidence.merge(parent, weight, Double::sum));
    });
    return conceptEvidence;
  }

  /**
   * Runs the graph-core search for a single term by wrapping it in a one-element list.
   *
   * @param testCase the term to search for
   * @param n an index passed through to each hypothesis
   * @return a Set of AttachmentHypotheses
   * @deprecated use {@link #findNearbyTermsWithGraphCore(List, int)} instead
   */
  @Deprecated
  protected Set<AttachmentHypothesis> findNearbyTermsWithGraphCore(String testCase, int n) {
    List<String> termStrings = new ArrayList<>();
    termStrings.add(testCase);
    return findNearbyTermsWithGraphCore(termStrings, n);
  }

  /**
   * Lists the first element of every entry of {@link #getMissingMappingNames()} that has at
   * least one element known to the Word2Vec space.
   *
   * @return a List of names in the W2V space, or {@code null} when the missing mapping names
   * have not been set
   * @deprecated depends on a variable that is only set in an initialisation phase
   */
  @Deprecated //Depends on a variable that is only set in an initialisation phase
  protected List<String> namesInW2V() {
    if (getMissingMappingNames() == null) {
      return null;
    }
    return getMissingMappingNames().stream()
            .filter(hasElementInW2V())
            .map(a -> a[0])
            .collect(Collectors.toList());
  }

  /**
   * @return a predicate that accepts a String array when at least one of its elements is a known
   * term of the Word2Vec space
   */
  Predicate<String[]> hasElementInW2V() {
    return a -> Arrays.stream(a)
            .anyMatch(w2vs::knownTerm);
  }
}