package com.cyc.tool.conceptfinder; /* * #%L * ConceptFinder * %% * Copyright (C) 2015 Cycorp, Inc * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.cyc.tool.distributedrepresentations.Word2VecSpace; import com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm; import com.cyc.tool.distributedrepresentations.Word2VecSubspace; import com.cyc.tool.owltools.OpenCycOwl; import java.io.IOException; import java.util.Comparator; import java.util.List; import java.util.function.Function; import java.util.function.Predicate; import java.util.stream.Collectors; /** * <P> * ConceptSpace provides access to a Word2VecSpace and methods for finding ConceptMatches. */ public class ConceptSpace { Word2VecSpace w2vSpace; /** * Creates a new instance of ConceptSpace. * * @param w2v * @throws java.io.IOException */ public ConceptSpace(Word2VecSpace w2v) throws IOException { w2vSpace = w2v; } /** * * @param terms * @param n * @return a List of ConceptMatches * @throws NoWordToVecVectorForTerm */ public List<ConceptMatch> findNearestNFor(List<String> terms, Integer n) throws NoWordToVecVectorForTerm { return findNearest(w2vSpace.getMaximalNormedVector(terms)) .stream() .collect(Collectors.toList()) .subList(0, n); } /** * * @param terms * @param n * @return a List of ConceptMatches * @throws NoWordToVecVectorForTerm */ public List<ConceptMatch> findNearestNFor(String terms, Integer n) throws NoWordToVecVectorForTerm { return findNearestNFor(w2vSpace.stringToList(terms), n); } /** * * @param terms * @param n * @param ocyc * @return a List of ConceptMatches * @throws NoWordToVecVectorForTerm */ public List<ConceptMatch> findNearestNForIn(List<String> terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm { float[] norm = w2vSpace.getMaximalNormedVector(terms); return findNearestWhere(norm, m -> ocyc.knownTerm(m), t -> String.join(" | ", ocyc.conceptsFor(t))) .stream() .collect(Collectors.toList()) .subList(0, n); } /** * * @param terms * @param n * @param ocyc * @return a List of ConceptMatches * @throws NoWordToVecVectorForTerm */ public List<ConceptMatch> findNearestNForIn(String terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm { return findNearestNForIn(w2vSpace.stringToList(terms), n, ocyc); } /** * * @param terms * @param n * @param ocyc * @return a List of ConceptMatches * @throws NoWordToVecVectorForTerm */ public List<ConceptMatch> findNearestNForInStrictW2V(List<String> terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm { float[] norm = w2vSpace.getGoogleNormedVector(terms); return findNearestWhere(norm, m -> ocyc.knownTerm(m), t -> String.join(" | ", ocyc.conceptsFor(t))) .stream() .collect(Collectors.toList()) .subList(0, n); } /** * * @param terms * @param n * @param ocyc * @return a List of ConceptMatches * @throws NoWordToVecVectorForTerm */ public List<ConceptMatch> findNearestNForInStrictW2V(String terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm { float[] norm = w2vSpace.getGoogleNormedVector(w2vSpace.stringToList(terms)); return findNearestWhere(norm, m -> ocyc.knownTerm(m), t -> String.join(" | ", ocyc.conceptsFor(t))) .stream() .collect(Collectors.toList()) .subList(0, n); } /** * Find the position of terms in the larger space from which this is derived a larger space, and * then search around them in a this space that spans fewer terms, but is otherwise the same * * Will fail if the space for this concept space is not a SubSpace * * @param terms The string containing a set of terms to search around * @param n How many things to find in this space * @param note * @return a List of ConceptMatches * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm */ public List<ConceptMatch> findNearestNForPosition(String terms, Integer n, Function<String, String> note) throws NoWordToVecVectorForTerm { return findNearestNForPosition(w2vSpace.stringToList(terms), n, note); } /** * Find the position of terms in the larger space from which this is derived a larger space, and * then search around them in a this space that spans fewer terms, but is otherwise the same * * Will fail if the space for this concept space is not a SubSpace * * @param terms The string containing a set of terms to search around * @param n How many things to find in this space * @param note * @return a List of ConceptMatches * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm */ public List<ConceptMatch> findNearestNForPosition(List<String> terms, Integer n, Function<String, String> note) throws NoWordToVecVectorForTerm { Word2VecSpace posSpace = ((Word2VecSubspace) w2vSpace).getSuperSpace(); return findNearestNForPosition(terms, posSpace, n, note); } /** * Find the position of terms in a larger space, and then search around them in a space that spans * fewer terms, but is otherwise the same * * @param terms The string containing a set of terms to search around * @param posSpace The other larger space in which to search for those terms. * @param n How many things to find in this space * @param note * @return a List of ConceptMatches * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm */ public List<ConceptMatch> findNearestNForPosition(String terms, Word2VecSpace posSpace, Integer n, Function<String, String> note) throws NoWordToVecVectorForTerm { return findNearestNForPosition(w2vSpace.stringToList(terms), posSpace, n, note); } /** * Find the position of terms in a larger space, and then search around them in a space that spans * fewer terms, but is otherwise the same * * @param terms The list of terms to search around * @param posSpace The other larger space in which to search for those terms. * @param n How many things to find in this space * @param note * @return a List of ConceptMatches * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm */ public List<ConceptMatch> findNearestNForPosition(List<String> terms, Word2VecSpace posSpace, Integer n, Function<String, String> note) throws NoWordToVecVectorForTerm { return findNearest(posSpace.getMaximalNormedVector(terms), note) .stream() .collect(Collectors.toList()) .subList(0, n); } /** * * @param terms * @param n * @return a List of ConceptMatches * @throws NoWordToVecVectorForTerm */ public List<ConceptMatch> findNearestNForStrictW2V(List<String> terms, Integer n) throws NoWordToVecVectorForTerm { return findNearest(w2vSpace.getGoogleNormedVector(terms)) .stream() .collect(Collectors.toList()) .subList(0, n); } /** * * @param terms * @param n * @return a List of ConceptMatches * @throws NoWordToVecVectorForTerm */ public List<ConceptMatch> findNearestNForWithInputTermFiltering(List<String> terms, Integer n) throws NoWordToVecVectorForTerm { return findNearest(w2vSpace.getMaximalNormedVector(terms)) .stream() .filter(m -> !terms.contains(m.getTerm())) // the google code removes any search term .collect(Collectors.toList()) .subList(0, n); } /** * * @param terms * @param n * @return a List of ConceptMatches * @throws NoWordToVecVectorForTerm */ public List<ConceptMatch> findNearestNForWithInputTermFilteringStrictW2V(List<String> terms, Integer n) throws NoWordToVecVectorForTerm { return findNearest(w2vSpace.getGoogleNormedVector(terms)) .stream() .filter(m -> !terms.contains(m.getTerm())) // the google code removes any search term .collect(Collectors.toList()) .subList(0, n); } /** * * @return the w2vSpace */ public Word2VecSpace getW2VSpace() { return w2vSpace; } private List<ConceptMatch> findNearest(float[] searchVector, Function<String, String> note) { Comparator<Double> compareDouble = (Double m1, Double m2) -> Double.compare(m2, m1); Comparator<ConceptMatch> compareMatches = (ConceptMatch m1, ConceptMatch m2) -> Double.compare(m2.getSimilarity(), m1.getSimilarity()); // This is a massive sort (3m elements) so it might be better to optimise // for top N return w2vSpace.getVectors().keySet().stream() .map(s -> new ConceptMatch(w2vSpace, searchVector, s, note)) .sorted(compareMatches).collect(Collectors.toList()); } private List<ConceptMatch> findNearest(float[] searchVector) { return findNearest(searchVector, null); } private List<ConceptMatch> findNearestWhere(float[] searchVector, Predicate<String> pred, Function<String, String> note) { Comparator<ConceptMatch> compareMatches = (ConceptMatch m1, ConceptMatch m2) -> Double.compare(m2.getSimilarity(), m1.getSimilarity()); // This is a massive sort (3m elements) so it might be better to optimise // for top N return w2vSpace.getVectors().keySet().parallelStream() .filter(pred) .map(s -> new ConceptMatch(w2vSpace, searchVector, s, note)) .sorted(compareMatches).collect(Collectors.toList()); } }