/* * Seldon -- open source prediction engine * ======================================= * * Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/) * * ******************************************************************************************** * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * ******************************************************************************************** */ package io.seldon.semvec; import java.util.Enumeration; import java.util.LinkedList; import java.util.Set; import java.util.logging.Logger; import pitt.search.semanticvectors.CompoundVectorBuilder; import pitt.search.semanticvectors.FlagConfig; import pitt.search.semanticvectors.LuceneUtils; import pitt.search.semanticvectors.ObjectVector; import pitt.search.semanticvectors.SearchResult; import pitt.search.semanticvectors.VectorStore; import pitt.search.semanticvectors.vectors.Vector; import pitt.search.semanticvectors.vectors.ZeroVectorException; abstract public class VectorStoreRecommender { private static final Logger logger = Logger.getLogger(VectorStoreRecommender.class.getCanonicalName()); private VectorStore queryVecStore; private VectorStore searchVecStore; private LuceneUtils luceneUtils; private Set<String> exclusions; private Set<String> inclusions; private String minDoc; /** * This needs to be filled in for each subclass. It takes an individual * vector and assigns it a relevance score for this VectorSearcher. */ public abstract double getScore(Vector testVector); public abstract double getScore(Vector v1,Vector v2); /** * Performs basic initialization; subclasses should normally call super() to use this. * @param queryVecStore Vector store to use for query generation. * @param searchVecStore The vector store to search. * @param luceneUtils LuceneUtils object to use for query weighting. (May be null.) */ public VectorStoreRecommender(VectorStore queryVecStore, VectorStore searchVecStore, LuceneUtils luceneUtils, Set<String> exclusions, Set<String> inclusions, String minDoc) { this.queryVecStore = queryVecStore; this.searchVecStore = searchVecStore; this.luceneUtils = luceneUtils; this.exclusions = exclusions; this.inclusions = inclusions; this.minDoc = minDoc; } /** * This nearest neighbor search is implemented in the abstract * VectorSearcher class itself: this enables all subclasses to reuse * the search whatever scoring method they implement. Since query * expressions are built into the VectorSearcher, * getNearestNeighbors no longer takes a query vector as an * argument. * @param numResults the number of results / length of the result list. */ public LinkedList<SearchResult> getNearestNeighbors(int numResults) { LinkedList<SearchResult> results = new LinkedList<>(); double score, threshold = -1; int duplicatesRemoved = 0; Enumeration<ObjectVector> vecEnum = searchVecStore.getAllVectors(); while (vecEnum.hasMoreElements()) { ObjectVector testElement = vecEnum.nextElement(); // ignore excluded items if (exclusions.contains(testElement.getObject().toString())) continue; //only allow includions if specified if (inclusions != null && inclusions.size()>0 && !inclusions.contains(testElement.getObject().toString())) continue; // ignore items greater than minDoc id (assume doc ids string ordering is useful) if (minDoc != null && testElement.getObject().toString().compareTo(minDoc) < 0) continue; // Initialize result list if just starting. if (results.size() == 0) { score = getScore(testElement.getVector()); results.add(new SearchResult(score, testElement)); continue; } // Test this element. score = getScore(testElement.getVector()); // This is a way of using the Lucene Index to get term and // document frequency information to reweight all results. It // seems to be good at moving excessively common terms further // down the results. Note that using this means that scores // returned are no longer just cosine similarities. if (this.luceneUtils != null) { score = score * luceneUtils.getGlobalTermWeightFromString((String) testElement.getObject()); } if (score > threshold) { boolean added = false; boolean duplicate = false; for (int i = 0; i < results.size() && !added && !duplicate; ++i) { SearchResult r = results.get(i); if (score == r.getScore()) { ObjectVector rVec = (ObjectVector) r.getObjectVector(); double overlap = getScore(rVec.getVector(),testElement.getVector()); double epsilon = Math.abs(overlap-1); if (epsilon < 0.000001) { duplicatesRemoved++; duplicate = true; continue; } } // Add to list if this is right place. if (score > r.getScore() && added == false) { results.add(i, new SearchResult(score, testElement)); added = true; } } // Prune list if there are already numResults. if (results.size() > numResults) { results.removeLast(); threshold = results.getLast().getScore(); } else { if (results.size() < numResults && !added && !duplicate) { results.add(new SearchResult(score, testElement)); } } } } if (duplicatesRemoved > 0) logger.info("removed "+duplicatesRemoved+" duplicates"); return results; } /** * Class for searching a vector store using cosine similarity. * Takes a sum of positive query terms and optionally negates some terms. */ static public class VectorStoreRecommenderCosine extends VectorStoreRecommender { Vector queryVector; /** * @param queryVecStore Vector store to use for query generation. * @param searchVecStore The vector store to search. * @param luceneUtils LuceneUtils object to use for query weighting. (May be null.) * @param queryTerms Terms that will be parsed into a query * expression. If the string "NOT" appears, terms after this will be negated. */ public VectorStoreRecommenderCosine(VectorStore queryVecStore, VectorStore searchVecStore, LuceneUtils luceneUtils, String[] queryTerms, Set<String> exclusions, Set<String> inclusions, String minDoc) throws ZeroVectorException { super(queryVecStore, searchVecStore, luceneUtils, exclusions,inclusions,minDoc); this.queryVector = CompoundVectorBuilder.getQueryVector(queryVecStore, luceneUtils, FlagConfig.getFlagConfig(null), queryTerms); if (this.queryVector.isZeroVector()) { throw new ZeroVectorException("Query vector is zero ... no results."); } } @Override public double getScore(Vector testVector) { //testVector = VectorUtils.getNormalizedVector(testVector); return this.queryVector.measureOverlap(testVector); } @Override public double getScore(Vector v1, Vector v2) { return v1.measureOverlap(v2); } } }