/*
 * Seldon -- open source prediction engine
 * =======================================
 *
 * Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/)
 *
 * ********************************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * ********************************************************************************************
 */

package io.seldon.clustering.recommender;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import io.seldon.semvec.DocumentIdTransform;
import io.seldon.semvec.SemVectorResult;
import io.seldon.semvec.StringTransform;
import io.seldon.sv.SemanticVectorsManager;
import io.seldon.sv.SemanticVectorsStore;
import io.seldon.util.CollectionTools;
import org.apache.log4j.Logger;

/**
 * Map tags (could be Facebook likes) to clusters using a semantic vectors store that has all the article texts for
 * each user dimension. We find for all the tags which user_dim documents match the best and use these as the clusters.
 * <p>
 * NOTE: the lookup of the semantic vectors store inside {@link #suggestClusters(long, Set)} is currently commented
 * out, so that method always returns an empty list until the lookup is restored.
 *
 * @author rummble
 *
 */
public class TagToClusterPeer {

    private static final Logger logger = Logger.getLogger(TagToClusterPeer.class.getName());

    // Passed as the final argument of every store search; presumably the maximum number of results
    // per search -- confirm against SemanticVectorsStore.
    private static final int NUM_CLUSTERS_PER_SEARCH = 10;
    // Limit handed to CollectionTools.sortMapAndLimitToList when picking the dimensions to return.
    private static final int NUM_CLUSTERS_RETURNED = 10;
    // Tags made of more whitespace-separated words than this are not searched at all.
    private static final int MAX_WORDS_PER_TAG = 3;
    // A search result only contributes to a dimension's total when its score is strictly above this value.
    private static final double MIN_RESULT_SCORE = 0.5;
    // If the results are bad they will affect recommendations for other users as this user's views will begin to be
    // seen in the clusters they are assigned. Thus it is safer to give a low maximum weight.
    private static final double MAX_WEIGHT = 0.5;

    // Client name; only referenced by the (currently commented-out) store lookup in suggestClusters.
    private final String client;

    /**
     * @param client name of the client this peer works for
     */
    public TagToClusterPeer(String client) {
        this.client = client;
    }

    /**
     * Turn a tag into a search term: lower-case it, trim it and join its whitespace-separated words with
     * underscores.
     *
     * @param tag the raw tag, may be {@code null}
     * @return the search term, or {@code null} when the tag is {@code null}, blank or has more than
     *         {@link #MAX_WORDS_PER_TAG} words
     */
    private static String getSearchTerm(String tag) {
        if (tag == null) {
            return null;
        }
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale
        // (e.g. a Turkish default locale would otherwise map "I" to a dotless i).
        String normalised = tag.toLowerCase(Locale.ROOT).trim();
        if (normalised.isEmpty()) {
            // "".split(...) yields a single empty element, which would become an empty search term
            return null;
        }
        String[] parts = normalised.split("\\s+");
        if (parts.length > MAX_WORDS_PER_TAG) {
            return null;
        }
        StringBuilder term = new StringBuilder(parts[0]);
        for (int i = 1; i < parts.length; i++) {
            term.append('_').append(parts[i]);
        }
        return term.toString();
    }

    /**
     * Suggest clusters for a user from a set of tags.
     * <p>
     * Each usable tag is searched in the semantic vectors store, the scores of sufficiently good results are summed
     * per dimension id, and the selected dimensions are returned as clusters whose weights are scaled relative to
     * the first selected dimension and capped at {@link #MAX_WEIGHT}.
     * <p>
     * The store lookup is currently disabled (see the TODO in the body), so at present this method always returns
     * an empty list.
     *
     * @param userId id of the user the clusters are suggested for
     * @param tags   tags describing the user; {@code null} or empty yields an empty result
     * @return the suggested clusters, never {@code null}
     */
    public List<UserCluster> suggestClusters(long userId, Set<String> tags) {
        //TODO code needs to be Springified so static method below can be removed
        SemanticVectorsStore sem = null;
        // sem = SemanticVectorsManager.getManager().getStore(client,SemanticVectorsManager.SV_CLUSTER_NEW_LOC_PATTERN,ctxt);
        if (sem == null || tags == null || tags.isEmpty()) {
            return new ArrayList<>();
        }
        Map<Long, Double> scores = collectScores(userId, tags, sem);
        return buildClusters(userId, scores);
    }

    /**
     * Search the store once per usable tag and sum, per dimension id, the scores of all results that score above
     * {@link #MIN_RESULT_SCORE}.
     */
    private Map<Long, Double> collectScores(long userId, Set<String> tags, SemanticVectorsStore sem) {
        Map<Long, Double> scores = new HashMap<>();
        for (String tag : tags) {
            logger.info("For user " + userId + " checking [" + tag + "]");
            String searchTerm = getSearchTerm(tag);
            if (searchTerm == null) {
                continue;
            }
            logger.info("Will search with term " + searchTerm);
            ArrayList<SemVectorResult<Long>> results = new ArrayList<>();
            sem.searchDocsUsingTermQuery(searchTerm, results, new DocumentIdTransform(), new StringTransform(),
                    NUM_CLUSTERS_PER_SEARCH);
            for (SemVectorResult<Long> r : results) {
                Long dimId = r.getResult();
                double score = r.getScore();
                logger.info("Searching [" + searchTerm + "] dim_id:" + dimId + " score:" + score);
                if (score > MIN_RESULT_SCORE) {
                    logger.info("Adding dim_id " + dimId + " with score " + score);
                    Double soFar = scores.get(dimId);
                    scores.put(dimId, soFar == null ? score : soFar + score);
                }
            }
        }
        return scores;
    }

    /**
     * Convert accumulated dimension scores into clusters. The score of the first id returned by
     * {@code CollectionTools.sortMapAndLimitToList} is used as the reference (presumably the highest score --
     * confirm against CollectionTools), so that dimension receives {@link #MAX_WEIGHT} and the others are
     * weighted proportionally to it.
     */
    private List<UserCluster> buildClusters(long userId, Map<Long, Double> scores) {
        List<UserCluster> clusters = new ArrayList<>();
        if (scores.isEmpty()) {
            return clusters;
        }
        List<Long> best = CollectionTools.sortMapAndLimitToList(scores, NUM_CLUSTERS_RETURNED);
        if (best.isEmpty()) {
            return clusters;
        }
        double bestScore = scores.get(best.get(0));
        for (Long id : best) {
            double score = scores.get(id);
            logger.info("Choosing dim_id:" + id + " with score " + score);
            double weight = MAX_WEIGHT * (score / bestScore);
            clusters.add(new UserCluster(userId, id.intValue(), weight, 0L, 0));
        }
        return clusters;
    }
}