/* * Seldon -- open source prediction engine * ======================================= * Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/) * ********************************************************************************************** * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ********************************************************************************************** */ package io.seldon.stream.itemsim; import java.util.ArrayList; import java.util.List; import java.util.concurrent.ConcurrentHashMap; public class MinHashCollections { ConcurrentHashMap<Long,MinHashCollection> mhcs = new ConcurrentHashMap<>(); MinHasherFactory mhFactory; long window; int minActivity; public MinHashCollections(MinHasherFactory mhFactory,long window,int minActivity) { this.mhFactory = mhFactory; this.window = window; this.minActivity = minActivity; } public void add(long item,long user,long time) { if (!mhcs.containsKey(item)) mhcs.putIfAbsent(item, new MinHashCollection(mhFactory.create(window))); MinHashCollection mhc = mhcs.get(item); mhc.add(user, time); } public List<State> getAllMinHashes(long time) { List<State> states = new ArrayList<>(); for(Long id : mhcs.keySet()) { int count = mhcs.get(id).getCount(time); if (count >= minActivity) { List<Long> mh = mhcs.get(id).getMinHashes(time); if (mh != null && mh.size() > 0) states.add(new State(id,mh)); } else if (count == 0) { mhcs.remove(id); //System.out.println("removed "+id); } } System.out.println("Raw number of minHashes "+mhcs.size()+" but will return "+states.size()+" with minActiviy filter at "+minActivity); return states; } public static class State { long id; List<Long> minHashes; public State(long id, List<Long> minHashes) { super(); this.id = id; this.minHashes = minHashes; } public float jaccardEstimate(State other) { int overlap = 0; for(int i=0;i<minHashes.size();i++) { if (minHashes.get(i).equals(other.minHashes.get(i))) overlap++; } return overlap/(float)minHashes.size(); } } }