/*
* Seldon -- open source prediction engine
* =======================================
* Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/)
*
**********************************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************************************
*/
package io.seldon.stream.itemsim;
import io.seldon.stream.itemsim.MinHashCollections.State;
import io.seldon.stream.itemsim.minhash.Hasher;
import io.seldon.stream.itemsim.minhash.SimplePrimeHash;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Entry point for streaming item-to-item Jaccard similarity estimation.
 *
 * <p>Activity is recorded through {@link #add(long, long, long)} and forwarded to an
 * internal {@link MinHashCollections}; {@link #getSimilarity(long)} then compares the
 * min-hash state of every pair of items returned by that collection.
 */
public class StreamingJaccardSimilarity {

    /** Holds the per-item min-hash state; every public method delegates to it. */
    MinHashCollections mhcs;

    /**
     * Creates the hash functions and wires up the underlying min-hash collection.
     *
     * @param windowSizeSecs passed unchanged to {@link MinHashCollections}
     *                       (presumably the rolling window length in seconds -- confirm there)
     * @param numHashes      number of times {@link SimplePrimeHash#create} is invoked to
     *                       build the hash function set
     * @param minActivity    passed unchanged to {@link MinHashCollections}
     */
    public StreamingJaccardSimilarity(int windowSizeSecs,int numHashes,int minActivity)
    {
        // Each new hasher is created with the set of hashers built so far, then
        // added to that same set.
        Set<Hasher> hasherPool = new HashSet<>();
        for (int remaining = numHashes; remaining > 0; remaining--)
        {
            Hasher fresh = SimplePrimeHash.create(hasherPool);
            hasherPool.add(fresh);
        }

        MinHasherFactory factory = new RollingWindowedMinHashFactory(new ArrayList<Hasher>(hasherPool));
        this.mhcs = new MinHashCollections(factory, windowSizeSecs, minActivity);
    }

    /**
     * Records one (item, user, time) observation by delegating to the underlying collection.
     *
     * @param itemId   identifier of the item
     * @param userId   identifier of the user
     * @param timeSecs time of the observation as supplied by the caller
     */
    public void add(long itemId,long userId,long timeSecs)
    {
        mhcs.add(itemId, userId, timeSecs);
    }

    /**
     * Estimates the Jaccard similarity for every pair of states returned by
     * {@code getAllMinHashes(timeSecs)}.
     *
     * @param timeSecs time passed to the underlying collection when fetching the states
     * @return one entry per pair whose first id is strictly smaller than its second id
     *         and whose estimate is greater than zero
     */
    public List<JaccardSimilarity> getSimilarity(long timeSecs)
    {
        List<State> snapshot = mhcs.getAllMinHashes(timeSecs);
        List<JaccardSimilarity> pairs = new ArrayList<>();

        for (State left : snapshot)
        {
            for (State right : snapshot)
            {
                // Only keep the ordering where the smaller id comes first; this also
                // drops comparisons between states sharing the same id.
                if (left.id >= right.id)
                {
                    continue;
                }

                float estimate = left.jaccardEstimate(right);
                if (estimate > 0)
                {
                    pairs.add(new JaccardSimilarity(left.id, right.id, estimate));
                }
            }
        }

        return pairs;
    }
}