/* * Seldon -- open source prediction engine * ======================================= * Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/) * ********************************************************************************************** * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ********************************************************************************************** */ package io.seldon.stream.itemsim; import io.seldon.stream.itemsim.MinHashCollections.State; import io.seldon.stream.itemsim.minhash.Hasher; import io.seldon.stream.itemsim.minhash.SimplePrimeHash; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Random; import java.util.Set; import org.junit.Assert; import org.junit.Test; public class MinHashCollectionsTest { @Test public void test() { Set<Hasher> existing = new HashSet<>(); for(int i=0;i<100;i++) { Hasher h = SimplePrimeHash.create(existing); existing.add(h); } List<Hasher> hashes = new ArrayList<Hasher>(existing); MinHasherFactory f = new RollingWindowedMinHashFactory(hashes); MinHashCollections mhcs = new MinHashCollections(f, 100, 0); for(int i=1;i<100;i++) { mhcs.add(1, i, i); if (i>50) mhcs.add(2, i, i); } List<State> states = mhcs.getAllMinHashes(10); for(State s1 : states) for(State s2 : states) { if (s1.id < s2.id) { float jaccard = s1.jaccardEstimate(s2); Assert.assertEquals(0.5, jaccard, 0.15); System.out.println(""+s1.id+"->"+s2.id+" "+jaccard); } } } @Test public void testMinActivity() { Set<Hasher> existing = new HashSet<>(); for(int i=0;i<100;i++) { Hasher h = SimplePrimeHash.create(existing); existing.add(h); } List<Hasher> hashes = new ArrayList<Hasher>(existing); MinHasherFactory f = new RollingWindowedMinHashFactory(hashes); MinHashCollections mhcs = new MinHashCollections(f, 100, 200); for(int i=1;i<100;i++) { mhcs.add(1, i, i); if (i>50) mhcs.add(2, i, i); } List<State> states = mhcs.getAllMinHashes(10); Assert.assertEquals(0, states.size()); } @Test public void speedTest() { Set<Hasher> existing = new HashSet<>(); for(int i=0;i<100;i++) { Hasher h = SimplePrimeHash.create(existing); existing.add(h); } List<Hasher> hashes = new ArrayList<Hasher>(existing); MinHasherFactory f = new RollingWindowedMinHashFactory(hashes); MinHashCollections mhcs = new MinHashCollections(f, 10000, 0); Random r = new Random(); long s1 = System.currentTimeMillis(); int timeSteps = 100000; for(int i=1;i<timeSteps;i++) { int itemId = r.nextInt(200); int userId = r.nextInt(999999); mhcs.add(itemId, userId, i); } long s2 = System.currentTimeMillis(); long t = s2-s1; double reqPerSec = timeSteps/(t/1000.0); System.out.println("time:"+t+" per sec "+reqPerSec); } }