package org.wikibrain.cookbook.phrases;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashSet;
import java.util.Set;

/**
 * Command-line check that counts how many lines of a title list map to a
 * 64-bit hash value that an earlier line already produced.
 *
 * <p>Each title is digested with SHA-256 and the digest bytes are folded into
 * a single {@code long}. The program prints the number of distinct hash values
 * and the number of lines whose hash had already been seen.
 *
 * @author Matt Lesicko
 */
public class CheckHashCollisions {

    /** Title list read when no path is given on the command line. */
    private static final String DEFAULT_TITLES_FILE = "./enwiki-20130604-all-titles-in-ns0";

    /** Starting value of the polynomial fold over the digest bytes. */
    private static final long HASH_SEED = 1125899906842597L;

    /**
     * Creates a SHA-256 digest instance.
     *
     * @return a fresh SHA-256 {@link MessageDigest}
     * @throws IllegalStateException if the runtime provides no SHA-256 implementation
     */
    private static MessageDigest newSha256() {
        try {
            return MessageDigest.getInstance("SHA-256");
        } catch (NoSuchAlgorithmException e) {
            // Fail loudly instead of returning null: a null hash would silently
            // end up in the set whenever assertions are disabled.
            throw new IllegalStateException("SHA-256 MessageDigest is not available", e);
        }
    }

    /**
     * Hashes a title to a 64-bit value: SHA-256 over the title's UTF-8 bytes,
     * then a base-31 polynomial fold of the digest bytes starting at
     * {@link #HASH_SEED}.
     *
     * @param sha256 digest to use; {@code digest(byte[])} leaves it reset, so
     *               the same instance can be reused for every title
     * @param title  the title to hash
     * @return the folded 64-bit hash
     */
    private static long hash(MessageDigest sha256, String title) {
        // Explicit charset so the result does not depend on the platform default.
        byte[] digest = sha256.digest(title.getBytes(StandardCharsets.UTF_8));
        long h = HASH_SEED;
        for (byte b : digest) {
            h = 31 * h + b;
        }
        return h;
    }

    /**
     * Reads the title list line by line and prints
     * {@code "<distinct hashes> entries, <repeats> collisions"}.
     *
     * <p>A line counts as a collision whenever its hash is already in the set,
     * so a title that appears twice in the input is counted as well.
     *
     * @param args optional; {@code args[0]} is the path of the title list.
     *             When absent, {@link #DEFAULT_TITLES_FILE} is used.
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        String file = (args != null && args.length > 0) ? args[0] : DEFAULT_TITLES_FILE;

        MessageDigest sha256 = newSha256();
        Set<Long> titleHashes = new HashSet<Long>();
        int collisions = 0;

        // try-with-resources guarantees the reader is closed even if reading fails.
        try (BufferedReader titles = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            // Intentionally skipping the first line (presumably a header row;
            // verify against the dump format).
            titles.readLine();

            String title;
            while ((title = titles.readLine()) != null) {
                // Set.add returns false when the value was already present.
                if (!titleHashes.add(hash(sha256, title))) {
                    collisions++;
                }
            }
        }

        System.out.println(titleHashes.size() + " entries, " + collisions + " collisions");
    }
}