package ch.unibe.scg.cc; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collection; import java.util.LinkedHashSet; import javax.inject.Inject; import com.google.common.base.Preconditions; import com.google.common.collect.Sets; import dk.brics.automaton.AutomatonMatcher; import dk.brics.automaton.RegExp; import dk.brics.automaton.RunAutomaton; class ShingleHasher implements Hasher { final static private long serialVersionUID = 1L; final private static int SHINGLE_LENGTH = 4; final private StandardHasher standardHasher; final private RunAutomaton shingleRegex = new RunAutomaton(new RegExp("[^\\ ]+\\ [^\\ ]+\\ [^\\ ]+\\ [^\\ ]+").toAutomaton()); @Inject ShingleHasher(StandardHasher md) { this.standardHasher = md; } Collection<String> shingles(String doc) throws CannotBeHashedException { Collection<String> ret = new ArrayList<>(); int start = 0; for (int i = 0; i < SHINGLE_LENGTH; i++) { AutomatonMatcher matcher = shingleRegex.newMatcher(doc, start, doc.length()); while (matcher.find()) { ret.add(matcher.group()); } start = doc.indexOf(' ', start + 1); if (start == -1) { throw new CannotBeHashedException(); } } return ret; } Iterable<ByteBuffer> hashedShingles(Collection<String> shingles) { // LinkedHashSet maintains order, but deletes duplicates. LinkedHashSet<ByteBuffer> hashed = Sets.newLinkedHashSetWithExpectedSize(shingles.size()); for (String shingle : shingles) { hashed.add(ByteBuffer.wrap(standardHasher.hash(shingle))); } return hashed; } /** * Use a quarter of all hashes. */ private byte[] sketchFromHashedShingles(Iterable<ByteBuffer> hashedShingles, String doc) { Preconditions.checkArgument(hashedShingles.iterator().hasNext(), "There was nothing to make a sketch from. Input:\n" + doc); final byte[] hash = new byte[standardHasher.md.getDigestLength()]; int mask = 0x7; // After the first shift, that'll give binary pattern 11. do { mask >>= 1; for (final ByteBuffer hashedShingleBuffer : hashedShingles) { final byte[] hashedShingle = hashedShingleBuffer.array(); if ((hashedShingle[0] & mask) != mask) { continue; } Utils.xor(hash, hashedShingle); } if (!isZero(hash)) { return hash; } } while (mask != 0); // In the last run, mask is zero. Zero must turn up a hash. throw new AssertionError("After mask was 0, there must be a hash. Input:\n" + doc); } @Override public byte[] hash(String doc) throws CannotBeHashedException { return sketchFromHashedShingles(hashedShingles(shingles(doc)), doc); } private boolean isZero(byte[] ary) { for (final byte element : ary) { if (element != 0) { return false; } } return true; } }