/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.data;
import java.io.File;
import com.addthis.basis.util.LessFiles;
import com.addthis.maljson.JSONArray;
import com.clearspring.analytics.stream.membership.BloomFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* <h1>MakeBloom</h1>
* <p/>
* <p>Create a Bloom Filter out of a string list and export it.</p>
*
*/
public class MakeBloom {
/** */
private static final Logger log = LoggerFactory.getLogger(MakeBloom.class);
private static final String fpRate = System.getProperty("fpRate", "0.001");
private static final double fp_rate = Double.parseDouble(fpRate);
// get all the quoted words from a "frag" file
public static String[] getWords(File in) throws java.io.IOException,
com.addthis.maljson.JSONException {
log.debug("Reading " + in.length() + " bytes from [" + in + "]");
JSONArray words = new JSONArray("[" + new String(LessFiles.read(in), "utf8") + "]");
log.debug("Read " + words.length() + " words from [" + in + "]");
String[] ret = new String[words.length()];
for (int i = 0; i < words.length(); i++) {
ret[i] = words.getString(i);
}
return ret;
}
public static void main(String[] args) throws java.io.IOException,
com.addthis.maljson.JSONException {
if (args.length != 1 && args.length != 2) {
throw new IllegalArgumentException("usage: MakeBloom word-list-file [bloom-file]");
}
File in = new File(args[0]);
String[] words = getWords(in);
BloomFilter bf = new BloomFilter(words.length, fp_rate);
log.debug("Created: BloomFilter(" + bf.buckets() + " buckets, " +
bf.getHashCount() + " hashes); FP rate = " + fp_rate);
for (int i = 0; i < words.length; i++) {
bf.add(words[i]);
}
log.debug("Added words");
File out = args.length == 2 ? new File(args[1]) :
LessFiles.replaceSuffix(in, "-" + words.length + "-" + fpRate + ".bloom");
log.debug("Writing [" + out + "]");
LessFiles.write(out, org.apache.commons.codec.binary.Base64.encodeBase64(BloomFilter.serialize(bf)), false);
log.debug("Wrote " + out.length() + " bytes to [" + out + "]");
}
}