package edu.isi.karma.rdf.bloom;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Field;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.hash.Hash;

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import com.uwyn.jhighlight.tools.FileUtils;

import edu.isi.karma.er.helper.BloomFilterTripleStoreUtil;
import edu.isi.karma.kr2rml.writer.KR2RMLBloomFilter;
import edu.isi.karma.rdf.CommandLineArgumentParser;
import edu.isi.karma.webserver.KarmaException;

/**
 * Command-line tool that reads Turtle files from a directory, combines the
 * bloom filter values found in them per subject, and writes the combined
 * filters to a triple store.
 *
 * <p>Every statement whose predicate contains {@code "hasBloomFilter"} is
 * handed to a {@link BloomFilterWorker} dedicated to the statement's subject.
 * Once all files are read, the per-subject filters are collected, pushed to
 * the triple store, read back, and checked; if the stored filters are missing
 * bits, the update is issued a second time.</p>
 *
 * <p>Required options: {@code --filepath}, {@code --triplestoreurl},
 * {@code --context}.</p>
 */
public class CombineBloomFiltersFromRDF {

	static String filepath;
	static String triplestoreURL;
	static String context;
	static String predicateURI = "http://isi.edu/integration/karma/dev#hasBloomFilter";

	/**
	 * Entry point.
	 *
	 * @param args command-line arguments; see {@link #createCommandLineOptions()}
	 * @throws IOException if an input file cannot be opened or read
	 * @throws KarmaException declared for the Karma helper calls made here
	 */
	public static void main(String[] args) throws IOException, KarmaException {
		Options options = createCommandLineOptions();
		CommandLine cl = CommandLineArgumentParser.parse(args, options, CombineBloomFiltersFromRDF.class.getSimpleName());
		if (cl == null) {
			return;
		}
		filepath = cl.getOptionValue("filepath");
		triplestoreURL = cl.getOptionValue("triplestoreurl");
		context = cl.getOptionValue("context");
		if (filepath == null || triplestoreURL == null || context == null) {
			System.err.println("filepath, triplestoreurl and context are all required");
			return;
		}

		File file = new File(filepath);
		if (!file.isDirectory()) {
			System.err.println("filepath is not a directory: " + filepath);
			return;
		}

		long start = System.currentTimeMillis();
		File[] files = file.listFiles();
		if (files == null) {
			// listFiles() returns null when the directory cannot be listed.
			throw new IOException("Unable to list directory: " + filepath);
		}

		Map<String, BloomFilterWorker> workers = startWorkers(files);
		Map<String, KR2RMLBloomFilter> bfs = collectFilters(workers);

		BloomFilterTripleStoreUtil utilObj = new BloomFilterTripleStoreUtil();
		Set<String> triplemaps = bfs.keySet();
		Map<String, String> bloomfilterMapping = new HashMap<>();
		bloomfilterMapping.putAll(utilObj.getBloomFiltersForMaps(triplestoreURL, context, triplemaps));
		utilObj.updateTripleStoreWithBloomFilters(bfs, bloomfilterMapping, triplestoreURL, context);
		System.out.println("process time: " + (System.currentTimeMillis() - start));

		// Read the filters back and make sure nothing we computed got lost.
		Map<String, String> verification = new HashMap<>();
		verification.putAll(utilObj.getBloomFiltersForMaps(triplestoreURL, context, triplemaps));
		if (!storedFiltersCoverLocal(verification, bfs)) {
			utilObj.updateTripleStoreWithBloomFilters(bfs, verification, triplestoreURL, context);
		}
	}

	/**
	 * Parses each regular file that has an extension as Turtle and feeds every
	 * bloom filter statement to a worker keyed by the statement's subject.
	 * A worker thread is started the first time a subject is seen.
	 *
	 * @param files directory entries to scan
	 * @return the started workers, keyed by subject
	 * @throws IOException if a file cannot be opened or closed
	 */
	private static Map<String, BloomFilterWorker> startWorkers(File[] files) throws IOException {
		Map<String, BloomFilterWorker> workers = new HashMap<>();
		for (File f : files) {
			// Sub-directories cannot be opened as streams; skip them along with
			// entries for which no extension is reported.
			if (!f.isFile() || FileUtils.getExtension(f.getName()) == null) {
				continue;
			}
			Model model = ModelFactory.createDefaultModel();
			try (InputStream s = new FileInputStream(f)) {
				model.read(s, null, "TURTLE");
			}
			StmtIterator iterator = model.listStatements();
			while (iterator.hasNext()) {
				Statement st = iterator.next();
				String predicate = st.getPredicate().toString();
				if (!predicate.contains("hasBloomFilter")) {
					continue;
				}
				String subject = st.getSubject().toString();
				String object = st.getObject().toString();
				BloomFilterWorker worker = workers.get(subject);
				if (worker == null) {
					worker = new BloomFilterWorker();
					workers.put(subject, worker);
					new Thread(worker).start();
				}
				worker.addBloomfilters(object);
			}
		}
		return workers;
	}

	/**
	 * Signals every worker that no more input is coming, waits until each one
	 * reports it has finished, and gathers the resulting filters.
	 *
	 * @param workers started workers keyed by subject
	 * @return the combined filter of each worker, keyed by subject
	 */
	private static Map<String, KR2RMLBloomFilter> collectFilters(Map<String, BloomFilterWorker> workers) {
		// Mark all workers done first so they can drain in parallel.
		for (BloomFilterWorker worker : workers.values()) {
			worker.setDone();
		}
		Map<String, KR2RMLBloomFilter> bfs = new HashMap<>();
		for (Entry<String, BloomFilterWorker> entry : workers.entrySet()) {
			BloomFilterWorker worker = entry.getValue();
			while (!worker.isFinished()) {
				// Give the worker threads a chance to run instead of spinning flat out.
				Thread.yield();
			}
			bfs.put(entry.getKey(), worker.getKR2RMLBloomFilter());
		}
		return bfs;
	}

	/**
	 * Checks that every bit set in a locally computed filter is also set in the
	 * corresponding filter read back from the triple store.
	 *
	 * <p>For each stored filter {@code s} and local filter {@code l} this
	 * computes {@code (s AND l) XOR l}, which is non-empty exactly when
	 * {@code l} has a bit that {@code s} lacks. The bit vector is read through
	 * reflection on the private {@code bits} field of {@link BloomFilter}; if
	 * that read fails the entry is skipped and a warning is printed.</p>
	 *
	 * @param stored encoded filters read back from the triple store, keyed by subject
	 * @param bfs locally computed filters, keyed by subject
	 * @return {@code false} as soon as one stored filter is found to be missing bits
	 * @throws IOException declared for decoding of the stored filter string
	 * @throws KarmaException declared for the Karma helper calls made here
	 */
	private static boolean storedFiltersCoverLocal(Map<String, String> stored, Map<String, KR2RMLBloomFilter> bfs)
			throws IOException, KarmaException {
		for (Entry<String, String> entry : stored.entrySet()) {
			String key = entry.getKey();
			KR2RMLBloomFilter local = bfs.get(key);
			KR2RMLBloomFilter remote = new KR2RMLBloomFilter(KR2RMLBloomFilter.defaultVectorSize,
					KR2RMLBloomFilter.defaultnbHash, Hash.JENKINS_HASH);
			remote.populateFromCompressedAndBase64EncodedString(entry.getValue());
			remote.and(local);
			remote.xor(local);
			try {
				Field f = BloomFilter.class.getDeclaredField("bits");
				f.setAccessible(true);
				BitSet bits = (BitSet) f.get(remote);
				if (bits.cardinality() != 0) {
					return false;
				}
			} catch (ReflectiveOperationException | RuntimeException e) {
				// Verification is best-effort: report the problem and move on.
				System.err.println("Unable to verify bloom filter for " + key + ": " + e);
			}
		}
		return true;
	}

	/**
	 * Builds the command-line options understood by this tool.
	 *
	 * @return options {@code filepath}, {@code triplestoreurl} and
	 *         {@code context} (each taking a value) plus the {@code help} flag
	 */
	private static Options createCommandLineOptions() {
		Options options = new Options();
		// filepath must accept an argument; otherwise its value is always null.
		options.addOption(new Option("filepath", "filepath", true, "location of the input file directory"));
		options.addOption(new Option("triplestoreurl", "triplestoreurl", true, "location of the triplestore"));
		options.addOption(new Option("context", "context", true, "the context uri"));
		options.addOption(new Option("help", "help", false, "print this message"));
		return options;
	}
}