package edu.isi.karma.kr2rml.writer; import java.io.IOException; import java.nio.charset.Charset; import java.util.Map.Entry; import java.util.concurrent.ConcurrentHashMap; import org.apache.hadoop.util.bloom.Key; import org.apache.hadoop.util.hash.Hash; import org.json.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import edu.isi.karma.kr2rml.mapping.R2RMLMappingIdentifier; import edu.isi.karma.modeling.Uris; public class KR2RMLBloomFilterManager { private static final Logger LOG = LoggerFactory.getLogger(KR2RMLBloomFilterManager.class); private static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); protected ConcurrentHashMap<String, KR2RMLBloomFilter> idToBloomFilter; protected R2RMLMappingIdentifier mappingIdentifier; public KR2RMLBloomFilterManager(R2RMLMappingIdentifier mappingIdentifier) { idToBloomFilter = new ConcurrentHashMap<>(); this.mappingIdentifier = mappingIdentifier; } public KR2RMLBloomFilterManager(JSONObject serializedManager) throws IOException { idToBloomFilter = new ConcurrentHashMap<>(); String idsConcatenated = serializedManager.getString("ids"); String[] ids = idsConcatenated.split(","); for(String id : ids) { String base64EncodedBloomFilter = serializedManager.getString(id); KR2RMLBloomFilter bf = new KR2RMLBloomFilter(); bf.populateFromCompressedAndBase64EncodedString(base64EncodedBloomFilter); idToBloomFilter.put(id, bf); } this.mappingIdentifier = new R2RMLMappingIdentifier(serializedManager.getJSONObject("mappingIdentifier")); } public KR2RMLBloomFilter getBloomFilter(String id) { return idToBloomFilter.get(id); } public void addUriToBloomFilter(String id, String uri) { KR2RMLBloomFilter bf = null; if(!idToBloomFilter.containsKey(id)) { idToBloomFilter.putIfAbsent(id, new KR2RMLBloomFilter(KR2RMLBloomFilter.defaultVectorSize, KR2RMLBloomFilter.defaultnbHash, Hash.JENKINS_HASH)); } bf = idToBloomFilter.get(id); Key k = new Key(uri.getBytes(UTF8_CHARSET)); bf.add(k); return; } public JSONObject toJSON() { JSONObject filters = new JSONObject(); StringBuffer ids = new StringBuffer(); for(Entry<String, KR2RMLBloomFilter> entry : idToBloomFilter.entrySet()) { String key = entry.getKey(); KR2RMLBloomFilter bf = entry.getValue(); try { String base64EncodedCompressedSerializedBloomFilter = bf.compressAndBase64Encode(); filters.put(key, base64EncodedCompressedSerializedBloomFilter); } catch (IOException e) { LOG.error("Unable to append bloom filter for id: " +key); continue; } if(ids.length() != 0) { ids.append(","); } ids.append(entry.getKey()); } filters.put("ids", ids.toString()); filters.put("mappingIdentifier", mappingIdentifier.toJSON()); return filters; } public String toRDF() { StringBuilder builder = new StringBuilder(); for(Entry<String, KR2RMLBloomFilter> entry : idToBloomFilter.entrySet()) { KR2RMLBloomFilter bf = entry.getValue(); String key = entry.getKey(); StringBuilder tripleBuilder = new StringBuilder(); tripleBuilder.append("<"); tripleBuilder.append(key); tripleBuilder.append("> <"); tripleBuilder.append(Uris.KM_HAS_BLOOMFILTER); tripleBuilder.append("> \""); try { String base64EncodedCompressedSerializedBloomFilter = bf.compressAndBase64Encode(); tripleBuilder.append(base64EncodedCompressedSerializedBloomFilter); } catch (IOException e) { LOG.error("Unable to append bloom filter for id: " + key); continue; } tripleBuilder.append("\" . \n"); builder.append(tripleBuilder); } return builder.toString(); } }