package eu.fbk.knowledgestore.datastore; import com.google.common.collect.Iterables; import eu.fbk.knowledgestore.data.Record; import eu.fbk.knowledgestore.data.Stream; import eu.fbk.knowledgestore.data.XPath; import eu.fbk.knowledgestore.runtime.*; import eu.fbk.knowledgestore.vocabulary.KS; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.store.FSDirectory; import org.openrdf.model.URI; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.io.*; import java.nio.file.Files; import java.nio.file.Paths; import java.util.*; import java.util.concurrent.atomic.AtomicInteger; /* * * TODO * - Optimize Lucene in background * - Sometimes the UI says that the IndexReader is closed * - Manage read only mode * - Introduce ReentrantReadWriteLock * http://docs.oracle.com/javase/7/docs/api/java/util/concurrent/locks/ReentrantReadWriteLock.html * http://stackoverflow.com/questions/18354339/reentrantreadwritelock-whats-the-difference-between-readlock-and-writelock * * */ /** * Created with IntelliJ IDEA. * User: alessio * Date: 08/09/14 * Time: 17:57 * To change this template use File | Settings | File Templates. */ public class LuceneDataStore implements DataStore { private String mentionsFolder; private String resourcesFolder; private HashMap<URI, IndexReader> readers = new HashMap<>(); private HashMap<URI, IndexWriter> writers = new HashMap<>(); private HashMap<URI, AtomicInteger> writingOperations = new HashMap<>(); private static final long DEFAULT_CLEANUP_PERIOD = 10000L; // 5s private final long cleanupPeriod; private SerializerAvro serializer; private final int MAX_LUCENE_SEGMENTS = 100; private static final HashMap<URI, Integer> OPTIMIZATION_THRESHOLD = new HashMap<>(); static { OPTIMIZATION_THRESHOLD.put(KS.RESOURCE, 1000); OPTIMIZATION_THRESHOLD.put(KS.MENTION, 10000); } static Logger logger = LoggerFactory.getLogger(LuceneDataStore.class); public LuceneDataStore(String folder, @Nullable SerializerAvro serializer) { this(folder, serializer, null); } public LuceneDataStore(String folder, @Nullable SerializerAvro serializer, @Nullable final Long cleanupPeriod) { this.mentionsFolder = folder + File.separator + "mentions"; this.resourcesFolder = folder + File.separator + "resources"; this.serializer = serializer; this.cleanupPeriod = cleanupPeriod != null ? cleanupPeriod : DEFAULT_CLEANUP_PERIOD; } public static byte[] serializeRecord(Record record, @Nullable SerializerAvro serializer) throws IOException { if (serializer == null) { ObjectOutput out = null; byte[] returnBytes; try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) { out = new ObjectOutputStream(bos); out.writeObject(record); returnBytes = bos.toByteArray(); } finally { if (out != null) { out.close(); } } return returnBytes; } else { return serializer.toBytes(record); } } public static Record unserializeRecord(byte[] bytes, @Nullable SerializerAvro serializer) throws IOException { if (serializer == null) { ObjectInput in = null; Record returnRecord; try (ByteArrayInputStream bis = new ByteArrayInputStream(bytes)) { in = new ObjectInputStream(bis); try { returnRecord = (Record) in.readObject(); } catch (ClassNotFoundException e) { throw new IOException(e); } } finally { if (in != null) { in.close(); } } return returnRecord; } else { return (Record) serializer.fromBytes(bytes); } } public class LuceneTransaction implements DataTransaction { boolean readOnly; public static final String KEY_NAME = "key"; public static final String VALUE_NAME = "value"; public LuceneTransaction(boolean readOnly) throws IOException { this.readOnly = readOnly; // todo // readOnly = false; } private void optimize(URI type) throws IOException { if (!readers.get(type).isOptimized()) { synchronized (writingOperations.get(type)) { if (writingOperations.get(type).intValue() > OPTIMIZATION_THRESHOLD.get(type)) { logger.info("Optimizing index {}", type.toString()); writers.get(type).optimize(MAX_LUCENE_SEGMENTS); writingOperations.get(type).set(0); } } } readers.get(type).close(); readers.put(type, writers.get(type).getReader()); } @Override public Stream<Record> lookup(URI type, Set<? extends URI> ids, @Nullable Set<? extends URI> properties) throws IOException, IllegalArgumentException, IllegalStateException { optimize(type); List<Record> returns = new ArrayList<>(); for (URI id : ids) { String uri; try { uri = id.toString(); } catch (NullPointerException e) { throw new IOException(e); } logger.debug("Selecting {}", uri); Term s = new Term(KEY_NAME, uri); TermDocs termDocs = readers.get(type).termDocs(s); if (termDocs.next()) { Document doc = readers.get(type).document(termDocs.doc()); Record r = unserializeRecord(doc.getBinaryValue(VALUE_NAME), serializer); if (properties != null && !properties.isEmpty()) { r.retain(Iterables.toArray(properties, URI.class)); } returns.add(r); } } return Stream.create(returns); } @Override public Stream<Record> retrieve(URI type, @Nullable XPath condition, @Nullable Set<? extends URI> properties) throws IOException, IllegalArgumentException, IllegalStateException { optimize(type); List<Record> returns = new ArrayList<>(); for (int i = 0; i < readers.get(type).numDocs(); i++) { Document doc = readers.get(type).document(i); Record r = unserializeRecord(doc.getBinaryValue(VALUE_NAME), serializer); if (condition != null && !condition.evalBoolean(r)) { continue; } if (properties != null) { r.retain(Iterables.toArray(properties, URI.class)); } returns.add(r); } return Stream.create(returns); } @Override public long count(URI type, @Nullable XPath condition) throws IOException, IllegalArgumentException, IllegalStateException { optimize(type); if (condition == null) { return readers.get(type).numDocs(); } else { Stream<Record> stream = retrieve(type, condition, null); return stream.count(); } } @Override public Stream<Record> match(Map<URI, XPath> conditions, Map<URI, Set<URI>> ids, Map<URI, Set<URI>> properties) throws IOException, IllegalStateException { return null; //To change body of implemented methods use File | Settings | File Templates. } @Override public void store(URI type, Record record) throws IOException, IllegalStateException { writingOperations.get(type).incrementAndGet(); IndexWriter indexWriter = writers.get(type); String uri; try { uri = record.getID().toString(); } catch (NullPointerException e) { throw new IOException(e); } logger.debug(String.format("Inserting %s", uri)); Document doc = new Document(); doc.add(new Field(KEY_NAME, uri, Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field(VALUE_NAME, serializeRecord(record, serializer), Field.Store.YES)); // use "update" instead of "add" to avoid duplicates indexWriter.updateDocument(new Term(KEY_NAME, uri), doc); } @Override public void delete(URI type, URI id) throws IOException, IllegalStateException { writingOperations.get(type).incrementAndGet(); Term s = new Term(KEY_NAME, id.toString()); writers.get(type).deleteDocuments(s); writers.get(type).commit(); optimize(type); } @Override public void end(boolean commit) throws DataCorruptedException, IOException, IllegalStateException { // Nothing to do here } } @Override public DataTransaction begin(boolean readOnly) throws DataCorruptedException, IOException, IllegalStateException { LuceneTransaction ret = new LuceneTransaction(readOnly); return ret; } @Override public void init() throws IOException, IllegalStateException { Files.createDirectories(Paths.get(mentionsFolder)); Files.createDirectories(Paths.get(resourcesFolder)); writingOperations.put(KS.RESOURCE, new AtomicInteger(0)); writingOperations.put(KS.MENTION, new AtomicInteger(0)); try { writers.put(KS.RESOURCE, new IndexWriter(FSDirectory.open(new File(resourcesFolder)), new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED)); writers.put(KS.MENTION, new IndexWriter(FSDirectory.open(new File(mentionsFolder)), new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED)); // writers.get(KS.RESOURCE).setUseCompoundFile(true); // writers.get(KS.MENTION).setUseCompoundFile(true); writers.get(KS.RESOURCE).optimize(MAX_LUCENE_SEGMENTS); writers.get(KS.MENTION).optimize(MAX_LUCENE_SEGMENTS); readers.put(KS.RESOURCE, writers.get(KS.RESOURCE).getReader()); readers.put(KS.MENTION, writers.get(KS.MENTION).getReader()); } catch (Exception e) { logger.error(e.getMessage()); } } @Override public void close() { try { readers.get(KS.RESOURCE).close(); readers.get(KS.MENTION).close(); writers.get(KS.RESOURCE).close(); writers.get(KS.MENTION).close(); } catch (Exception e) { logger.error(e.getMessage()); } } private void resetOperationCount(URI type) { writingOperations.get(type).set(0); } }