/*******************************************************************************
* Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package eu.project.ttc.models.occstore;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Executor;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang.mutable.MutableInt;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.WriteConcern;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.BulkWriteOptions;
import com.mongodb.client.model.Filters;
import com.mongodb.client.model.UpdateOneModel;
import com.mongodb.client.model.UpdateOptions;
import com.mongodb.client.model.Updates;
import com.mongodb.client.model.WriteModel;
import eu.project.ttc.models.Document;
import eu.project.ttc.models.OccurrenceStore;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermOccurrence;
import eu.project.ttc.models.index.selectors.FrequencyUnderThreshholdSelector;
import eu.project.ttc.models.index.selectors.TermSelector;
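/**
 * An {@link OccurrenceStore} implementation backed by MongoDB.
 * <p>
 * While the store is in the COLLECTING state, occurrences are accumulated in
 * in-memory buffers and periodically pushed to the "occurrences", "terms" and
 * "documents" collections by {@link #flush()}, the actual writes running
 * asynchronously on a bounded thread pool. {@link #makeIndex()} finalizes the
 * store (orphan occurrence cleanup and index creation) and switches it to the
 * INDEXED state, after which occurrences can be read back term by term.
 * <p>
 * Minimal usage sketch ({@code term} and {@code extractedOccurrences} are
 * assumed to come from the surrounding term extraction pipeline; the database
 * name is only an example):
 * <pre>
 * MongoDBOccurrenceStore store = new MongoDBOccurrenceStore("my-corpus-occurrences");
 * for (TermOccurrence occ : extractedOccurrences)
 *     store.addOccurrence(term, occ);
 * store.flush();      // push buffered writes (makeIndex() also flushes)
 * store.makeIndex();  // COLLECTING -&gt; INDEXED
 * for (TermOccurrence occ : store.getOccurrences(term))
 *     System.out.println(occ.getCoveredText());
 * store.close();
 * </pre>
 */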
public class MongoDBOccurrenceStore implements OccurrenceStore {
private static final Logger LOGGER = LoggerFactory.getLogger(MongoDBOccurrenceStore.class);
/**
* A monitor for {@link Executor}.
*
* @author Damien Cram
*
*/
public class MyMonitorThread implements Runnable
{
private BlockingThreadPoolExecutor executor;
private int seconds;
private volatile boolean run = true; // volatile: the flag is written by shutdown() from another thread
public MyMonitorThread(BlockingThreadPoolExecutor executor, int delay)
{
this.executor = executor;
this.seconds=delay;
}
public void shutdown(){
this.run=false;
}
@Override
public void run()
{
while(run){
log();
try {
Thread.sleep(seconds*1000);
} catch (InterruptedException e) {
// An interrupt is a request to stop: restore the flag and exit the monitor loop
Thread.currentThread().interrupt();
return;
}
}
}
public void log() {
LOGGER.info(
String.format("[ThreadPoolExecutor monitor] [%d/%d] Active: %d, Queued: %d, Completed: %d, Task: %d, isShutdown: %s, isTerminated: %s",
this.executor.getPoolSize(),
this.executor.getCorePoolSize(),
this.executor.getActiveCount(),
this.executor.getQueue().size(),
this.executor.getCompletedTaskCount(),
this.executor.getTaskCount(),
this.executor.isShutdown(),
this.executor.isTerminated()));
}
}
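/*
 * Short field names used in the BSON documents of the "terms", "occurrences"
 * and "documents" collections (kept to a few characters, presumably to limit
 * the size of the very numerous occurrence documents).
 */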
private static final String _ID = "_id";
private static final String DOC_ID = "did";
private static final String BEGIN = "ob";
private static final String END = "oe";
private static final String DOC_URL = "url";
protected static final String COVERED_TEXT = "t";
private static final String FREQUENCY = "f";
private static final String TERM_ID = "tid";
private MongoClientURI mongoDBUri;
private State state;
private MongoCollection<org.bson.Document> termCollection;
private MongoCollection<org.bson.Document> occurrenceCollection;
private MongoCollection<org.bson.Document> documentUrlCollection;
private Map<Integer, String> documentsUrls;
private List<TermOccurrence> occurrencesBuffer;
private Map<Term, MutableInt> termsBuffer;
private BlockingThreadPoolExecutor executor;
private MyMonitorThread monitor;
private MongoClient mongoClient;
public MongoDBOccurrenceStore(String dbURI) {
this(dbURI, State.COLLECTING);
}
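/**
 * Creates the store for the given MongoDB URI and initial state.
 * The URI may be a full {@code mongodb://} connection string or a bare
 * database name (resolved against {@code localhost:27017}). Only the
 * COLLECTING and INDEXED states are accepted; when COLLECTING, any existing
 * database with that name is dropped.
 *
 * @param mongoDbUri the MongoDB connection string or database name
 * @param state the initial state of the store
 */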
public MongoDBOccurrenceStore(String mongoDbUri, State state) {
super();
Preconditions.checkNotNull(mongoDbUri, "MongoDB database's URI must not be null");
Preconditions.checkState(
state != State.INDEXING,
"Invalid occ store state for constructor. Only " + State.COLLECTING + " and " + State.INDEXED + " allowed"
);
this.mongoDBUri = getMongoDBUri(mongoDbUri);
this.state = state;
initThreadExecutor();
this.mongoClient = new MongoClient(this.mongoDBUri);
MongoDatabase db = mongoClient.getDatabase(this.mongoDBUri.getDatabase())
.withWriteConcern(WriteConcern.ACKNOWLEDGED);
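// Set the database's profiling level to 1 (profile slow operations only)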
db.runCommand(new org.bson.Document("profile", 1));
if(state == State.COLLECTING)
db.drop();
this.termCollection = db.getCollection("terms");
this.occurrenceCollection = db.getCollection("occurrences");
this.documentUrlCollection = db.getCollection("documents");
resetBuffers();
}
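/**
 * Sets up the thread pool that runs MongoDB writes asynchronously, plus a
 * monitor thread that logs the executor's state every few seconds. The
 * bounded queue of {@link BlockingThreadPoolExecutor} is expected to make
 * producers block when too many write tasks are pending.
 */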
private void initThreadExecutor() {
int blockingBound = 15; // the size of the blocking queue.
int maximumPoolSize = 10; // the maximum number of worker threads
executor = new BlockingThreadPoolExecutor(
0,
maximumPoolSize,
1, TimeUnit.SECONDS,
blockingBound);
monitor = new MyMonitorThread(executor, 5);
new Thread(monitor).start();
}
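/**
 * Interprets the argument either as a full {@code mongodb://} connection
 * string, or as a bare database name hosted on {@code localhost:27017}.
 */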
private MongoClientURI getMongoDBUri(String mongoDbUri) {
if(mongoDbUri.startsWith("mongodb://"))
return new MongoClientURI(mongoDbUri);
else
// mongoDbUri is a db name
return new MongoClientURI("mongodb://localhost:27017/" + mongoDbUri);
}
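/*
 * Clears the in-memory term, document and occurrence buffers; called from
 * the constructor and after every flush().
 */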
private void resetBuffers() {
this.termsBuffer = Maps.newHashMap();
this.documentsUrls = Maps.newHashMap();
this.occurrencesBuffer = Lists.newArrayList();
}
private void checkState(State state) {
if(state != this.state)
throw new IllegalStateException("Current state is " + this.state + ". Expected state: " + state);
}
@Override
public Iterator<TermOccurrence> occurrenceIterator(Term term) {
return getOccurrences(term).iterator();
}
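/*
 * Cache of Document objects, lazily loaded by document id from the
 * "documents" collection.
 */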
private LoadingCache<Integer, Document> documentCache = CacheBuilder.newBuilder()
.concurrencyLevel(1)
.maximumSize(10000)
.build(new CacheLoader<Integer, Document>() {
@Override
public Document load(Integer documentId) throws Exception {
org.bson.Document bsonDoc = documentUrlCollection.find(Filters.eq(_ID,documentId)).first();
return new Document(documentId, bsonDoc.getString(DOC_URL));
}
});
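/*
 * Cache of a term's occurrences, loaded on demand from the "occurrences"
 * collection (one query per term), with source documents resolved through
 * documentCache.
 */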
private LoadingCache<Term, List<TermOccurrence>> occurrenceCache = CacheBuilder.newBuilder()
.maximumSize(1000)
.build(new CacheLoader<Term, List<TermOccurrence>>() {
@Override
public List<TermOccurrence> load(Term term) throws Exception {
List<TermOccurrence> occurrences = Lists.newArrayList();
for(org.bson.Document occDoc:occurrenceCollection.find(Filters.eq(TERM_ID,term.getId()))) {
occurrences.add(new TermOccurrence(
term,
occDoc.getString(COVERED_TEXT),
documentCache.getUnchecked(occDoc.getInteger(DOC_ID)),
occDoc.getInteger(BEGIN),
occDoc.getInteger(END)
));
}
return occurrences;
}
});
@Override
public Collection<TermOccurrence> getOccurrences(Term term) {
checkState(State.INDEXED);
return occurrenceCache.getUnchecked(term);
}
@Override
public void addOccurrence(Term term, TermOccurrence e) {
checkState(State.COLLECTING);
documentsUrls.put(e.getSourceDocument().getId(), e.getSourceDocument().getUrl());
MutableInt mutableInt = termsBuffer.get(term);
if(mutableInt == null)
termsBuffer.put(term, new MutableInt(1));
else
mutableInt.increment();
occurrencesBuffer.add(e);
}
@Override
public void addAllOccurrences(Term term, Collection<TermOccurrence> c) {
for(TermOccurrence occ:c)
addOccurrence(term, occ);
}
@Override
public Type getStoreType() {
return Type.MONGODB;
}
@Override
public String getUrl() {
return mongoDBUri.getURI();
}
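/**
 * Pushes the buffered occurrences, document URLs and term frequency
 * increments to MongoDB. The writes are submitted to the executor and run
 * asynchronously; the in-memory buffers are reset immediately.
 */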
@Override
public void flush() {
// bulk write occurrences
final List<org.bson.Document> occDocuments = Lists.newArrayListWithCapacity(occurrencesBuffer.size());
for(TermOccurrence o:this.occurrencesBuffer) {
occDocuments.add(new org.bson.Document()
.append(TERM_ID, o.getTerm().getId())
.append(DOC_ID, o.getSourceDocument().getId())
.append(BEGIN, o.getBegin())
.append(END, o.getEnd())
.append(COVERED_TEXT, o.getCoveredText())
);
}
if(!occurrencesBuffer.isEmpty())
executor.execute(new Runnable(){
public void run() {
occurrenceCollection.insertMany(occDocuments);
}
});
// bulk write documents
final List<WriteModel<org.bson.Document>> documentUrlsOps = Lists.newArrayListWithCapacity(documentsUrls.size());
for(Map.Entry<Integer, String> e:this.documentsUrls.entrySet()) {
UpdateOneModel<org.bson.Document> w = new UpdateOneModel<org.bson.Document>(
Filters.eq(_ID, e.getKey()),
Updates.set(DOC_URL, e.getValue()),
new UpdateOptions().upsert(true));
documentUrlsOps.add(w);
}
if(!documentUrlsOps.isEmpty())
executor.execute(new Runnable(){
public void run() {
documentUrlCollection.bulkWrite(documentUrlsOps, new BulkWriteOptions().ordered(false));
}
});
// bulk write terms
final List<WriteModel<org.bson.Document>> termsOps = Lists.newArrayList();
for(Term t:termsBuffer.keySet()) {
UpdateOneModel<org.bson.Document> w = new UpdateOneModel<org.bson.Document>(
Filters.eq(_ID, t.getId()),
Updates.inc(FREQUENCY, termsBuffer.get(t).intValue()),
new UpdateOptions().upsert(true));
termsOps.add(w);
}
if(!termsOps.isEmpty())
executor.execute(new Runnable(){
public void run() {
termCollection.bulkWrite(termsOps, new BulkWriteOptions().ordered(false));
}
});
resetBuffers();
}
@Override
public State getCurrentState() {
return this.state;
}
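/**
 * Finalizes the store: flushes the remaining buffers, waits for all queued
 * writes, deletes occurrences whose term is no longer present in the
 * "terms" collection, creates an index on the occurrences' term id field
 * and switches the store to the INDEXED state.
 */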
@Override
public void makeIndex() {
LOGGER.info("Indexing the occurrence store");
this.state = State.INDEXING;
flush();
sync();
LOGGER.debug("Removing orphan occurrences");
Set<Integer> tids = Sets.newHashSet();
for(org.bson.Document term:termCollection.find())
tids.add(term.getInteger(_ID));
occurrenceCollection.deleteMany(Filters.nin(TERM_ID, tids));
LOGGER.debug("creating index occurrences.{}", TERM_ID);
occurrenceCollection.createIndex(new org.bson.Document().append(TERM_ID, 1));
LOGGER.debug("Created");
monitor.shutdown();
this.state = State.INDEXED;
}
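/**
 * Asynchronously removes the given term and all of its occurrences from
 * MongoDB.
 */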
@Override
public void removeTerm(final Term t) {
executor.execute(new Runnable(){
public void run() {
termCollection.deleteOne(new org.bson.Document(_ID, t.getId()));
occurrenceCollection.deleteMany(Filters.eq(TERM_ID, t.getId()));
}
});
}
@Override
public void close() {
sync();
monitor.shutdown();
mongoClient.close();
executor.shutdown();
}
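/**
 * Bulk-deletes terms selected by the given selector. Only frequency
 * thresholds ({@link FrequencyUnderThreshholdSelector}) are supported;
 * occurrences of the deleted terms are left in place and cleaned up later
 * by {@link #makeIndex()}.
 */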
@Override
public void deleteMany(TermSelector selector) {
if (selector instanceof FrequencyUnderThreshholdSelector) {
FrequencyUnderThreshholdSelector selector2 = (FrequencyUnderThreshholdSelector) selector;
sync();
Stopwatch sw = Stopwatch.createStarted();
termCollection.deleteMany(Filters.lt(FREQUENCY, selector2.getThreshhold()));
LOGGER.debug("Terms deleted in MongoDB in {}ms", sw.elapsed(TimeUnit.MILLISECONDS));
}
}
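/*
 * Blocks until the executor has drained its queue of write tasks, then asks
 * the MongoDB server to flush pending writes to disk (fsync).
 */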
private void sync() {
LOGGER.info("Synchronizing with executor and mongoDB server");
monitor.log();
LOGGER.debug("Waiting for executor to finished queued tasks");
Stopwatch sw = Stopwatch.createStarted();
executor.sync();
LOGGER.debug("Executor synchronized in {}ms", sw.elapsed(TimeUnit.MILLISECONDS));
monitor.log();
sw = Stopwatch.createStarted();
LOGGER.debug("Synchronizing with MongoDB server");
mongoClient.fsync(false);
LOGGER.debug("MongoDB synchronized in {}ms", sw.elapsed(TimeUnit.MILLISECONDS));
}
}