package edu.uncc.cs.watsonsim;
import java.io.IOException;
import java.lang.reflect.Type;
import java.nio.file.Paths;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.concurrent.ExecutionException;
import java.util.function.Function;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.QueryBuilder;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.gson.Gson;
import com.hp.hpl.jena.query.Dataset;
import com.hp.hpl.jena.tdb.TDBFactory;
/**
* The NLP toolkit needs several shared resources, like text search indices
* and database connections. Some can be shared between threads to save
* memory, others should be independent. Also, configuration parameters
* should all be entered in one place to keep it consistent between threads.
*
* So start a global environment by constructing it, and start a new thread
* by using the newThread() method of the environment.
*
* The public fields of the Environment are intended for internal use by all
* the NLP packages. Exercise great care before mutating anything.
*
* @author Sean Gallagher
*/
public class Environment extends Configuration {
	public final Database db;
	public final Dataset rdf;
	public final IndexSearcher lucene;
	private final QueryBuilder lucene_query_builder = new QueryBuilder(new StandardAnalyzer());
	// Gson instances are thread-safe and reusable; share one rather than
	// allocating a new parser/serializer on every kv_cache access.
	private static final Gson GSON = new Gson();
	// Short-lived in-memory memo of recent Lucene hits. softValues() lets the
	// GC reclaim entries under memory pressure, on top of the hard size cap.
	// NOTE(review): this cache is static but the IndexSearcher is per-instance;
	// assumes all Environments share one lucene_index — confirm.
	private static final Cache<String, ScoreDoc[]> recent_lucene_searches =
			CacheBuilder.newBuilder()
			.concurrencyLevel(50)
			.softValues()
			.maximumSize(1000)
			.build();
	public final Log log = new Log(getClass(), System.out::println);
	/**
	 * Create a (possibly) shared NLP environment. The given data directory
	 * must be created (usually from a downloaded zipfile, check the README).
	 * Expect many open files and many reads. Network filesystems are known to
	 * perform poorly as data directories. Strive to use a local directory if
	 * possible, or at least the Lucene indices otherwise.
	 *
	 * config.properties can be either in the data directory or the working
	 * directory. This is to allow sharing (read-only) indices while still
	 * allowing separate development configurations.
	 *
	 * @throws RuntimeException if the Lucene index cannot be opened; the
	 *         underlying IOException is attached as the cause.
	 */
	public Environment() {
		// Now do some per-thread setup
		db = new Database(this);
		rdf = TDBFactory.assembleDataset(
				pathMustExist("rdf/jena-lucene.ttl"));
		// Lucene indexes have huge overhead so avoid re-instantiating by putting them in the Environment
		IndexReader reader;
		try {
			reader = DirectoryReader.open(new MMapDirectory(Paths.get(getConfOrDie("lucene_index"))));
		} catch (IOException e) {
			// Chain the cause so the original I/O stack trace survives the rethrow.
			throw new RuntimeException("The candidate-answer Lucene index failed to open.", e);
		}
		lucene = new IndexSearcher(reader);
		//lucene.setSimilarity(new BM25Similarity());
	}
	/**
	 * Run a vanilla boolean Lucene query, memoized in a small in-memory cache.
	 * @param query Terms to query lucene with, using SHOULD (a kind of OR)
	 * @param count The maximum number of results to return
	 * @return An array of ScoreDocs; empty on trivial queries or search failure
	 */
	public ScoreDoc[] simpleLuceneQuery(String query, int count) {
		// Queries shorter than 3 chars are noise; skip both the cache and Lucene.
		if (query.length() < 3) return new ScoreDoc[0];
		// Key on both query and count: caching by query alone would let a
		// result truncated to a small count satisfy a later, larger request.
		// \u0000 cannot appear in a sane query, so the key is unambiguous.
		String cache_key = count + "\u0000" + query;
		try {
			return recent_lucene_searches.get(cache_key, () -> forcedSimpleLuceneQuery(query, count));
		} catch (ExecutionException e) {
			// Best-effort: log and degrade to "no results" rather than fail the caller.
			e.printStackTrace();
			return new ScoreDoc[0];
		}
	}
	/**
	 * Run a vanilla boolean Lucene query, bypassing the memo cache.
	 * @param query Terms to query lucene with, using SHOULD (a kind of OR)
	 * @param count The number of results to return
	 * @return An array of ScoreDocs; empty if the query parses to nothing
	 * @throws IOException if the index read fails
	 */
	private ScoreDoc[] forcedSimpleLuceneQuery(String query, int count) throws IOException {
		// createBooleanQuery returns null when the analyzer strips every term.
		Query bquery = lucene_query_builder.createBooleanQuery("text", query, Occur.SHOULD);
		if (bquery != null) {
			return lucene.search(bquery, count).scoreDocs;
		} else {
			return new ScoreDoc[0];
		}
	}
	/**
	 * Evaluate a function with a long term persistent cache. It's slower and
	 * more expensive than memcached but it is meant for very expensive
	 * functions like searching Bing.
	 *
	 * @param key The unique key used to find the cache entry
	 * @param func The function we are memoizing
	 * @param clazz The (generic) type used to deserialize func's JSON output
	 * @return Output of func(key), possibly loaded from the kv_cache table;
	 *         on any SQL failure, falls back to evaluating func directly
	 */
	public synchronized <X> X computeIfAbsent(String key,
			Function<String, X> func,
			Type clazz) {
		try {
			// Check cache. try-with-resources closes the statement and result
			// set on every path (the original leaked both on a cache hit).
			// created_on is selected but currently unused.
			try (PreparedStatement general_cache_check = db.prep(
					"SELECT value, created_on FROM kv_cache "
					+ "WHERE (key=?);")) {
				general_cache_check.setString(1, key);
				try (ResultSet result = general_cache_check.executeQuery()) {
					if (result.next()) {
						// Load cache
						return GSON.fromJson(result.getString(1), clazz);
					}
				}
			}
			// Fill cache
			try (PreparedStatement set_cache = db.prep(
					"INSERT INTO kv_cache (key, value) VALUES (?,?);")) {
				X value = func.apply(key);
				set_cache.setString(1, key);
				set_cache.setString(2, GSON.toJson(value));
				set_cache.executeUpdate();
				return value;
			}
		} catch (SQLException e) {
			// Oh no! Back to just evaluating.
			e.printStackTrace();
			return func.apply(key);
		}
	}
}