package edu.uncc.cs.watsonsim;

import java.io.IOException;
import java.lang.reflect.Type;
import java.nio.file.Paths;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.concurrent.ExecutionException;
import java.util.function.Function;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.QueryBuilder;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.gson.Gson;
import com.hp.hpl.jena.query.Dataset;
import com.hp.hpl.jena.tdb.TDBFactory;

/**
 * The NLP toolkit needs several shared resources, like text search indices
 * and database connections. Some can be shared between threads to save
 * memory; others should be independent. Also, configuration parameters
 * should all be entered in one place to keep them consistent between
 * threads.
 *
 * So start a global environment by constructing it, and start a new thread
 * by using the newThread() method of the environment.
 *
 * The public fields of the Environment are intended for internal use by all
 * the NLP packages. Exercise great care before mutating anything.
 *
 * @author Sean Gallagher
 */
public class Environment extends Configuration {
    public final Database db;
    public final Dataset rdf;
    public final IndexSearcher lucene;
    private final QueryBuilder lucene_query_builder =
            new QueryBuilder(new StandardAnalyzer());
    private static final Cache<String, ScoreDoc[]> recent_lucene_searches =
            CacheBuilder.newBuilder()
                .concurrencyLevel(50)
                .softValues()
                .maximumSize(1000)
                .build();
    public final Log log = new Log(getClass(), System.out::println);

    /**
     * Create a (possibly) shared NLP environment. The given data directory
     * must already be populated (usually from a downloaded zipfile; check
     * the README). Expect many open files and many reads. Network
     * filesystems are known to perform poorly as data directories. Strive
     * to use a local directory if possible, or at least keep the Lucene
     * indices local.
     *
     * config.properties can be in either the data directory or the working
     * directory. This is to allow sharing (read-only) indices while still
     * allowing separate development configurations.
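     *
     * A minimal usage sketch (assuming the data directory and
     * config.properties are in place as described in the README):
     *
     * <pre>{@code
     * Environment env = new Environment();  // heavyweight; share it
     * ScoreDoc[] hits = env.simpleLuceneQuery("kublai khan mongol empire", 10);
     * }</pre>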
     */
    public Environment() {
        // Now do some per-thread setup
        db = new Database(this);
        rdf = TDBFactory.assembleDataset(
                pathMustExist("rdf/jena-lucene.ttl"));
        // Lucene indices have huge overhead, so avoid re-instantiating
        // them by keeping them in the Environment.
        IndexReader reader;
        try {
            reader = DirectoryReader.open(
                    new MMapDirectory(Paths.get(getConfOrDie("lucene_index"))));
        } catch (IOException e) {
            e.printStackTrace();
            throw new RuntimeException(
                    "The candidate-answer Lucene index failed to open.", e);
        }
        lucene = new IndexSearcher(reader);
        //lucene.setSimilarity(new BM25Similarity());
    }

    /**
     * Run a vanilla boolean Lucene query, consulting a small in-memory
     * cache of recent results first.
     * @param query Terms to query Lucene with, using SHOULD (a kind of OR)
     * @param count The number of results to return
     * @return An array of ScoreDocs, empty on failure
     */
    public ScoreDoc[] simpleLuceneQuery(String query, int count) {
        if (query.length() < 3) return new ScoreDoc[0];
        try {
            // Note: the cache key is the query text alone, so a result
            // fetched with a smaller count may be returned for a later,
            // larger request.
            return recent_lucene_searches.get(query,
                    () -> forcedSimpleLuceneQuery(query, count));
        } catch (ExecutionException e) {
            e.printStackTrace();
            return new ScoreDoc[0];
        }
    }

    /**
     * Run a vanilla boolean Lucene query, bypassing the cache.
     * @param query Terms to query Lucene with, using SHOULD (a kind of OR)
     * @param count The number of results to return
     * @return An array of ScoreDocs
     * @throws IOException
     */
    private ScoreDoc[] forcedSimpleLuceneQuery(String query, int count)
            throws IOException {
        Query bquery = lucene_query_builder.createBooleanQuery(
                "text", query, Occur.SHOULD);
        if (bquery != null) {
            return lucene.search(bquery, count).scoreDocs;
        } else {
            return new ScoreDoc[0];
        }
    }

    /**
     * Evaluate a function with a long-term persistent cache. It's slower
     * and more expensive than memcached, but it is meant for very expensive
     * functions like searching Bing.
     *
     * @param key The unique key used to find the cache entry
     * @param func The function we are memoizing
     * @param clazz The type used to deserialize func's cached output
     * @return Output of func(key)
     */
    public synchronized <X> X computeIfAbsent(String key,
            Function<String, X> func, Type clazz) {
        try {
            // Check cache
            PreparedStatement general_cache_check = db.prep(
                    "SELECT value, created_on FROM kv_cache "
                    + "WHERE (key=?);");
            general_cache_check.setString(1, key);
            ResultSet result = general_cache_check.executeQuery();
            if (result.next()) {
                // Load from cache, releasing the statement before returning
                X value = new Gson().fromJson(result.getString(1), clazz);
                result.close();
                general_cache_check.close();
                return value;
            } else {
                result.close();
                general_cache_check.close();
                // Fill cache
                PreparedStatement set_cache = db.prep(
                        "INSERT INTO kv_cache (key, value) VALUES (?,?);");
                X value = func.apply(key);
                set_cache.setString(1, key);
                set_cache.setString(2, new Gson().toJson(value));
                set_cache.executeUpdate();
                set_cache.close();
                return value;
            }
        } catch (SQLException e) {
            // Oh no! Back to just evaluating.
            e.printStackTrace();
            return func.apply(key);
        }
    }
}
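/*
 * A minimal usage sketch for computeIfAbsent, the persistent key-value
 * cache above. Here `searchBing` is a hypothetical expensive
 * String -> List<String> function standing in for a real web-search
 * wrapper; it is not part of this codebase. TypeToken is
 * com.google.gson.reflect.TypeToken, used to name the generic type for
 * Gson deserialization.
 *
 *   Environment env = new Environment();
 *   Type listOfStrings = new TypeToken<List<String>>(){}.getType();
 *   List<String> urls = env.computeIfAbsent(
 *           "bing:kublai khan",   // unique cache key
 *           q -> searchBing(q),   // hypothetical expensive function
 *           listOfStrings);       // type for deserializing cached JSON
 */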