/**
*
* The index manager controls the lucene indexing system.
*
* @license
*
*/
package com.knowledgetree.indexer;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import java.util.Date;
import java.util.Properties;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
import java.beans.Beans;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.TokenGroup;
import org.apache.lucene.search.TermQuery;
import com.knowledgetree.lucene.KTLuceneServer;
/**
 * Singleton that owns the Lucene index used by the KnowledgeTree indexer.
 *
 * It keeps one shared {@link IndexReader}/{@link Searcher} pair for queries and
 * opens short-lived readers/writers for modifications. After every modification
 * the shared pair is reopened so that queries see the change.
 *
 * The {@link ReentrantReadWriteLock} guards the shared reader/searcher fields:
 * code that searches holds the read lock, code that replaces the pair holds the
 * write lock. The class also implements the highlighter {@link Formatter} so
 * that matched terms in result fragments are wrapped in bold tags.
 */
public class IndexerManager implements Formatter
{
    public static final String KnowledgeTreeLoggingProperties = "KnowledgeTreeIndexer.Logging.properties";

    /** The lazily created singleton instance, see {@link #get()}. */
    private static IndexerManager indexingManager;

    /** Shared reader used for query rewriting and statistics; replaced by reopenIndex(). */
    private IndexReader queryReader;
    /** Shared searcher built on top of queryReader; replaced together with it. */
    private Searcher querySearcher;
    /** Analyzer instantiated from the "indexer.analyzer" property. */
    private Analyzer analyzer;
    /** Guards queryReader / querySearcher. */
    private ReentrantReadWriteLock locker;
    private Logger logger;

    // configuration (defaults may be overridden by the properties file)
    private String indexDirectory = "../../../var/indexes";
    private String propertiesFilename = "KnowledgeTreeIndexer.properties";
    private String clientIps = "127.0.0.1";
    private int maxQueryResult = 1000;
    private int resultFragments = 3;
    private String resultSeperator = "...";
    private int resultFragmentSize = 40;

    // statistics reported by getStatistics()
    private Date startDate;
    private int documentsAddCount = 0;
    private int documentsDeleteCount = 0;
    private int queryCount = 0;
    private int optimiseCount = 0;

    // basic getter() functions
    public Logger getLogger() { return logger; }

    /**
     * Indicates if the authentication token matches. The check is delegated
     * to {@link KTLuceneServer}.
     *
     * @param token the token to verify
     * @return the result reported by the server's own authenticate() call
     */
    public boolean authenticate(String token)
    {
        return KTLuceneServer.get().authenticate(token);
    }

    /**
     * Returns a reference to a singleton of the IndexerManager, creating it on
     * first use. The method is synchronized so that concurrent first calls
     * cannot construct two managers over the same index directory.
     *
     * @return IndexerManager
     * @throws Exception if the manager cannot be constructed
     */
    public static synchronized IndexerManager get() throws Exception
    {
        if (null == IndexerManager.indexingManager)
        {
            IndexerManager.indexingManager = new IndexerManager();
        }
        return IndexerManager.indexingManager;
    }

    /**
     * Returns the statistics on the indexer. The result is a JSONified string
     * with the keys dateStarted, dateNow, indexDirectory, queryResultMax,
     * countAdded, countDeleted, countOptimised, countQuery and countDocuments.
     *
     * @return String
     */
    public String getStatistics()
    {
        // Read the document count under the read lock so the reader cannot be
        // swapped out and closed while it is being used.
        int numDocs;
        ReadLock lock = this.locker.readLock();
        lock.lock();
        try
        {
            numDocs = this.queryReader.numDocs();
        }
        finally
        {
            lock.unlock();
        }

        StringBuilder jsonBuilder = new StringBuilder();
        jsonBuilder
            .append('{')
            .append("\"dateStarted\":\"").append(this.startDate).append("\",")
            .append("\"dateNow\":\"").append(new Date()).append("\",")
            // the directory is user configuration and may contain backslashes or quotes
            .append("\"indexDirectory\":\"").append(escapeJson(this.indexDirectory)).append("\",")
            .append("\"queryResultMax\":").append(this.maxQueryResult).append(",")
            .append("\"countAdded\":").append(this.documentsAddCount).append(",")
            .append("\"countDeleted\":").append(this.documentsDeleteCount).append(",")
            .append("\"countOptimised\":").append(this.optimiseCount).append(",")
            .append("\"countQuery\":").append(this.queryCount).append(",")
            .append("\"countDocuments\":").append(numDocs)
            .append('}');
        return jsonBuilder.toString();
    }

    /**
     * Escapes a value so that it can be placed between double quotes in JSON:
     * quotes and backslashes are backslash-escaped and control characters are
     * written as four-digit unicode escapes.
     *
     * @param value the raw text (must not be null)
     * @return the escaped text
     */
    private static String escapeJson(String value)
    {
        StringBuilder escaped = new StringBuilder(value.length() + 8);
        for (int i = 0; i < value.length(); i++)
        {
            char c = value.charAt(i);
            if (c == '"' || c == '\\')
            {
                escaped.append('\\').append(c);
            }
            else if (c < 0x20)
            {
                String hex = Integer.toHexString(c);
                escaped.append('\\').append('u');
                for (int pad = hex.length(); pad < 4; pad++)
                {
                    escaped.append('0');
                }
                escaped.append(hex);
            }
            else
            {
                escaped.append(c);
            }
        }
        return escaped.toString();
    }

    /**
     * Instantiates the analyzer with the given class name through
     * {@link Beans#instantiate(ClassLoader, String)}.
     *
     * @param analyzerClass fully qualified class name of the analyzer
     * @return the analyzer instance, never null
     * @throws Exception if no class name is given, the class cannot be
     *         instantiated, or it is not an {@link Analyzer}
     */
    private Analyzer getAnalyzer(String analyzerClass) throws Exception
    {
        if (null == analyzerClass || analyzerClass.trim().length() == 0)
        {
            throw new Exception("No analyzer configured: property indexer.analyzer is missing in " + this.propertiesFilename);
        }
        Object bean = Beans.instantiate(getClass().getClassLoader(), analyzerClass.trim());
        if (!Beans.isInstanceOf(bean, Analyzer.class))
        {
            // fail here with a clear message instead of a NullPointerException on first use
            throw new Exception("Configured analyzer is not a Lucene Analyzer: " + analyzerClass);
        }
        return (Analyzer) Beans.getInstanceOf(bean, Analyzer.class);
    }

    /**
     * Constructor for IndexerManager. Loads the properties file, instantiates
     * the analyzer, validates the index directory and opens the index. If the
     * index has no segments yet, an empty index is created first.
     *
     * @throws Exception if the configuration cannot be loaded or is invalid,
     *         or the index cannot be opened
     */
    private IndexerManager() throws Exception
    {
        this.logger = Logger.getLogger("com.knowledgetree.lucene");
        this.logger.info("Indexer starting up...");
        this.locker = new ReentrantReadWriteLock();
        this.startDate = new Date();

        // load properties
        this.logger.info("Loading properties file: " + this.propertiesFilename);
        Properties properties = new Properties();
        try
        {
            FileInputStream in = new FileInputStream(this.propertiesFilename);
            try
            {
                properties.load(in);
            }
            finally
            {
                // always release the file handle, also when load() fails
                in.close();
            }
        }
        catch (Exception ex)
        {
            this.logger.error("Problem loading properties: " + ex.getMessage());
            throw ex;
        }

        this.analyzer = getAnalyzer(properties.getProperty("indexer.analyzer"));

        // test that the index folder exists and is writable
        this.indexDirectory = properties.getProperty("indexer.directory", this.indexDirectory);
        this.logger.info("Using index directory: " + this.indexDirectory);
        File dir = new File(this.indexDirectory);
        if (!dir.isDirectory())
        {
            throw new Exception("Invalid index directory specified: " + this.indexDirectory);
        }
        if (!dir.canWrite() || !dir.canRead())
        {
            throw new Exception("Index directory must be read and writable: " + this.indexDirectory);
        }

        this.maxQueryResult = Integer.parseInt(properties.getProperty("query.max.results", Integer.toString(this.maxQueryResult)));
        this.resultFragments = Integer.parseInt(properties.getProperty("result.fragments", Integer.toString(this.resultFragments)));
        this.resultSeperator = properties.getProperty("result.fragment.seperator", this.resultSeperator);
        this.resultFragmentSize = Integer.parseInt(properties.getProperty("result.fragment.size", Integer.toString(this.resultFragmentSize)));

        this.logger.info("Starting: " + this.startDate);
        this.logger.info("Client IPs: " + this.clientIps);
        this.logger.info("Max query result: " + this.maxQueryResult);
        this.logger.info("Result fragments: " + this.resultFragments);
        this.logger.info("Result fragment seperator: " + this.resultSeperator);
        this.logger.info("Result fragment size: " + this.resultFragmentSize);

        // open the index
        try
        {
            this.reopenIndex();
        }
        catch (FileNotFoundException ex)
        {
            // The message may be null; in that case it is not the "empty index"
            // situation and the original exception must be propagated.
            String msg = ex.getMessage();
            if (null != msg && msg.startsWith("no segments* file found"))
            {
                this.logger.info("Suspect that this is first time that indexing is run. Will attempt to create segments in " + this.indexDirectory);
                this.create();
                this.reopenIndex();
            }
            else
            {
                throw ex;
            }
        }
    }

    /**
     * Replaces the shared reader/searcher pair with a freshly opened one.
     *
     * The new reader is opened before the old one is closed, so a failure to
     * open leaves the previous (still usable) pair in place.
     *
     * @throws Exception if the index cannot be opened or the old pair cannot
     *         be closed
     */
    private void reopenIndex() throws Exception
    {
        this.logger.debug("Reopenning index");
        WriteLock lock = this.locker.writeLock();
        lock.lock();
        try
        {
            IndexReader freshReader = IndexReader.open(this.indexDirectory);
            Searcher freshSearcher = new IndexSearcher(freshReader);

            Searcher staleSearcher = this.querySearcher;
            IndexReader staleReader = this.queryReader;
            this.queryReader = freshReader;
            this.querySearcher = freshSearcher;

            if (null != staleSearcher)
            {
                staleSearcher.close();
            }
            if (null != staleReader)
            {
                staleReader.close();
            }

            this.logger.debug("Timestamp: " + new Date());
            this.logger.debug("Documents in index: " + this.queryReader.numDocs());
        }
        finally
        {
            lock.unlock();
        }
    }

    // some basic conversion helper structures: digit i maps to letter i
    final static char numc[] = {'0','1','2','3','4','5','6','7','8','9'};
    final static char alphac[] = {'a','b','c','d','e','f','g','h','i','j'};

    /**
     * Convert a long to a string in which every decimal digit is replaced by
     * the letter at the same position in a..j (0 becomes a, 9 becomes j).
     * A leading minus sign is left untouched.
     *
     * @param longv the value to encode
     * @return String the letter-encoded value
     */
    public static String longToString(long longv)
    {
        String s = Long.toString(longv);
        for (int i = 0; i < 10; i++)
        {
            s = s.replace(numc[i], alphac[i]);
        }
        return s;
    }

    /**
     * Convert a string produced by {@link #longToString(long)} back to a long.
     *
     * @param sv the letter-encoded value
     * @return long the decoded value
     * @throws NumberFormatException if the decoded text is not a valid long
     */
    public static long stringToLong(String sv)
    {
        for (int i = 0; i < 10; i++)
        {
            sv = sv.replace(alphac[i], numc[i]);
        }
        return Long.parseLong(sv);
    }

    /**
     * Identifies if the document has been indexed, by searching for its
     * encoded id in the DocumentID field.
     *
     * @param documentId the document to look for
     * @return boolean true when at least one hit is found
     * @throws IOException if the search fails
     */
    public boolean documentExists(int documentId) throws IOException
    {
        ReadLock lock = this.locker.readLock();
        lock.lock();
        try
        {
            Query query = new TermQuery(new Term("DocumentID", IndexerManager.longToString(documentId)));
            query = query.rewrite(this.queryReader);
            // run the search!
            Hits hits = this.querySearcher.search(query);
            boolean found = (hits.length() > 0);
            this.logger.debug("Checking document exists documentId=" + documentId + " result=" + found);
            return found;
        }
        finally
        {
            lock.unlock();
        }
    }

    /**
     * Delete a document contained within the lucene index and reopen the
     * shared reader so that queries no longer see it.
     *
     * @param documentId the document to delete
     * @throws Exception if the index cannot be opened or modified
     */
    public void deleteDocument(int documentId) throws Exception
    {
        synchronized (this)
        {
            this.documentsDeleteCount++;
        }
        this.logger.debug("Deleting document: " + documentId);
        IndexReader reader = IndexReader.open(this.indexDirectory);
        int deleted;
        try
        {
            deleted = reader.deleteDocuments(new Term("DocumentID", IndexerManager.longToString(documentId)));
        }
        finally
        {
            // closing also when the delete fails, so the reader is not leaked
            reader.close();
        }
        this.logger.debug("Deleted " + deleted + " documents.");
        this.reopenIndex();
    }

    /**
     * Creates a new, empty index in the index directory. The IndexWriter is
     * opened with create=true.
     *
     * @throws Exception if the index cannot be created
     */
    public void create() throws Exception
    {
        IndexWriter writer = new IndexWriter(this.indexDirectory, this.analyzer, true);
        writer.close();
    }

    /**
     * Optimise the lucene database. The shared reader is closed for the
     * duration of the optimisation and is reopened afterwards, also when the
     * optimisation itself fails.
     *
     * @throws Exception if the index cannot be optimised or reopened
     */
    public void optimise() throws Exception
    {
        synchronized (this)
        {
            this.optimiseCount++;
        }
        this.logger.debug("Optimise index");
        WriteLock lock = this.locker.writeLock();
        lock.lock();
        try
        {
            try
            {
                // detach the shared pair first so the fields never refer to closed objects
                Searcher staleSearcher = this.querySearcher;
                IndexReader staleReader = this.queryReader;
                this.querySearcher = null;
                this.queryReader = null;
                if (null != staleSearcher)
                {
                    staleSearcher.close();
                }
                if (null != staleReader)
                {
                    staleReader.close();
                }

                IndexWriter writer = new IndexWriter(this.indexDirectory, this.analyzer, false);
                try
                {
                    writer.optimize();
                }
                finally
                {
                    // always close the writer, otherwise it stays open after a failure
                    writer.close();
                }
            }
            finally
            {
                // the write lock is reentrant, so reopenIndex() may take it again
                this.reopenIndex();
            }
        }
        finally
        {
            lock.unlock();
        }
    }

    /**
     * Pass a query to the database. This by default uses a maximum set of
     * results and does not return text fragments.
     *
     * @param queryString the query in Lucene query parser syntax
     * @return QueryHit[]
     * @throws Exception if the query cannot be parsed or executed
     */
    public QueryHit[] query(String queryString) throws Exception
    {
        return this.query(queryString, this.maxQueryResult, false);
    }

    /**
     * Pass a query to the database. This by default uses a maximum set of results.
     *
     * @param queryString the query in Lucene query parser syntax
     * @param getText true to fill in highlighted text fragments for each hit
     * @return QueryHit[]
     * @throws Exception if the query cannot be parsed or executed
     */
    public QueryHit[] query(String queryString, boolean getText) throws Exception
    {
        return this.query(queryString, this.maxQueryResult, getText);
    }

    /**
     * Returns a set of hits from lucene.
     *
     * Text fragments are only built from the Content and/or Discussion fields
     * when the query string mentions "content" / "discussion" (case
     * insensitive); otherwise the fragment text is empty.
     *
     * @param queryString the query in Lucene query parser syntax; the default
     *        field is Content
     * @param maxHits maximum number of hits to return; any negative value
     *        (conventionally -1) means no limit
     * @param getText true to fill in highlighted text fragments for each hit
     * @return the hits, at most maxHits of them
     * @throws Exception if the query cannot be parsed or executed
     */
    public QueryHit[] query(String queryString, int maxHits, boolean getText) throws Exception
    {
        synchronized (this)
        {
            this.queryCount++;
        }

        // fixed locale: the default locale must not influence the field-name detection
        String tmp = queryString.toLowerCase(java.util.Locale.ENGLISH);
        boolean queryContent = tmp.indexOf("content") != -1;
        boolean queryDiscussion = tmp.indexOf("discussion") != -1;

        QueryParser parser = new QueryParser("Content", this.analyzer);
        Query query = parser.parse(queryString);

        // Hold the read lock while the shared reader/searcher are in use. Hits
        // loads documents lazily, so the lock must cover the whole result loop.
        ReadLock lock = this.locker.readLock();
        lock.lock();
        try
        {
            // rewriting is important for complex queries. this is a must-do according to sources!
            query = query.rewrite(this.queryReader);

            // run the search!
            Hits hits = this.querySearcher.search(query);

            // now we can apply the maximum hits to the results we return!
            int max = hits.length();
            if (maxHits >= 0 && maxHits < max)
            {
                max = maxHits;
            }

            QueryHit[] results = new QueryHit[max];
            Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
            highlighter.setTextFragmenter(new SimpleFragmenter(this.resultFragmentSize));

            for (int i = 0; i < max; i++)
            {
                Document doc = hits.doc(i);
                QueryHit hit = new QueryHit();
                hit.DocumentID = IndexerManager.stringToLong(doc.get("DocumentID"));
                hit.Rank = hits.score(i);
                hit.Title = doc.get("Title");
                if (getText)
                {
                    // a missing stored field must not end up as the literal text "null"
                    StringBuilder collected = new StringBuilder();
                    if (queryContent)
                    {
                        String stored = doc.get("Content");
                        if (null != stored)
                        {
                            collected.append(stored);
                        }
                    }
                    if (queryDiscussion)
                    {
                        String stored = doc.get("Discussion");
                        if (null != stored)
                        {
                            collected.append(stored);
                        }
                    }
                    String text = collected.toString();

                    // TODO: we can create a field.getReader(). the fragmenting needs to
                    // be updated to deal with the reader only. would prefer not having to
                    // load the document into a string!
                    TokenStream tokenStream = analyzer.tokenStream("contents", new StringReader(text));
                    hit.Content = highlighter.getBestFragments(tokenStream, text, this.resultFragments, this.resultSeperator);
                }
                else
                {
                    hit.Content = "";
                }
                hit.Version = doc.get("Version");
                results[i] = hit;
            }
            return results;
        }
        finally
        {
            lock.unlock();
        }
    }

    /**
     * Get text for a given document. Runs a DocumentID query with text
     * retrieval enabled and returns the hits as JSON.
     *
     * NOTE(review): the generated query string contains neither "content" nor
     * "discussion", so query() assembles an empty text for highlighting; this
     * looks like it always yields empty Content - confirm the intent.
     *
     * @param documentId the document to fetch
     * @return the JSON produced by QueryHit.toJSON for the hits
     * @throws Exception if the query fails
     */
    public String getText(int documentId) throws Exception
    {
        QueryHit[] results = this.query("DocumentID:" + IndexerManager.longToString(documentId), true);
        return QueryHit.toJSON(results);
    }

    /**
     * Starts the indexing process: removes any previously indexed copy of the
     * document, reads the extracted content from the given file as UTF-8, adds
     * the document to the index and finally deletes the content file.
     *
     * @param documentId the document being indexed
     * @param contentFilename path of the temporary file holding the text content
     * @param discussion discussion text to index with the document
     * @param title the document title
     * @param version the document version
     * @throws Exception if the content file cannot be read or the index cannot
     *         be updated
     */
    public void indexDocument(int documentId, String contentFilename, String discussion, String title, String version) throws Exception
    {
        synchronized (this)
        {
            this.documentsAddCount++;
        }
        this.logger.debug("Indexing document: documentid=" + documentId);

        // remove an existing document, if it exists. lucene doesn't do this for us!
        this.deleteDocument(documentId);

        File contentFile = new File(contentFilename);
        long filesize = contentFile.length();
        if (filesize > Integer.MAX_VALUE)
        {
            // the content is loaded into a single byte array, which cannot exceed int range
            throw new IOException("Content file too large to index: " + contentFilename + " (" + filesize + " bytes)");
        }
        byte buf[] = new byte[(int) filesize];
        DataInputStream dis = new DataInputStream(new FileInputStream(contentFilename));
        try
        {
            // a plain read() may return fewer bytes than requested; readFully() does not
            dis.readFully(buf);
        }
        finally
        {
            dis.close();
        }
        String content = new java.lang.String(buf, "UTF-8");

        this.addLuceneDocument(documentId, content, discussion, title, version);

        // delete the temporary file
        if (!contentFile.delete())
        {
            this.logger.warn("Could not delete content file: " + contentFilename);
        }
    }

    /**
     * This adds a lucene document and reopens the shared reader.
     *
     * Null field values are indexed as empty strings. An IOException while
     * writing is logged and not propagated (this matches the long-standing
     * behaviour callers rely on); the writer is closed in every case.
     *
     * NOTE(review): DocumentID is TOKENIZED, so the encoded id passes through
     * the configured analyzer. Ids whose letter encoding spells a stop word
     * (for example 14 encodes to "be") may be dropped by analyzers that filter
     * stop words - confirm against the configured analyzer.
     *
     * @param documentId the document id
     * @param content the text content
     * @param discussion the discussion text
     * @param title the document title
     * @param version the document version
     * @throws Exception if the shared reader cannot be reopened
     */
    private void addLuceneDocument(int documentId, String content, String discussion, String title, String version) throws Exception
    {
        // a lucene field cannot hold null, so fall back to the empty string
        if (null == content) { content = ""; }
        if (null == discussion) { discussion = ""; }
        if (null == title) { title = ""; }
        if (null == version) { version = ""; }

        // create the lucene document
        Document document = new Document();
        document.add(new Field("DocumentID", IndexerManager.longToString(documentId), Field.Store.YES, Field.Index.TOKENIZED));
        document.add(new Field("Content", content, Field.Store.YES, Field.Index.TOKENIZED));
        document.add(new Field("Discussion", discussion, Field.Store.YES, Field.Index.TOKENIZED));
        document.add(new Field("Title", title, Field.Store.YES, Field.Index.TOKENIZED));
        document.add(new Field("Version", version, Field.Store.YES, Field.Index.UN_TOKENIZED));

        // add the document to lucene index
        try
        {
            if (this.logger.isDebugEnabled())
            {
                // guarded: avoids building a copy of the whole content when debug is off
                this.logger.debug("Opening index writer: documentid=" + documentId);
                this.logger.debug("DocumentID: " + IndexerManager.longToString(documentId));
                this.logger.debug("Content: " + content);
                this.logger.debug("Discussion: " + discussion);
            }
            IndexWriter writer = new IndexWriter(this.indexDirectory, this.analyzer, false);
            try
            {
                writer.addDocument(document);
            }
            finally
            {
                // always close the writer, otherwise it stays open after a failure
                writer.close();
            }
            this.logger.debug("Closing index writer: documentid=" + documentId);
        }
        catch (IOException ex)
        {
            logger.error("Problem indexing document: documentid=" + documentId + " with exception: " + ex.getMessage(), ex);
        }
        this.reopenIndex();
    }

    /**
     * Update the discussion on a document. If the document is already indexed
     * it is re-added with its stored content, title and version and the new
     * discussion; otherwise a document holding only the discussion is added.
     *
     * @param documentId the document to update
     * @param discussion the new discussion text
     * @throws Exception if the index cannot be searched or updated
     */
    public void updateDiscussion(int documentId, String discussion) throws Exception
    {
        this.logger.debug("updateDiscussion: documentid=" + documentId);

        String content = "";
        String title = "";
        String version = "";
        boolean found = false;

        // Copy the stored fields out under the read lock and release it before
        // modifying the index: the modification needs the write lock, and a
        // read lock cannot be upgraded.
        ReadLock lock = this.locker.readLock();
        lock.lock();
        try
        {
            Query query = new TermQuery(new Term("DocumentID", IndexerManager.longToString(documentId)));
            query = query.rewrite(this.queryReader);
            // run the search!
            Hits hits = this.querySearcher.search(query);
            if (hits.length() > 0)
            {
                // there shouldn't be others...
                Document doc = hits.doc(0);
                content = doc.get("Content");
                title = doc.get("Title");
                version = doc.get("Version");
                found = true;
            }
        }
        finally
        {
            lock.unlock();
        }

        if (found)
        {
            this.deleteDocument(documentId);
            this.addLuceneDocument(documentId, content, discussion, title, version);
        }
        else
        {
            // there is no content
            this.addLuceneDocument(documentId, "", discussion, "", "");
        }
    }

    /**
     * Highlighter callback: wraps a token in bold tags when the token group
     * scored against the query, and returns it unchanged otherwise.
     *
     * @param originalText the text of the token
     * @param group the token group that carries the score
     * @return the possibly highlighted text
     */
    public String highlightTerm(String originalText , TokenGroup group)
    {
        if (group.getTotalScore() <= 0)
        {
            return originalText;
        }
        return "<b>" + originalText + "</b>";
    }
}