/**
 * The index manager controls the Lucene indexing system.
 *
 * @license
 */
package com.knowledgetree.indexer;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import java.util.Date;
import java.util.Properties;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
import java.beans.Beans;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.TokenGroup;

import com.knowledgetree.lucene.KTLuceneServer;

public class IndexerManager implements Formatter {

    public static final String KnowledgeTreeLoggingProperties = "KnowledgeTreeIndexer.Logging.properties";

    private static IndexerManager indexingManager;

    private IndexReader queryReader;
    private Searcher querySearcher;
    private Analyzer analyzer;
    private ReentrantReadWriteLock locker;
    private Logger logger;

    private String indexDirectory = "../../../var/indexes";
    private String propertiesFilename = "KnowledgeTreeIndexer.properties";
    private String clientIps = "127.0.0.1";
    private int maxQueryResult = 1000;
    private Date startDate;
    private int documentsAddCount = 0;
    private int documentsDeleteCount = 0;
    private int queryCount = 0;
    private int optimiseCount = 0;
    private int resultFragments = 3;
    private String resultSeperator = "...";
    private int resultFragmentSize = 40;

    // basic getter() functions
    public Logger getLogger() {
        return logger;
    }

    /**
     * Indicates whether the authentication token matches the server token.
     *
     * @param token
     * @return boolean
     */
    public boolean authenticate(String token) {
        return KTLuceneServer.get().authenticate(token);
    }

    /**
     * Returns a reference to the IndexerManager singleton.
     *
     * @return IndexerManager
     * @throws Exception
     */
    public static IndexerManager get() throws Exception {
        if (null == IndexerManager.indexingManager) {
            IndexerManager.indexingManager = new IndexerManager();
        }
        return IndexerManager.indexingManager;
    }
    /**
     * Returns statistics on the indexer as a JSON-encoded string.
     *
     * @return String
     */
    public String getStatistics() {
        StringBuilder jsonBuilder = new StringBuilder();
        int numDocs = this.queryReader.numDocs();
        jsonBuilder
            .append('{')
            .append("\"dateStarted\":\"").append(this.startDate).append("\",")
            .append("\"dateNow\":\"").append(new Date()).append("\",")
            .append("\"indexDirectory\":\"").append(this.indexDirectory).append("\",")
            .append("\"queryResultMax\":").append(this.maxQueryResult).append(",")
            .append("\"countAdded\":").append(this.documentsAddCount).append(",")
            .append("\"countDeleted\":").append(this.documentsDeleteCount).append(",")
            .append("\"countOptimised\":").append(this.optimiseCount).append(",")
            .append("\"countQuery\":").append(this.queryCount).append(",")
            .append("\"countDocuments\":").append(numDocs)
            .append('}');
        return jsonBuilder.toString();
    }

    /**
     * Instantiates the analyzer class named in the properties file.
     *
     * @param analyzerClass fully qualified class name of the analyzer
     * @return Analyzer, or null if the class is not an Analyzer
     * @throws Exception
     */
    private Analyzer getAnalyzer(String analyzerClass) throws Exception {
        Analyzer retval = null;
        Object bean = Beans.instantiate(getClass().getClassLoader(), analyzerClass);
        if (Beans.isInstanceOf(bean, Analyzer.class)) {
            retval = (Analyzer) Beans.getInstanceOf(bean, Analyzer.class);
        }
        return retval;
    }

    /**
     * Constructor for IndexerManager.
     *
     * @throws Exception
     */
    private IndexerManager() throws Exception {
        this.logger = Logger.getLogger("com.knowledgetree.lucene");
        this.logger.info("Indexer starting up...");
        //this.analyzer = new StandardAnalyzer();
        this.locker = new ReentrantReadWriteLock();
        this.startDate = new Date();

        // load properties
        this.logger.info("Loading properties file: " + this.propertiesFilename);
        Properties properties = new Properties();
        try {
            FileInputStream in = new FileInputStream(this.propertiesFilename);
            properties.load(in);
            in.close();
        } catch (Exception ex) {
            this.logger.error("Problem loading properties: " + ex.getMessage());
            throw ex;
        }

        this.analyzer = getAnalyzer(properties.getProperty("indexer.analyzer"));

        // test that the index folder exists and is readable/writable
        this.indexDirectory = properties.getProperty("indexer.directory", this.indexDirectory);
        this.logger.info("Using index directory: " + this.indexDirectory);
        File dir = new File(this.indexDirectory);
        if (!dir.isDirectory()) {
            throw new Exception("Invalid index directory specified: " + this.indexDirectory);
        }
        if (!dir.canWrite() || !dir.canRead()) {
            throw new Exception("Index directory must be readable and writable: " + this.indexDirectory);
        }

        this.maxQueryResult = Integer.parseInt(properties.getProperty("query.max.results", Integer.toString(this.maxQueryResult)));
        this.resultFragments = Integer.parseInt(properties.getProperty("result.fragments", Integer.toString(this.resultFragments)));
        this.resultSeperator = properties.getProperty("result.fragment.seperator", this.resultSeperator);
        this.resultFragmentSize = Integer.parseInt(properties.getProperty("result.fragment.size", Integer.toString(this.resultFragmentSize)));

        this.logger.info("Starting: " + this.startDate);
        this.logger.info("Client IPs: " + this.clientIps);
        this.logger.info("Max query result: " + this.maxQueryResult);
        this.logger.info("Result fragments: " + this.resultFragments);
        this.logger.info("Result fragment separator: " + this.resultSeperator);
        this.logger.info("Result fragment size: " + this.resultFragmentSize);

        // open the index; if no segments file exists yet, create an empty index first
        try {
            this.reopenIndex();
        } catch (FileNotFoundException ex) {
            String msg = ex.getMessage();
            if (msg != null && msg.indexOf("no segments* file found") == 0) {
                this.logger.info("Suspect that this is the first time that indexing is run. Will attempt to create segments in " + this.indexDirectory);
                this.create();
                this.reopenIndex();
            } else {
                throw ex;
            }
        }
    }
    /**
     * Closes any existing readers and reopens them.
     *
     * @throws Exception
     */
    private void reopenIndex() throws Exception {
        this.logger.debug("Reopening index");
        WriteLock lock = this.locker.writeLock();
        lock.lock();
        try {
            if (null != this.queryReader) {
                this.querySearcher.close();
                this.queryReader.close();
            }
            this.queryReader = IndexReader.open(this.indexDirectory);
            this.querySearcher = new IndexSearcher(this.queryReader);
            this.logger.debug("Timestamp: " + new Date());
            this.logger.debug("Documents in index: " + this.queryReader.numDocs());
        } finally {
            lock.unlock();
        }
    }

    // lookup tables for converting between digits and letters, e.g. longToString(102) == "bac"
    final static char numc[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };
    final static char alphac[] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j' };

    /**
     * Converts a long to a string.
     *
     * @param longv
     * @return String
     */
    public static String longToString(long longv) {
        String s = Long.toString(longv);
        for (int i = 0; i < 10; i++) {
            s = s.replace(numc[i], alphac[i]);
        }
        return s;
    }

    /**
     * Converts a string back to a long.
     *
     * @param sv
     * @return long
     */
    public static long stringToLong(String sv) {
        for (int i = 0; i < 10; i++) {
            sv = sv.replace(alphac[i], numc[i]);
        }
        return Long.parseLong(sv);
    }

    /**
     * Identifies whether the document has been indexed.
     *
     * @param documentId
     * @return boolean
     * @throws IOException
     */
    public boolean documentExists(int documentId) throws IOException {
        ReadLock lock = this.locker.readLock();
        lock.lock();
        try {
            Query query = new TermQuery(new Term("DocumentID", IndexerManager.longToString(documentId)));
            query = query.rewrite(this.queryReader);

            // run the search!
            Hits hits = this.querySearcher.search(query);
            boolean found = (hits.length() > 0);
            this.logger.debug("Checking document exists documentId=" + documentId + " result=" + found);
            return found;
        } finally {
            lock.unlock();
        }
    }

    /**
     * Deletes a document from the Lucene index.
     *
     * @param documentId
     * @throws Exception
     */
    public void deleteDocument(int documentId) throws Exception {
        synchronized (this) {
            this.documentsDeleteCount++;
        }
        this.logger.debug("Deleting document: " + documentId);

        IndexReader reader = IndexReader.open(this.indexDirectory);
        int deleted = reader.deleteDocuments(new Term("DocumentID", IndexerManager.longToString(documentId)));
        reader.close();

        this.logger.debug("Deleted " + deleted + " documents.");
        this.reopenIndex();
    }

    /**
     * Creates a new, empty index in the index directory.
     *
     * @throws Exception
     */
    public void create() throws Exception {
        IndexWriter writer = new IndexWriter(this.indexDirectory, this.analyzer, true);
        writer.close();
    }

    /**
     * Optimises the Lucene index.
     *
     * @throws Exception
     */
    public void optimise() throws Exception {
        synchronized (this) {
            this.optimiseCount++;
        }
        this.logger.debug("Optimise index");

        WriteLock lock = this.locker.writeLock();
        lock.lock();
        try {
            if (null != this.queryReader) {
                this.querySearcher.close();
                this.queryReader.close();
            }
            IndexWriter writer = new IndexWriter(this.indexDirectory, this.analyzer, false);
            writer.optimize();
            writer.close();
            this.queryReader = IndexReader.open(this.indexDirectory);
            this.querySearcher = new IndexSearcher(this.queryReader);
        } finally {
            lock.unlock();
        }
    }
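    /*
     * The query methods below accept standard Lucene query syntax against the
     * fields written in addLuceneDocument(). For illustration only (the terms
     * and document id are made up), a caller might pass strings such as:
     *
     *   Content:invoice AND Title:report
     *   Discussion:"review comments"
     *   DocumentID:bac                  (document id 102, see longToString())
     */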
    /**
     * Passes a query to the index, returning at most the configured maximum
     * number of results and no document text.
     *
     * @param queryString
     * @return QueryHit[]
     * @throws Exception
     */
    public QueryHit[] query(String queryString) throws Exception {
        return this.query(queryString, this.maxQueryResult, false);
    }

    /**
     * Passes a query to the index, returning at most the configured maximum
     * number of results.
     *
     * @param queryString
     * @param getText whether to return highlighted text fragments with each hit
     * @return QueryHit[]
     * @throws Exception
     */
    public QueryHit[] query(String queryString, boolean getText) throws Exception {
        return this.query(queryString, this.maxQueryResult, getText);
    }

    /**
     * Returns a set of hits from Lucene.
     *
     * @param queryString
     * @param maxHits maximum number of hits to return, or -1 for no limit
     * @param getText whether to return highlighted text fragments with each hit
     * @return QueryHit[]
     * @throws Exception
     */
    public QueryHit[] query(String queryString, int maxHits, boolean getText) throws Exception {
        synchronized (this) {
            this.queryCount++;
        }

        String tmp = queryString.toLowerCase();
        boolean queryContent = tmp.indexOf("content") != -1;
        boolean queryDiscussion = tmp.indexOf("discussion") != -1;

        QueryParser parser = new QueryParser("Content", this.analyzer);
        Query query = parser.parse(queryString);

        // rewriting is important for complex queries. this is a must-do according to sources!
        query = query.rewrite(this.queryReader);

        // run the search!
        Hits hits = this.querySearcher.search(query);

        // now we can apply the maximum hits to the results we return!
        int max = (maxHits == -1) ? hits.length() : maxHits;
        if (hits.length() < max) {
            max = hits.length();
        }

        QueryHit[] results = new QueryHit[max];
        Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(this.resultFragmentSize));

        for (int i = 0; i < max; i++) {
            Document doc = hits.doc(i);
            QueryHit hit = new QueryHit();
            hit.DocumentID = IndexerManager.stringToLong(doc.get("DocumentID"));
            hit.Rank = hits.score(i);
            hit.Title = doc.get("Title");
            if (getText) {
                String text = "";
                if (queryContent) {
                    text += doc.get("Content");
                }
                if (queryDiscussion) {
                    text += doc.get("Discussion");
                }
                // TODO: we can create a field.getReader(). the fragmenting needs to
                // be updated to deal with the reader only. would prefer not having to
                // load the document into a string!
                TokenStream tokenStream = analyzer.tokenStream("contents", new StringReader(text));
                hit.Content = highlighter.getBestFragments(tokenStream, text, this.resultFragments, this.resultSeperator);
            } else {
                hit.Content = "";
            }
            hit.Version = doc.get("Version");
            results[i] = hit;
        }
        return results;
    }

    /**
     * Gets the stored text for a given document as a JSON-encoded string.
     *
     * @param documentId
     * @return String
     * @throws Exception
     */
    public String getText(int documentId) throws Exception {
        QueryHit[] results = this.query("DocumentID:" + IndexerManager.longToString(documentId), true);
        return QueryHit.toJSON(results);
    }
    /**
     * Indexes a document: reads the content file, adds the document to the
     * index, and deletes the temporary content file.
     *
     * @param documentId
     * @param contentFilename
     * @param discussion
     * @param title
     * @param version
     * @throws Exception
     */
    public void indexDocument(int documentId, String contentFilename, String discussion, String title, String version) throws Exception {
        synchronized (this) {
            this.documentsAddCount++;
        }
        this.logger.debug("Indexing document: documentid=" + documentId);

        // remove an existing document, if it exists. lucene doesn't do this for us!
        this.deleteDocument(documentId);

        File contentFile = new File(contentFilename);
        long filesize = contentFile.length();
        byte buf[] = new byte[(int) filesize];
        DataInputStream dis = new DataInputStream(new FileInputStream(contentFilename));
        dis.readFully(buf);
        dis.close();
        String content = new String(buf, "UTF-8");

        this.addLuceneDocument(documentId, content, discussion, title, version);

        // delete the temporary file
        contentFile.delete();
    }

    /**
     * Adds a Lucene document to the index.
     *
     * @param documentId
     * @param content
     * @param discussion
     * @param title
     * @param version
     * @throws Exception
     */
    private void addLuceneDocument(int documentId, String content, String discussion, String title, String version) throws Exception {
        // create the lucene document
        Document document = new Document();
        document.add(new Field("DocumentID", IndexerManager.longToString(documentId), Field.Store.YES, Field.Index.TOKENIZED));
        document.add(new Field("Content", content, Field.Store.YES, Field.Index.TOKENIZED));
        document.add(new Field("Discussion", discussion, Field.Store.YES, Field.Index.TOKENIZED));
        document.add(new Field("Title", title, Field.Store.YES, Field.Index.TOKENIZED));
        document.add(new Field("Version", version, Field.Store.YES, Field.Index.UN_TOKENIZED));

        // add the document to the lucene index
        try {
            this.logger.debug("Opening index writer: documentid=" + documentId);
            this.logger.debug("DocumentID: " + IndexerManager.longToString(documentId));
            this.logger.debug("Content: " + content);
            this.logger.debug("Discussion: " + discussion);

            IndexWriter writer = new IndexWriter(this.indexDirectory, this.analyzer, false);
            writer.addDocument(document);
            writer.close();

            this.logger.debug("Closing index writer: documentid=" + documentId);
        } catch (IOException ex) {
            logger.error("Problem indexing document: documentid=" + documentId + " with exception: " + ex.getMessage());
        }
        this.reopenIndex();
    }

    /**
     * Updates the discussion on a document, re-indexing its existing content.
     *
     * @param documentId
     * @param discussion
     * @throws Exception
     */
    public void updateDiscussion(int documentId, String discussion) throws Exception {
        this.logger.debug("updateDiscussion: documentid=" + documentId);

        Query query = new TermQuery(new Term("DocumentID", IndexerManager.longToString(documentId)));
        query = query.rewrite(this.queryReader);

        // run the search!
        Hits hits = this.querySearcher.search(query);
        boolean found = false;
        for (int i = 0; i < hits.length(); i++) {
            Document doc = hits.doc(i);
            String content = doc.get("Content");
            String title = doc.get("Title");
            String version = doc.get("Version");
            this.deleteDocument(documentId);
            this.addLuceneDocument(documentId, content, discussion, title, version);
            found = true;
            break; // there shouldn't be others...
        }

        if (!found) {
            // there is no content yet, so index the discussion on its own
            this.addLuceneDocument(documentId, "", discussion, "", "");
        }
    }

    /**
     * Highlighter callback: wraps matched terms in bold tags.
     */
    public String highlightTerm(String originalText, TokenGroup group) {
        if (group.getTotalScore() <= 0) {
            return originalText;
        }
        return "<b>" + originalText + "</b>";
    }
}
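/*
 * Usage sketch (illustrative only; the document id, file path, title, and
 * query string below are made up, and error handling is omitted):
 *
 *   IndexerManager manager = IndexerManager.get();
 *   manager.indexDocument(42, "/tmp/kt-extracted-text.txt", "", "Quarterly report", "1.0");
 *   if (manager.documentExists(42)) {
 *       QueryHit[] hits = manager.query("Content:report", true);
 *   }
 *   manager.optimise();
 */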