package textsearch;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.store.LockObtainFailedException;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* Wrapper class for an embedded Lucene index to support indexing and searching plain text documents.
*
* <p/>
* Copyright 2002-2012 by Mark Watson. All rights reserved.
* <p/>
* This software can be used under either of the following licenses:
* <p/>
* 1. LGPL v3<br/>
* 2. Apache 2
* <p/>
*/
public class LuceneManager {

    /** Suffix appended to the data store root to form the index directory path. */
    private static final String INDEX_SUBDIRECTORY = "/lucene_index";

    /** Root directory under which the {@code lucene_index} directory lives. */
    private final String data_store_file_root;

    /** Writer currently in use by {@link #addDocumentToIndex}, or {@code null} when none is open. */
    private IndexWriter writer;

    /** Reader currently in use by a search method, or {@code null} when none is open. */
    private IndexReader reader;

    /**
     * Creates a manager for the index stored at
     * {@code data_store_file_root + "/lucene_index"}. No files are touched
     * until one of the other methods is called.
     *
     * @param data_store_file_root directory that contains (or will contain) the index directory
     * @throws IllegalArgumentException if {@code data_store_file_root} is {@code null}
     */
    public LuceneManager(String data_store_file_root) {
        if (data_store_file_root == null) {
            // Without this check a null root would silently become the relative path "null/lucene_index".
            throw new IllegalArgumentException("data_store_file_root must not be null");
        }
        this.data_store_file_root = data_store_file_root;
    }

    /**
     * If you want to start with a fresh index, call this method immediately
     * after creating a new instance of a LuceneManager. Any existing index
     * directory is deleted (best effort) and a new, empty index is created.
     *
     * @throws IOException if the new index cannot be created
     * @throws LockObtainFailedException if the index write lock cannot be obtained
     * @throws CorruptIndexException if the index is corrupt
     */
    public void createAndClearLuceneIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
        File index_dir = new File(indexPath());
        // Deletion is best effort: the writer below is opened with create == true,
        // so the result of the delete is intentionally not treated as fatal.
        deleteFilePath(index_dir);
        new IndexWriter(index_dir, new StandardAnalyzer(), true).close();
    }

    /**
     * Closes any reader or writer this manager still holds. Safe to call at
     * any time, including before the first add/search and more than once.
     * The add and search methods close what they open, so normally nothing
     * is left to release here.
     *
     * @throws java.io.IOException if closing the reader or writer fails
     */
    public void close() throws IOException {
        try {
            closeReader();
        } finally {
            // Still release the writer when closing the reader failed.
            closeWriter();
        }
    }

    /**
     * Adds one document to the existing index. The URI is stored but not
     * indexed; the text is stored and tokenized with a {@link StandardAnalyzer}.
     * The index is optimized after the document is added.
     *
     * @param document_original_uri URI stored in the {@code "uri"} field
     * @param document_plain_text text stored and indexed in the {@code "text"} field
     * @throws org.apache.lucene.index.CorruptIndexException if the index is corrupt
     * @throws java.io.IOException if the index cannot be opened or written
     */
    public void addDocumentToIndex(String document_original_uri, String document_plain_text) throws CorruptIndexException, IOException {
        writer = new IndexWriter(new File(indexPath()), new StandardAnalyzer(), false);
        try {
            Document doc = new Document();
            doc.add(new Field("uri", document_original_uri, Field.Store.YES, Field.Index.NO));
            doc.add(new Field("text", document_plain_text, Field.Store.YES, Field.Index.TOKENIZED));
            writer.addDocument(doc);
            writer.optimize();
        } finally {
            // Release the writer even when adding or optimizing fails.
            closeWriter();
        }
    }

    /**
     * Searches the {@code "text"} field and returns the URIs of all matching
     * documents, in the order the hits are returned by the searcher.
     *
     * @param search_query query in Lucene query parser syntax
     * @return the stored {@code "uri"} value of every hit; empty if nothing matched
     * @throws org.apache.lucene.queryParser.ParseException if the query cannot be parsed
     * @throws java.io.IOException if the index cannot be opened or read
     */
    public List<String> searchIndexForURIs(String search_query) throws ParseException, IOException {
        List<String> ret = new ArrayList<String>();
        reader = IndexReader.open(indexPath());
        try {
            Searcher searcher = new IndexSearcher(reader);
            Query query = parseQuery(search_query);
            Hits hits = searcher.search(query);
            for (int i = 0; i < hits.length(); i++) {
                Document doc = hits.doc(i);
                System.out.println(" * * searchIndexForURIs: hit: " + doc);
                ret.add(doc.get("uri"));
            }
        } finally {
            // Release the reader even when parsing or searching fails.
            closeReader();
        }
        return ret;
    }

    /**
     * Searches the {@code "text"} field and returns, for every matching
     * document, a two-element array of {@code {uri, text}}.
     *
     * @param search_query query in Lucene query parser syntax
     * @return one {@code String[2]} per hit: index 0 is the stored URI, index 1 the stored text
     * @throws java.lang.Exception if the query cannot be parsed or the index cannot be read
     */
    public List<String[]> searchIndexForURIsAndDocText(String search_query) throws Exception {
        List<String[]> ret = new ArrayList<String[]>();
        reader = IndexReader.open(indexPath());
        try {
            Searcher searcher = new IndexSearcher(reader);
            Query query = parseQuery(search_query);
            System.out.println(" * * test query: " + search_query);
            Hits hits = searcher.search(query);
            for (int i = 0; i < hits.length(); i++) {
                Document doc = hits.doc(i);
                System.out.println(" * * hit: " + doc);
                ret.add(new String[]{doc.get("uri"), doc.get("text")});
            }
        } finally {
            // Release the reader even when parsing or searching fails.
            closeReader();
        }
        return ret;
    }

    /** Returns the path of the index directory as a string. */
    private String indexPath() {
        return data_store_file_root + INDEX_SUBDIRECTORY;
    }

    /** Parses a query against the {@code "text"} field using a {@link StandardAnalyzer}. */
    private Query parseQuery(String search_query) throws ParseException {
        Analyzer analyzer = new StandardAnalyzer();
        QueryParser parser = new QueryParser("text", analyzer);
        return parser.parse(search_query);
    }

    /** Closes and forgets the current reader, if there is one. */
    private void closeReader() throws IOException {
        IndexReader open_reader = reader;
        reader = null;
        if (open_reader != null) {
            open_reader.close();
        }
    }

    /** Closes and forgets the current writer, if there is one. */
    private void closeWriter() throws IOException {
        IndexWriter open_writer = writer;
        writer = null;
        if (open_writer != null) {
            open_writer.close();
        }
    }

    /**
     * Recursively deletes a file or directory tree.
     *
     * @param filePath file or directory to delete; may be {@code null}
     * @return {@code true} if the path no longer exists afterwards because it
     *         was deleted or was never there; {@code false} if {@code filePath}
     *         is {@code null} or something could not be listed or deleted
     */
    private boolean deleteFilePath(File filePath) {
        System.out.println("deleteFile(" + filePath + ")");
        if (filePath == null) {
            return false;
        }
        if (!filePath.exists()) {
            // Nothing to delete (e.g. first run, before any index was created).
            return true;
        }
        if (filePath.isDirectory()) {
            String[] dirListing = filePath.list();
            if (dirListing == null) {
                // File.list() returns null when the directory cannot be read.
                System.out.println("Could not list: " + filePath.getAbsolutePath());
                return false;
            }
            // For each file/directory in listing, make recursive call.
            for (String child : dirListing) {
                if (!deleteFilePath(new File(filePath, child))) {
                    // Stop at the first failure and report it to the caller.
                    return false;
                }
            }
        }
        // Delete file or (now empty) directory.
        if (!filePath.delete()) {
            System.out.println("Could not delete: " + filePath.getAbsolutePath());
            return false;
        }
        return true;
    }

    /**
     * Small demo: builds a fresh index under {@code /tmp}, adds three
     * documents and runs the same query through both search methods.
     *
     * @param args unused
     * @throws Throwable if indexing or searching fails
     */
    public static void main(String[] args) throws Throwable {
        LuceneManager lm = new LuceneManager("/tmp");
        // start fresh: create a new index:
        lm.createAndClearLuceneIndex();
        lm.addDocumentToIndex("file://tmp/test1.txt", "This is a test for index and a test for search.");
        lm.addDocumentToIndex("file://tmp/test2.txt", "Please test the index code.");
        lm.addDocumentToIndex("file://tmp/test3.txt", "Please test the index code before tomorrow.");
        // get URIs of matching documents:
        List<String> doc_uris = lm.searchIndexForURIs("test, index");
        System.out.println("Matched document URIs: " + doc_uris);
        // get URIs and document text for matching documents:
        List<String[]> doc_uris_with_text = lm.searchIndexForURIsAndDocText("test, index");
        for (String[] uri_and_text : doc_uris_with_text) {
            System.out.println("Matched document URI: " + uri_and_text[0]);
            System.out.println(" document text: " + uri_and_text[1]);
        }
    }
}