LuceneManager.java example

Explorer

Java-AI-Book-Code-master
- mr_temp
  - nlp
    - com
      - knowledgebooks
        mapreduce
        NameFinder.java
        nlp
        ExtractNames.java
        util
        ScoredList.java
        Tokenizer.java
- src
  - database
    - CreateSampleDatabases.java
    - DumpMetaData.java
  - geneticalgorithm
    - Genetic.java
    - TestGenetic.java
  - markov
    - Markov.java
  - neuralnetworks
  - nlp
    - com
      - knowledgebooks
        mapreduce
        NameFinder.java
        nlp
        ASpellWrapper.java
        AutoTagger.java
        ComparableDocument.java
        ExtractNames.java
        FastTag.java
        util
        NameValue.java
        NoiseWords.java
        RunExternal.java
        ScoredList.java
        Tokenizer.java
    - public_domain
      - Stemmer.java
  - opencalais
    - OpenCalaisClient.java
  - powerloom
  - search
  - semanticweb
  - spelling
    - jazzy
      - SpellingJazzyTester.java
    - norvig
      - SpellingSuggestions.java
    - norvigwordpairs
      - SpellingSuggestionsWordPairs.java
  - textsearch
  - weka
    - WekaStocks.java
  - wordnet
    - WordNetTest.java

package textsearch;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.store.LockObtainFailedException;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Wrapper class for an embedded Lucene index that supports indexing and search.
 * 
 * <p/>
 * Copyright 2002-2012 by Mark Watson. All rights reserved.
 * <p/>
 * This software can be used under either of the following licenses:
 * <p/>
 * 1. LGPL v3<br/>
 * 2. Apache 2
 * <p/>
 */

public class LuceneManager {

  /** Path suffix of the index directory, appended to the data store root. */
  private static final String INDEX_DIR_SUFFIX = "/lucene_index";

  private final String data_store_file_root;

  // Writer/reader currently held open by an in-progress operation. Every
  // operation releases its own resource in a finally block, so these are
  // normally null between calls; close() releases anything still held.
  private IndexWriter writer;
  private IndexReader reader;

  /**
   * Creates a manager for the index stored in the "lucene_index" directory
   * beneath the given root. No files are touched until a method is called.
   *
   * @param data_store_file_root directory under which the index directory lives
   */
  public LuceneManager(String data_store_file_root) {
    this.data_store_file_root = data_store_file_root;
  }

  /**
   * If you want to start with a fresh index, call this method immediately
   * after creating a new instance of a LuceneManager. Any existing index
   * directory is deleted and an empty index is created in its place.
   *
   * @throws IOException if the empty index cannot be written
   * @throws LockObtainFailedException if the index write lock cannot be obtained
   * @throws CorruptIndexException if the index is corrupt
   */
  public void createAndClearLuceneIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
    File index_dir = new File(indexPath());
    // Only attempt the delete when there is something to delete; otherwise
    // the helper would report a misleading "Could not delete" failure.
    if (index_dir.exists()) {
      // Best effort: the writer below is opened with create=true and so
      // replaces the index contents even if some files could not be removed.
      deleteFilePath(index_dir);
    }
    new IndexWriter(index_dir, new StandardAnalyzer(), true).close();
  }

  /**
   * Releases any reader or writer still held by this manager. Safe to call
   * at any time, including on an instance that has never opened the index.
   *
   * @throws java.io.IOException if closing the reader or the writer fails
   */
  public void close() throws IOException {
    try {
      closeReader();
    } finally {
      // Still release the writer when closing the reader failed.
      closeWriter();
    }
  }

  /**
   * Adds one document to the existing index. The URI is stored but not
   * indexed; the text is stored and tokenized for searching. The index is
   * optimized and the writer closed before this method returns.
   *
   * @param document_original_uri URI identifying the document
   * @param document_plain_text plain text content to index
   * @throws org.apache.lucene.index.CorruptIndexException if the index is corrupt
   * @throws java.io.IOException if the index cannot be opened or written
   */
  public void addDocumentToIndex(String document_original_uri, String document_plain_text) throws CorruptIndexException, IOException {
    File index_dir = new File(indexPath());
    writer = new IndexWriter(index_dir, new StandardAnalyzer(), false);
    try {
      Document doc = new Document();
      doc.add(new Field("uri", document_original_uri, Field.Store.YES, Field.Index.NO));
      doc.add(new Field("text", document_plain_text, Field.Store.YES, Field.Index.TOKENIZED));
      writer.addDocument(doc);
      writer.optimize();
    } finally {
      // Always close the writer, even when adding the document failed.
      closeWriter();
    }
  }

  /**
   * Searches the "text" field of the index and returns the URIs of all
   * matching documents, in the order the hits are reported.
   *
   * @param search_query query string in Lucene query parser syntax
   * @return stored "uri" value of every matching document; empty if none match
   * @throws org.apache.lucene.queryParser.ParseException if the query cannot be parsed
   * @throws java.io.IOException if the index cannot be opened or read
   */
  public List<String> searchIndexForURIs(String search_query) throws ParseException, IOException {
    List<String> ret = new ArrayList<String>();
    reader = IndexReader.open(indexPath());
    try {
      Searcher searcher = new IndexSearcher(reader);
      Analyzer analyzer = new StandardAnalyzer();
      QueryParser parser = new QueryParser("text", analyzer);
      Query query = parser.parse(search_query);
      Hits hits = searcher.search(query);
      for (int i = 0; i < hits.length(); i++) {
        Document doc = hits.doc(i);
        System.out.println(" * * searchIndexForURIs: hit: " + doc);
        ret.add(doc.get("uri"));
      }
    } finally {
      // Close the reader on both the success and the failure path.
      closeReader();
    }
    return ret;
  }

  /**
   * Searches the "text" field of the index and returns, for every matching
   * document, a two-element array holding its URI and its stored text.
   *
   * @param search_query query string in Lucene query parser syntax
   * @return one {uri, text} pair per matching document; empty if none match
   * @throws java.lang.Exception if the query cannot be parsed or the index
   *         cannot be opened or read
   */
  public List<String[]> searchIndexForURIsAndDocText(String search_query) throws Exception {
    List<String[]> ret = new ArrayList<String[]>();
    reader = IndexReader.open(indexPath());
    try {
      Searcher searcher = new IndexSearcher(reader);
      Analyzer analyzer = new StandardAnalyzer();
      QueryParser parser = new QueryParser("text", analyzer);
      Query query = parser.parse(search_query);
      System.out.println(" * * test query: " + search_query);
      Hits hits = searcher.search(query);
      for (int i = 0; i < hits.length(); i += 1) {
        Document doc = hits.doc(i);
        System.out.println("     * *  hit: " + doc);
        ret.add(new String[]{doc.get("uri"), doc.get("text")});
      }
    } finally {
      // Close the reader on both the success and the failure path.
      closeReader();
    }
    return ret;
  }

  /** Returns the path of the index directory beneath the data store root. */
  private String indexPath() {
    return data_store_file_root + INDEX_DIR_SUFFIX;
  }

  /** Closes the held reader, if any, and forgets it so it is not closed twice. */
  private void closeReader() throws IOException {
    IndexReader held = reader;
    reader = null;
    if (held != null) {
      held.close();
    }
  }

  /** Closes the held writer, if any, and forgets it so it is not closed twice. */
  private void closeWriter() throws IOException {
    IndexWriter held = writer;
    writer = null;
    if (held != null) {
      held.close();
    }
  }

  /**
   * Deletes a file, or a directory together with everything below it.
   * Stops at the first entry that cannot be deleted.
   *
   * @param filePath file or directory to delete; may be null
   * @return true if the path and all of its children were deleted,
   *         false if the path is null or any delete failed
   */
  private boolean deleteFilePath(File filePath) {
    System.out.println("deleteFile(" + filePath + ")");
    if (filePath == null) {
      return false;
    }

    if (filePath.isDirectory()) {
      // File.list() returns null when the directory cannot be read; in that
      // case skip the recursion and let the delete below report the failure.
      String[] dirListing = filePath.list();
      if (dirListing != null) {
        for (int i = 0; i < dirListing.length; i++) {
          if (!deleteFilePath(new File(filePath, dirListing[i]))) {
            // A child could not be removed, so the parent cannot be either.
            return false;
          }
        }
      }
    }

    // Delete the file, or the now-empty directory.
    if (!filePath.delete()) {
      System.out.println("Could not delete: " + filePath.getAbsolutePath());
      return false;
    }

    return true;
  }

  /**
   * Small demonstration: builds a fresh index under /tmp, adds three
   * documents and runs the same query through both search methods.
   *
   * @param args unused
   * @throws Throwable if any index or search operation fails
   */
  public static void main(String[] args) throws Throwable {
    LuceneManager lm = new LuceneManager("/tmp");
    // start fresh: create a new index:
    lm.createAndClearLuceneIndex();
    lm.addDocumentToIndex("file://tmp/test1.txt", "This is a test for index and a test for search.");
    lm.addDocumentToIndex("file://tmp/test2.txt", "Please test the index code.");
    lm.addDocumentToIndex("file://tmp/test3.txt", "Please test the index code before tomorrow.");
    // get URIs of matching documents:
    List<String> doc_uris = lm.searchIndexForURIs("test, index");
    System.out.println("Matched document URIs: " + doc_uris);
    // get URIs and document text for matching documents:
    List<String[]> doc_uris_with_text = lm.searchIndexForURIsAndDocText("test, index");
    for (String[] uri_and_text : doc_uris_with_text) {
      System.out.println("Matched document URI:  " + uri_and_text[0]);
      System.out.println("        document text: " + uri_and_text[1]);
    }
  }

}