package edu.unc.ils.mrc.hive.ir.lucene.search;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import edu.unc.ils.mrc.hive.ir.lucene.analysis.AutocompleteAnalyzer;

/**
 * Preliminary autocomplete implementation based on
 * http://stackoverflow.com/questions/120180/how-to-do-query-auto-completion-suggestions-in-lucene
 *
 * <p>Maintains a dedicated Lucene index of edge-n-grammed concept labels.
 * {@link #reIndex(Directory, String)} (re)builds that index from a source
 * vocabulary index; {@link #suggestTermsFor(String, int)} queries it.
 *
 * <p>NOTE(review): this class is not thread-safe — {@code reIndex} swaps the
 * reader/searcher underneath concurrent {@code suggestTermsFor} callers.
 *
 * @author craig.willis@unc.edu
 */
public final class Autocomplete {

    /* Lucene index field to store concept ID */
    private static final String ID_FIELD = "id";

    /* Lucene index field to store the grammed words */
    private static final String GRAMMED_WORDS_FIELD = "words";

    /* Lucene index field to store the full source word */
    private static final String SOURCE_WORD_FIELD = "sourceWord";

    /* Lucene index field to store the word count */
    private static final String COUNT_FIELD = "count";

    /* Lucene index field to store the sort order */
    private static final String SORT_FIELD = "sort";

    /*
     * Characters stripped from a label before gramming: spaces, parens,
     * brackets, periods, hyphens, commas, colons, apostrophes.
     * TODO: Need better way to strip characters from terms.
     */
    private static final String STRIP_CHARS_REGEX = "[()\\[\\].,:' -]";

    /* Lucene directory for autocomplete index */
    private Directory autoCompleteDirectory;

    private IndexReader autoCompleteReader;

    private IndexSearcher autoCompleteSearcher;

    /**
     * Opens (or prepares) the autocomplete index at the given path.
     * If an index already exists there, a reader is opened immediately;
     * otherwise {@link #reIndex(Directory, String)} must be called before
     * {@link #suggestTermsFor(String, int)}.
     *
     * @param autoCompleteDir filesystem path of the autocomplete index
     * @throws IOException if the directory or index cannot be opened
     */
    public Autocomplete(String autoCompleteDir) throws IOException {
        this.autoCompleteDirectory = FSDirectory.getDirectory(autoCompleteDir, null);
        if (exists(autoCompleteDir)) {
            reOpenReader();
        }
    }

    /**
     * Returns true if the given directory exists and contains at least one
     * file (taken as evidence that an index has been built there).
     *
     * @param autoCompleteDir filesystem path to check
     */
    public boolean exists(String autoCompleteDir) {
        String[] files = new File(autoCompleteDir).list();
        return files != null && files.length > 0;
    }

    /**
     * Returns a list of suggested terms for the specified string
     *
     * @param str String to suggest terms for
     * @param numTerms Number of terms to return
     * @return suggestions ordered by the stored sort field
     * @throws IOException
     * @throws ParseException
     */
    public List<AutocompleteTerm> suggestTermsFor(String str, int numTerms)
            throws IOException, ParseException {
        // Normalize the query the same way labels were normalized at index
        // time: drop spaces and lowercase (the analyzer lowercases grams).
        str = str.replaceAll(" ", "").toLowerCase();

        Query query = new TermQuery(new Term(GRAMMED_WORDS_FIELD, str));
        Sort sort = new Sort(SORT_FIELD, false);

        TopDocs docs = autoCompleteSearcher.search(query, null, numTerms, sort);
        List<AutocompleteTerm> suggestions = new ArrayList<AutocompleteTerm>();
        for (ScoreDoc scoreDoc : docs.scoreDocs) {
            // Fetch the stored document once per hit (the original fetched it
            // twice, once per field).
            Document hit = autoCompleteReader.document(scoreDoc.doc);
            suggestions.add(new AutocompleteTerm(hit.get(ID_FIELD), hit.get(SOURCE_WORD_FIELD)));
        }
        return suggestions;
    }

    /**
     * Creates the autocomplete index from a source Lucene index.
     *
     * <p>Reads every document's "prefLabel" / "id" stored fields, strips
     * punctuation from each label, and writes one autocomplete document per
     * distinct label. Labels are emitted in TreeMap (alphabetical) order and
     * that position is stored as the sort key.
     *
     * @param sourceDirectory source vocabulary index
     * @param fieldToAutocomplete source field whose docFreq is stored as the count
     * @throws CorruptIndexException
     * @throws IOException
     */
    public void reIndex(Directory sourceDirectory, String fieldToAutocomplete)
            throws CorruptIndexException, IOException {
        IndexReader sourceReader = IndexReader.open(sourceDirectory);

        // Use a custom analyzer so we can do EdgeNGramFiltering; "true"
        // recreates the index from scratch.
        IndexWriter writer = new IndexWriter(autoCompleteDirectory,
                new AutocompleteAnalyzer(), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            writer.setMergeFactor(300);
            writer.setMaxBufferedDocs(150);

            // TreeMap gives a deterministic alphabetical emit order, which
            // becomes the stored sort key below.
            Map<String, Integer> wordsMap = new TreeMap<String, Integer>();
            Map<String, String> idMap = new HashMap<String, String>();

            for (int i = 0; i < sourceReader.numDocs(); i++) {
                try {
                    Document d = sourceReader.document(i);
                    String[] prefLabels = d.getValues("prefLabel");
                    String[] ids = d.getValues("id");
                    for (String prefLabel : prefLabels) {
                        if (!wordsMap.containsKey(prefLabel)) {
                            // Use the number of documents this word appears in
                            wordsMap.put(prefLabel, sourceReader.docFreq(
                                    new Term(fieldToAutocomplete, prefLabel)));
                            idMap.put(prefLabel, ids[0]);
                        }
                    }
                } catch (Exception e) {
                    // Best-effort: skip documents that fail to load rather
                    // than aborting the whole rebuild.
                    e.printStackTrace();
                }
            }

            int i = 0;
            for (Map.Entry<String, Integer> entry : wordsMap.entrySet()) {
                i++;
                String word = entry.getKey();
                // Strip spaces/punctuation before gramming (single regex
                // replaces the original chain of ten replaceAll calls).
                String nospaces = word.replaceAll(STRIP_CHARS_REGEX, "");

                // ok index the word
                Document doc = new Document();
                doc.add(new Field(ID_FIELD, idMap.get(word),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field(SOURCE_WORD_FIELD, word,
                        Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
                doc.add(new Field(GRAMMED_WORDS_FIELD, nospaces,
                        Field.Store.YES, Field.Index.ANALYZED)); // grammed
                doc.add(new Field(COUNT_FIELD, Integer.toString(entry.getValue()),
                        Field.Store.NO, Field.Index.NOT_ANALYZED)); // count
                doc.add(new Field(SORT_FIELD, Integer.toString(i),
                        Field.Store.NO, Field.Index.NOT_ANALYZED)); // sort order
                writer.addDocument(doc);
            }

            writer.commit();
            writer.optimize();
        } finally {
            // Close both even if indexing failed, so the write lock and
            // source reader are always released.
            try {
                sourceReader.close();
            } finally {
                writer.close();
            }
        }

        // re-open our reader
        reOpenReader();
    }

    /**
     * Opens the autocomplete reader, or refreshes it if already open, and
     * rebuilds the searcher on top of it.
     */
    private void reOpenReader() throws CorruptIndexException, IOException {
        if (autoCompleteReader == null) {
            autoCompleteReader = IndexReader.open(autoCompleteDirectory);
        } else {
            // BUG FIX: IndexReader.reopen() returns a NEW reader when the
            // index has changed and leaves the original untouched. The
            // original code discarded the return value, so suggestions never
            // reflected a reIndex(). Swap in the new reader and close the
            // stale one.
            IndexReader newReader = autoCompleteReader.reopen();
            if (newReader != autoCompleteReader) {
                autoCompleteReader.close();
                autoCompleteReader = newReader;
            }
        }
        autoCompleteSearcher = new IndexSearcher(autoCompleteReader);
    }
}