// ============================================================================ // // Copyright (C) 2006-2016 Talend Inc. - www.talend.com // // This source code is available under agreement available at // %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt // // You should have received a copy of the agreement // along with this program; if not, write to Talend SA // 9 rue Pages 92150 Suresnes, France // // ============================================================================ package org.talend.dataquality.standardization.index; import java.io.File; import java.io.IOException; import java.util.HashSet; import java.util.Set; import java.util.StringTokenizer; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.index.CheckIndex; import org.apache.lucene.index.CheckIndex.Status; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.talend.dataquality.standardization.i18n.Messages; /** * @author scorreia, sizhaoliu A class to create an index with synonyms. */ public class SynonymIndexBuilder { private static final Logger LOG = Logger.getLogger(SynonymIndexBuilder.class); private Directory indexDir; /** * Default synonym separator is '|'. */ private char separator = '|'; private Analyzer analyzer; private IndexWriter writer; private final Error error = new Error(); /** * SynonymIndexBuilder constructor. */ public SynonymIndexBuilder() { } /** * Method "getError". * * @return the last error */ public Error getError() { return this.error; } /** * set a separator for a string which contains synonyms. * * @param synonymSeparator */ public void setSynonymSeparator(char synonymSeparator) { this.separator = synonymSeparator; } /** * Method "initIndexInFS" initializes the lucene index folder. * * @param path the path of the index (will be created if it does not exist) */ public void initIndexInFS(String path) { File file = new File(path); if (!file.exists()) { file.mkdirs(); } try { indexDir = FSDirectory.open(file); } catch (IOException e) { error.set(false, Messages.getString("SynonymIndexBuilder.failLoad"));//$NON-NLS-1$ } } /** * insert an entire document into index. * * @param word the reference word: must not be null * @param synonyms the list of synonyms separated by the separator (can be null) * @throws IOException */ public boolean insertDocument(String word, String synonyms) throws IOException { if (word.length() == 0) { error.set(false, Messages.getString("SynonymIndexBuilder.noRef"));//$NON-NLS-1$ return false; } // insert document without duplication verification getWriter().addDocument(generateDocument(word, synonyms)); return true; } /** * insert an entire document into index if it does not already exists. * * @param word the reference string * @param synonyms the synonyms (can be null) * @return true if inserted, false otherwise * @throws IOException */ public boolean insertDocumentIfNotExists(String word, String synonyms) throws IOException { if (searchDocumentByWord(word).totalHits == 0) { getWriter().addDocument(generateDocument(word, synonyms)); return true; } // else error.set(false, Messages.getString("SynonymIndexBuilder.aDocument", word));//$NON-NLS-1$ return false; } /** * Update an entire synonym document if and only if it exists and it's unique. * <p/> * WARNING If some changes in the index are not committed, this may cause trouble to find the document to update. * Make sure that a commit has been done before calling this method except if you know exactly what you are doing. * <p/> * WARNING! Beware that if several documents match the word, nothing will be done. * * @param word the reference word * @param synonyms the list of synonyms (can be null) * @throws IOException */ public int updateDocument(String word, String synonyms) throws IOException { int nbUpdatedDocuments = 0; TopDocs docs = searchDocumentByWord(word); switch (docs.totalHits) { case 0: break; case 1: getWriter().updateDocument(new Term(SynonymIndexSearcher.F_WORDTERM, word.trim().toLowerCase()), generateDocument(word, synonyms)); nbUpdatedDocuments = 1; break; default: nbUpdatedDocuments = -1;// to avoid insertion by the component when nbUpdatedDocuments == 0 error.set(false, Messages.getString("SynonymIndexBuilder.documents", docs.totalHits, word));//$NON-NLS-1$ break; } return nbUpdatedDocuments; } /** * delete an entire document by word. * * @param word * @throws IOException */ public int deleteDocumentByWord(String word) throws IOException { TopDocs docs = searchDocumentByWord(word); switch (docs.totalHits) { case 0: error.set(false, Messages.getString("SynonymIndexBuilder.doesnotExsit", word));//$NON-NLS-1$ return 0; case 1: getWriter().deleteDocuments(new Term(SynonymIndexSearcher.F_WORDTERM, word.trim().toLowerCase())); return 1; default: error.set(false, Messages.getString("SynonymIndexBuilder.documents", docs.totalHits, word));//$NON-NLS-1$ break; } return 0; } /** * delete all synonym documents. * * @throws IOException */ public void deleteAllDocuments() throws IOException { getWriter().deleteAll(); } /** * Add a synonym to an existing document. If several documents are found given the input word, nothing is done. If * the synonym is null, nothing is done. * * @param word a word (must not be null) * @param newSynonym the new synonym to add to the list of synonyms * @return 1 if added or 0 if no change has been done * @throws IOException */ public int addSynonymToDocument(String word, String newSynonym) throws IOException { if (newSynonym == null) { return 0; } // trim synonym String tempSynonym = newSynonym.trim(); if (tempSynonym.length() == 0) { return 0; } // reuse related synonym index search instead of created a new search SynonymIndexSearcher idxSearcher = getNewSynIdxSearcher(); TopDocs docs = idxSearcher.searchDocumentByWord(word); int nbDocs = 0; switch (docs.totalHits) { case 0: error.set(false, Messages.getString("SynonymIndexBuilder.document", word));//$NON-NLS-1$ break; case 1: // don't do anything if several documents match Document doc = idxSearcher.getDocument(docs.scoreDocs[0].doc); String[] synonyms = doc.getValues(SynonymIndexSearcher.F_SYN); Set<String> synonymList = new HashSet<String>(); boolean synExists = false; if (tempSynonym.equalsIgnoreCase(word)) { synExists = true; } for (String str : synonyms) { if (str.equalsIgnoreCase(tempSynonym)) { synExists = true; } synonymList.add(str); } // create a new document and replace the original one if synonym does not exist if (!synExists) { synonymList.add(tempSynonym); doc = generateDocument(doc.getValues(SynonymIndexSearcher.F_WORD)[0], synonymList); getWriter().updateDocument(new Term(SynonymIndexSearcher.F_WORDTERM, word.trim().toLowerCase()), doc); nbDocs = 1; } break; default: error.set(false, Messages.getString("SynonymIndexBuilder.documents", docs.totalHits, word));//$NON-NLS-1$ } // FIXME avoid use of idxSearcher? idxSearcher.close(); return nbDocs; } /** * remove a synonym from the document to which it belongs. * * @param synonymToDelete * @return the number of deleted synonyms * @throws IOException */ public int removeSynonymFromDocument(String word, String synonymToDelete) throws IOException { if (synonymToDelete == null) { error.set(false, Messages.getString("SynonymIndexBuilder.theSynonym", word));//$NON-NLS-1$ return 0; } String tempSynonymToDelete = synonymToDelete.trim(); if (tempSynonymToDelete.equalsIgnoreCase(word)) { error.set(false, Messages.getString("SynonymIndexBuilder.synonymToDelete", tempSynonymToDelete, word));//$NON-NLS-1$ return 0; } int deleted = 0; SynonymIndexSearcher newSynIdxSearcher = getNewSynIdxSearcher(); TopDocs docs = newSynIdxSearcher.searchDocumentByWord(word); switch (docs.totalHits) { case 0: error.set(false, Messages.getString("SynonymIndexBuilder.documentNotExsit", word));//$NON-NLS-1$ deleted = 0; break; case 1: Document doc = newSynIdxSearcher.getDocument(docs.scoreDocs[0].doc); String[] synonyms = doc.getValues(SynonymIndexSearcher.F_SYN); Set<String> synonymList = new HashSet<String>(); for (String str : synonyms) { if (str.equals(word)) { // do nothing. because the word will be added to the document // automatically in the method generateDocument(). } else if (str.equalsIgnoreCase(tempSynonymToDelete)) { // we don't require the synonymToDelete to be case sensitive. deleted++; } else { synonymList.add(str); } } // if the value of deleted is 0, we can know that the synonymToDelete doesn't exist if (deleted == 0) { error.set(false, Messages.getString("SynonymIndexBuilder.synonymNotExsit", tempSynonymToDelete));//$NON-NLS-1$ } else { doc = generateDocument(doc.getValues(SynonymIndexSearcher.F_WORD)[0], synonymList); getWriter().updateDocument(new Term(SynonymIndexSearcher.F_WORDTERM, word.toLowerCase()), doc); } break; default:// don't do anything if more than one document is found error.set(false, Messages.getString("SynonymIndexBuilder.documents", docs.totalHits, word));//$NON-NLS-1$ } newSynIdxSearcher.close(); return deleted; } /** * Method "deleteIndexFromFS". * * @param path the path of the index * @return true if the path is deleted (and if the path did not exist) */ public boolean deleteIndexFromFS(String path) { File folder = new File(path); if (!folder.exists()) { // folder does not exist. can create an index without deleting. return true; } if (folder.isDirectory()) { File[] filelist = folder.listFiles(); if (filelist.length == 0) {// folder is empty if (!folder.delete()) { error.set(false, Messages.getString("SynonymIndexBuilder.couldNotDelete", folder.getAbsolutePath()));//$NON-NLS-1$ return false; } } else { Status status = null; FSDirectory directory = null; try { directory = FSDirectory.open(folder); CheckIndex check = new CheckIndex(directory); status = check.checkIndex(); } catch (IOException e) { LOG.error(e); } finally { if (directory != null) { directory.close(); } } boolean allDeleted = true; if (status == null || status.missingSegments) { error.set(false, Messages.getString("SynonymIndexBuilder.notAnIndexFolder", folder.getAbsolutePath()));//$NON-NLS-1$ return false; } else {// an index already exists in folder for (File f : filelist) { if (!f.delete() && allDeleted) { allDeleted = false; } } if (allDeleted && !folder.delete()) { allDeleted = false; } if (!allDeleted) { error.set(false, Messages.getString("SynonymIndexBuilder.couldNotDelete", folder.getAbsolutePath()));//$NON-NLS-1$ return false; } } } } else {// folder is a file error.set(false, Messages.getString("SynonymIndexBuilder.pathIsFile", folder.getAbsolutePath()));//$NON-NLS-1$ return false; } return true; } /** * ADDED BY ytao 2011/02/11 If only need to initialize the index, do nothing after fold open, but just invoke this * method at the end, index will be reset. * <p/> * (Ensure that usingCreateMode is true) // where is it ensured? who wrote this sentence? * <p/> * Not sure that the index is deleted and recreated, may be just delete all documents of index since the index files * are "_1a.cfs" and "segments.gen" and "segments_1e" currently, if these files are not exists, API will not work. * <p/> * ADDED by sizhaoliu : usingCreateMode is not used any more. we now have a separated SynonymIndexSearcher. */ public void closeIndex() { try { this.getWriter().close(); } catch (CorruptIndexException e) { LOG.error(e); } catch (IOException e) { LOG.error(e); } } /** * Commits all pending changes. */ public void commit() { try { this.getWriter().commit(); } catch (CorruptIndexException e) { error.set(false, e.getMessage()); LOG.error(e); } catch (IOException e) { error.set(false, e.getMessage()); LOG.error(e); } } /** * Getter for analyzer. * * @return the analyzer * @throws IOException */ public Analyzer getAnalyzer() throws IOException { if (analyzer == null) { // the entry and the synonyms are indexed as provided // most used analyzer in lucene analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET); } return this.analyzer; } /** * Getter for writer. * * @return the writer * @throws IOException * @throws */ IndexWriter getWriter() throws IOException { if (writer == null) { IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, this.getAnalyzer()); writer = new IndexWriter(indexDir, config); } return this.writer; } /** * Method "getNumDocs". * * @return the number of documents or -1 if an error happened */ public int getNumDocs() { try { return this.getWriter().numDocs(); } catch (IOException e) { LOG.error(e); return -1; } } private SynonymIndexSearcher getNewSynIdxSearcher() throws IOException { return new SynonymIndexSearcher(indexDir); } private Document generateDocument(String word, String synonyms) { Set<String> set = new HashSet<String>(); if (synonyms != null) { StringTokenizer tokenizer = new StringTokenizer(synonyms, String.valueOf(separator)); while (tokenizer.hasMoreTokens()) { set.add(tokenizer.nextToken()); } } return generateDocument(word, set); } /** * generate a document. * * @param word * @param synonyms * @return */ private Document generateDocument(String word, Set<String> synonyms) { String tempWord = word.trim(); Document doc = new Document(); FieldType ft = new FieldType(); ft.setStored(true); ft.setIndexed(true); ft.setOmitNorms(true); ft.freeze(); Field wordField = new Field(SynonymIndexSearcher.F_WORD, tempWord, ft); doc.add(wordField); Field wordTermField = new StringField(SynonymIndexSearcher.F_WORDTERM, tempWord.toLowerCase(), Field.Store.NO); doc.add(wordTermField); for (String syn : synonyms) { if (syn != null) { syn = syn.trim(); if (syn.length() > 0 && !syn.equals(tempWord)) { doc.add(new Field(SynonymIndexSearcher.F_SYN, syn, ft)); doc.add(new StringField(SynonymIndexSearcher.F_SYNTERM, syn.toLowerCase(), Field.Store.NO)); } } } return doc; } /** * search a document by the word. use only inside the builder. * * @param word * @return * @throws IOException */ private TopDocs searchDocumentByWord(String word) throws IOException { TopDocs docs = null; // FIXME can we avoid the creation of a new searcher (use IndexReader.reopen?) SynonymIndexSearcher newSynIdxSearcher = getNewSynIdxSearcher(); docs = newSynIdxSearcher.searchDocumentByWord(word); newSynIdxSearcher.close(); return docs; } }