// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.standardization.index;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.CheckIndex.Status;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.talend.dataquality.standardization.i18n.Messages;
/**
 * Builds and maintains a Lucene index in which each document holds a reference word and its synonyms.
 * <p/>
 * Every document contains the reference word twice: once in {@link SynonymIndexSearcher#F_WORD} (stored, as
 * provided after trimming) and once in {@link SynonymIndexSearcher#F_WORDTERM} (trimmed and lower-cased, not
 * stored), the latter being the key used to update or delete the document. Synonyms are handled the same way with
 * {@link SynonymIndexSearcher#F_SYN} and {@link SynonymIndexSearcher#F_SYNTERM}.
 * <p/>
 * Most methods do not throw on functional failures (unknown word, ambiguous word, ...): they record a message that
 * can be read with {@link #getError()} and return a neutral value instead.
 *
 * @author scorreia, sizhaoliu
 */
public class SynonymIndexBuilder {

    private static final Logger LOG = Logger.getLogger(SynonymIndexBuilder.class);

    /**
     * Field type shared by the stored word and synonym fields: stored, indexed, norms omitted. It is frozen, hence
     * safe to reuse for every document instead of being rebuilt each time.
     */
    private static final FieldType STORED_FIELD_TYPE = createStoredFieldType();

    /** The Lucene directory of the index, set by {@link #initIndexInFS(String)}. */
    private Directory indexDir;

    /**
     * Default synonym separator is '|'.
     */
    private char separator = '|';

    /** Lazily created by {@link #getAnalyzer()}. */
    private Analyzer analyzer;

    /** Lazily created by {@link #getWriter()}. */
    private IndexWriter writer;

    /** Holder of the last error message recorded by this builder. */
    private final Error error = new Error();

    /**
     * SynonymIndexBuilder constructor. The index location must be given afterwards with
     * {@link #initIndexInFS(String)}.
     */
    public SynonymIndexBuilder() {
    }

    /**
     * Method "getError".
     *
     * @return the last error
     */
    public Error getError() {
        return this.error;
    }

    /**
     * Sets the separator used to split a string which contains several synonyms.
     *
     * @param synonymSeparator the character separating two synonyms
     */
    public void setSynonymSeparator(char synonymSeparator) {
        this.separator = synonymSeparator;
    }

    /**
     * Method "initIndexInFS" initializes the lucene index folder.
     * <p/>
     * When the folder cannot be opened, the failure is logged and recorded in {@link #getError()}; no exception is
     * thrown.
     *
     * @param path the path of the index (will be created if it does not exist)
     */
    public void initIndexInFS(String path) {
        File file = new File(path);
        if (!file.exists() && !file.mkdirs()) {
            // not fatal by itself: opening the directory below decides whether the path is usable
            LOG.warn("Could not create the index folder " + file.getAbsolutePath()); //$NON-NLS-1$
        }
        try {
            indexDir = FSDirectory.open(file);
        } catch (IOException e) {
            LOG.error(e.getMessage(), e);
            error.set(false, Messages.getString("SynonymIndexBuilder.failLoad"));//$NON-NLS-1$
        }
    }

    /**
     * Inserts a document into the index, without checking whether a document already exists for the word.
     *
     * @param word the reference word; when null, empty or made of whitespace only, nothing is inserted and an error
     * is recorded
     * @param synonyms the list of synonyms separated by the separator (can be null)
     * @return true if the document has been added, false if the reference word is missing
     * @throws IOException if the index cannot be written
     */
    public boolean insertDocument(String word, String synonyms) throws IOException {
        if (isBlankWord(word)) {
            error.set(false, Messages.getString("SynonymIndexBuilder.noRef"));//$NON-NLS-1$
            return false;
        }
        // insert document without duplication verification
        getWriter().addDocument(generateDocument(word, synonyms));
        return true;
    }

    /**
     * Inserts a document into the index if no document already exists for the given word.
     *
     * @param word the reference string; when null, empty or made of whitespace only, nothing is inserted and an
     * error is recorded
     * @param synonyms the synonyms (can be null)
     * @return true if inserted, false otherwise
     * @throws IOException if the index cannot be read or written
     */
    public boolean insertDocumentIfNotExists(String word, String synonyms) throws IOException {
        if (isBlankWord(word)) {
            // same validation as insertDocument(String, String)
            error.set(false, Messages.getString("SynonymIndexBuilder.noRef"));//$NON-NLS-1$
            return false;
        }
        if (searchDocumentByWord(word).totalHits == 0) {
            getWriter().addDocument(generateDocument(word, synonyms));
            return true;
        } // else
        error.set(false, Messages.getString("SynonymIndexBuilder.aDocument", word));//$NON-NLS-1$
        return false;
    }

    /**
     * Update an entire synonym document if and only if it exists and it's unique.
     * <p/>
     * WARNING If some changes in the index are not committed, this may cause trouble to find the document to update.
     * Make sure that a commit has been done before calling this method except if you know exactly what you are doing.
     * <p/>
     * WARNING! Beware that if several documents match the word, nothing will be done.
     *
     * @param word the reference word
     * @param synonyms the list of synonyms (can be null)
     * @return 1 if the document has been updated, 0 if no document matches the word, -1 if several documents match
     * (in which case an error is recorded)
     * @throws IOException if the index cannot be read or written
     */
    public int updateDocument(String word, String synonyms) throws IOException {
        int nbUpdatedDocuments = 0;
        TopDocs docs = searchDocumentByWord(word);
        switch (docs.totalHits) {
        case 0:
            break;
        case 1:
            getWriter().updateDocument(wordTerm(word), generateDocument(word, synonyms));
            nbUpdatedDocuments = 1;
            break;
        default:
            nbUpdatedDocuments = -1;// to avoid insertion by the component when nbUpdatedDocuments == 0
            error.set(false, Messages.getString("SynonymIndexBuilder.documents", docs.totalHits, word));//$NON-NLS-1$
            break;
        }
        return nbUpdatedDocuments;
    }

    /**
     * Deletes the document of the given word, if and only if exactly one document matches it.
     *
     * @param word the reference word of the document to delete
     * @return 1 if the document has been deleted, 0 otherwise (an error is then recorded)
     * @throws IOException if the index cannot be read or written
     */
    public int deleteDocumentByWord(String word) throws IOException {
        TopDocs docs = searchDocumentByWord(word);
        switch (docs.totalHits) {
        case 0:
            error.set(false, Messages.getString("SynonymIndexBuilder.doesnotExsit", word));//$NON-NLS-1$
            return 0;
        case 1:
            getWriter().deleteDocuments(wordTerm(word));
            return 1;
        default:
            error.set(false, Messages.getString("SynonymIndexBuilder.documents", docs.totalHits, word));//$NON-NLS-1$
            return 0;
        }
    }

    /**
     * delete all synonym documents.
     *
     * @throws IOException if the index cannot be written
     */
    public void deleteAllDocuments() throws IOException {
        getWriter().deleteAll();
    }

    /**
     * Add a synonym to an existing document. If several documents are found given the input word, nothing is done. If
     * the synonym is null or blank, nothing is done. If the synonym is the word itself or is already a synonym of the
     * word (ignoring case), nothing is done.
     *
     * @param word a word (must not be null)
     * @param newSynonym the new synonym to add to the list of synonyms
     * @return 1 if added or 0 if no change has been done
     * @throws IOException if the index cannot be read or written
     */
    public int addSynonymToDocument(String word, String newSynonym) throws IOException {
        if (newSynonym == null) {
            return 0;
        }
        // trim synonym
        String tempSynonym = newSynonym.trim();
        if (tempSynonym.length() == 0) {
            return 0;
        }
        int nbDocs = 0;
        SynonymIndexSearcher idxSearcher = getNewSynIdxSearcher();
        try {
            TopDocs docs = idxSearcher.searchDocumentByWord(word);
            switch (docs.totalHits) {
            case 0:
                error.set(false, Messages.getString("SynonymIndexBuilder.document", word));//$NON-NLS-1$
                break;
            case 1: // don't do anything if several documents match
                Document doc = idxSearcher.getDocument(docs.scoreDocs[0].doc);
                String[] synonyms = doc.getValues(SynonymIndexSearcher.F_SYN);
                Set<String> synonymList = new HashSet<String>();
                // the word is stored trimmed, so compare against the trimmed form
                boolean synExists = word != null && tempSynonym.equalsIgnoreCase(word.trim());
                for (String str : synonyms) {
                    if (str.equalsIgnoreCase(tempSynonym)) {
                        synExists = true;
                    }
                    synonymList.add(str);
                }
                // create a new document and replace the original one if synonym does not exist
                if (!synExists) {
                    synonymList.add(tempSynonym);
                    doc = generateDocument(doc.getValues(SynonymIndexSearcher.F_WORD)[0], synonymList);
                    getWriter().updateDocument(wordTerm(word), doc);
                    nbDocs = 1;
                }
                break;
            default:
                error.set(false, Messages.getString("SynonymIndexBuilder.documents", docs.totalHits, word));//$NON-NLS-1$
                break;
            }
        } finally {
            // FIXME avoid use of idxSearcher?
            // closed in a finally block so that the searcher is released even when the index access fails
            idxSearcher.close();
        }
        return nbDocs;
    }

    /**
     * Removes a synonym from the document of the given word. Nothing is done (and an error is recorded) when the
     * synonym is null, when it is the word itself, when it is not a synonym of the word, or when the word matches no
     * document or several documents. The synonym is compared ignoring case.
     *
     * @param word the reference word of the document
     * @param synonymToDelete the synonym to remove
     * @return the number of deleted synonyms
     * @throws IOException if the index cannot be read or written
     */
    public int removeSynonymFromDocument(String word, String synonymToDelete) throws IOException {
        if (synonymToDelete == null) {
            error.set(false, Messages.getString("SynonymIndexBuilder.theSynonym", word));//$NON-NLS-1$
            return 0;
        }
        String tempSynonymToDelete = synonymToDelete.trim();
        // the word is stored trimmed, so compare against the trimmed form
        String tempWord = word == null ? null : word.trim();
        if (tempSynonymToDelete.equalsIgnoreCase(tempWord)) {
            error.set(false, Messages.getString("SynonymIndexBuilder.synonymToDelete", tempSynonymToDelete, word));//$NON-NLS-1$
            return 0;
        }
        int deleted = 0;
        SynonymIndexSearcher newSynIdxSearcher = getNewSynIdxSearcher();
        try {
            TopDocs docs = newSynIdxSearcher.searchDocumentByWord(word);
            switch (docs.totalHits) {
            case 0:
                error.set(false, Messages.getString("SynonymIndexBuilder.documentNotExsit", word));//$NON-NLS-1$
                break;
            case 1:
                Document doc = newSynIdxSearcher.getDocument(docs.scoreDocs[0].doc);
                String[] synonyms = doc.getValues(SynonymIndexSearcher.F_SYN);
                Set<String> synonymList = new HashSet<String>();
                for (String str : synonyms) {
                    if (str.equals(tempWord)) {
                        // do nothing: generateDocument() never stores the word itself as a synonym.
                        continue;
                    }
                    if (str.equalsIgnoreCase(tempSynonymToDelete)) {
                        // we don't require the synonymToDelete to be case sensitive.
                        deleted++;
                    } else {
                        synonymList.add(str);
                    }
                }
                // if the value of deleted is 0, we can know that the synonymToDelete doesn't exist
                if (deleted == 0) {
                    error.set(false, Messages.getString("SynonymIndexBuilder.synonymNotExsit", tempSynonymToDelete));//$NON-NLS-1$
                } else {
                    doc = generateDocument(doc.getValues(SynonymIndexSearcher.F_WORD)[0], synonymList);
                    // the term must be built from the trimmed word, exactly as it was indexed; otherwise the old
                    // document is not replaced and a duplicate is added
                    getWriter().updateDocument(wordTerm(word), doc);
                }
                break;
            default:// don't do anything if more than one document is found
                error.set(false, Messages.getString("SynonymIndexBuilder.documents", docs.totalHits, word));//$NON-NLS-1$
                break;
            }
        } finally {
            // closed in a finally block so that the searcher is released even when the index access fails
            newSynIdxSearcher.close();
        }
        return deleted;
    }

    /**
     * Method "deleteIndexFromFS" deletes an index folder and the files it directly contains. A non empty folder is
     * deleted only if Lucene's index check recognizes it as an index, so that an arbitrary folder is not wiped by
     * mistake.
     *
     * @param path the path of the index
     * @return true if the path is deleted (and if the path did not exist); false otherwise, in which case an error is
     * recorded
     */
    public boolean deleteIndexFromFS(String path) {
        File folder = new File(path);
        if (!folder.exists()) {
            // folder does not exist. can create an index without deleting.
            return true;
        }
        if (!folder.isDirectory()) {// folder is a file
            error.set(false, Messages.getString("SynonymIndexBuilder.pathIsFile", folder.getAbsolutePath()));//$NON-NLS-1$
            return false;
        }
        File[] filelist = folder.listFiles();
        if (filelist == null) {
            // File.listFiles() returns null when the folder content cannot be read
            error.set(false, Messages.getString("SynonymIndexBuilder.couldNotDelete", folder.getAbsolutePath()));//$NON-NLS-1$
            return false;
        }
        if (filelist.length > 0) {
            Status status = checkIndexStatus(folder);
            if (status == null || status.missingSegments) {
                error.set(false, Messages.getString("SynonymIndexBuilder.notAnIndexFolder", folder.getAbsolutePath()));//$NON-NLS-1$
                return false;
            }
            // an index already exists in folder: try to delete every file, even after a first failure
            boolean allDeleted = true;
            for (File f : filelist) {
                if (!f.delete()) {
                    allDeleted = false;
                }
            }
            if (!allDeleted) {
                error.set(false, Messages.getString("SynonymIndexBuilder.couldNotDelete", folder.getAbsolutePath()));//$NON-NLS-1$
                return false;
            }
        }
        // the folder is empty at this point
        if (!folder.delete()) {
            error.set(false, Messages.getString("SynonymIndexBuilder.couldNotDelete", folder.getAbsolutePath()));//$NON-NLS-1$
            return false;
        }
        return true;
    }

    /**
     * Runs Lucene's index check on the given folder.
     *
     * @param folder an existing folder
     * @return the status of the check, or null if the folder could not be opened or checked
     */
    private Status checkIndexStatus(File folder) {
        FSDirectory directory = null;
        try {
            directory = FSDirectory.open(folder);
            return new CheckIndex(directory).checkIndex();
        } catch (IOException e) {
            LOG.error(e.getMessage(), e);
            return null;
        } finally {
            if (directory != null) {
                directory.close();
            }
        }
    }

    /**
     * Closes the index writer. Failures (including {@link CorruptIndexException}) are logged, not thrown.
     * <p/>
     * The writer is obtained through {@link #getWriter()}, hence it is created when it does not exist yet. According
     * to the original note (ytao, 2011/02/11), this is what allows initializing an index by calling only this method
     * after the folder has been opened. NOTE(review): whether index files are actually created then depends on the
     * Lucene writer; confirm before relying on it.
     * <p/>
     * Historical note (sizhaoliu): usingCreateMode is not used any more, searching is done by the separate
     * SynonymIndexSearcher.
     */
    public void closeIndex() {
        try {
            this.getWriter().close();
        } catch (IOException e) {
            // CorruptIndexException is an IOException: a single handler covers both
            LOG.error(e.getMessage(), e);
        }
    }

    /**
     * Commits all pending changes. Failures (including {@link CorruptIndexException}) are logged and recorded in
     * {@link #getError()}, not thrown.
     */
    public void commit() {
        try {
            this.getWriter().commit();
        } catch (IOException e) {
            // CorruptIndexException is an IOException: a single handler covers both
            error.set(false, e.getMessage());
            LOG.error(e.getMessage(), e);
        }
    }

    /**
     * Getter for analyzer. The analyzer is created on first call: a {@link StandardAnalyzer} with an empty stop word
     * set.
     *
     * @return the analyzer
     * @throws IOException never thrown by this implementation; kept for compatibility with existing callers
     */
    public Analyzer getAnalyzer() throws IOException {
        if (analyzer == null) {
            // the entry and the synonyms are indexed as provided
            // most used analyzer in lucene
            analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);
        }
        return this.analyzer;
    }

    /**
     * Getter for writer. The writer is created on first call, on the directory set by
     * {@link #initIndexInFS(String)}.
     *
     * @return the writer
     * @throws IOException if the index directory has not been initialized or if the writer cannot be created
     */
    IndexWriter getWriter() throws IOException {
        if (writer == null) {
            if (indexDir == null) {
                // fail with an explicit message instead of a NullPointerException inside Lucene
                throw new IOException("The index directory is not initialized, call initIndexInFS(String) first"); //$NON-NLS-1$
            }
            IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, this.getAnalyzer());
            writer = new IndexWriter(indexDir, config);
        }
        return this.writer;
    }

    /**
     * Method "getNumDocs".
     *
     * @return the number of documents or -1 if an error happened
     */
    public int getNumDocs() {
        try {
            return this.getWriter().numDocs();
        } catch (IOException e) {
            LOG.error(e.getMessage(), e);
            return -1;
        }
    }

    /**
     * @return a new searcher on the directory of this builder; the caller must close it
     * @throws IOException if the searcher cannot be created
     */
    private SynonymIndexSearcher getNewSynIdxSearcher() throws IOException {
        return new SynonymIndexSearcher(indexDir);
    }

    /**
     * @param word a reference word
     * @return true if the word is null, empty or made of whitespace only
     */
    private static boolean isBlankWord(String word) {
        return word == null || word.trim().length() == 0;
    }

    /**
     * Builds the term identifying the document of a word: the word is trimmed and lower-cased, exactly as it is
     * indexed by {@link #generateDocument(String, Set)}.
     *
     * @param word the reference word (must not be null)
     * @return the term to use to update or delete the document of the word
     */
    private static Term wordTerm(String word) {
        return new Term(SynonymIndexSearcher.F_WORDTERM, word.trim().toLowerCase());
    }

    /**
     * @return the frozen field type used for the stored word and synonym fields
     */
    private static FieldType createStoredFieldType() {
        FieldType ft = new FieldType();
        ft.setStored(true);
        ft.setIndexed(true);
        ft.setOmitNorms(true);
        ft.freeze();
        return ft;
    }

    /**
     * Generates a document from a word and a string of synonyms separated by the current separator.
     *
     * @param word the reference word (must not be null)
     * @param synonyms the synonyms separated by the separator (can be null)
     * @return the document
     */
    private Document generateDocument(String word, String synonyms) {
        Set<String> set = new HashSet<String>();
        if (synonyms != null) {
            StringTokenizer tokenizer = new StringTokenizer(synonyms, String.valueOf(separator));
            while (tokenizer.hasMoreTokens()) {
                set.add(tokenizer.nextToken());
            }
        }
        return generateDocument(word, set);
    }

    /**
     * Generates a document from a word and its synonyms. The word and the synonyms are trimmed; null or empty
     * synonyms and synonyms equal to the word are skipped.
     *
     * @param word the reference word (must not be null)
     * @param synonyms the synonyms (must not be null, may be empty)
     * @return the document
     */
    private Document generateDocument(String word, Set<String> synonyms) {
        String tempWord = word.trim();
        Document doc = new Document();
        doc.add(new Field(SynonymIndexSearcher.F_WORD, tempWord, STORED_FIELD_TYPE));
        doc.add(new StringField(SynonymIndexSearcher.F_WORDTERM, tempWord.toLowerCase(), Field.Store.NO));
        for (String syn : synonyms) {
            if (syn == null) {
                continue;
            }
            String tempSyn = syn.trim();
            if (tempSyn.length() > 0 && !tempSyn.equals(tempWord)) {
                doc.add(new Field(SynonymIndexSearcher.F_SYN, tempSyn, STORED_FIELD_TYPE));
                doc.add(new StringField(SynonymIndexSearcher.F_SYNTERM, tempSyn.toLowerCase(), Field.Store.NO));
            }
        }
        return doc;
    }

    /**
     * search a document by the word. use only inside the builder.
     *
     * @param word the reference word
     * @return the documents found for the word
     * @throws IOException if the index cannot be read
     */
    private TopDocs searchDocumentByWord(String word) throws IOException {
        // FIXME can we avoid the creation of a new searcher (use IndexReader.reopen?)
        SynonymIndexSearcher newSynIdxSearcher = getNewSynIdxSearcher();
        try {
            return newSynIdxSearcher.searchDocumentByWord(word);
        } finally {
            // closed in a finally block so that the searcher is released even when the search fails
            newSynIdxSearcher.close();
        }
    }
}