// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.standardization.index;
import static org.junit.Assert.*;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.junit.Before;
import org.junit.Test;
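/**
 * Tests for {@link SynonymIndexBuilder}: documents and synonyms are inserted, updated and deleted in an index stored on
 * the file system, and the result is checked through {@link SynonymIndexSearcher}.
 */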
public class SynonymIndexBuilderTest {

    /**
     * Directory of the index shared by most tests, relative to the working directory. The absolute path will be
     * "path/to/svn/top/org.talend.dataquality.standardization.test/data/index".
     * <p>
     * Deliberately not final: {@link #setUp()} appends "1" to it whenever the files of the current directory cannot be
     * deleted, so that the tests run against a directory they can actually write to.
     */
    static String path = "data/index";

    /**
     * Reference data: each entry is a pair { word, synonyms separated by '|' }.
     * <p>
     * ATTENTION: Be careful when changing this list of synonyms, they are also used in SynonymIndexSearcherTest.
     */
    public static String[][] synonyms = { { "I.B.M.", "IBM|International Business Machines|Big Blue" },
            { "IRTY", "IBM|International Business Machines" }, { "ISDF", "IBM|International Business Machines|Big Blue" },
            { "ANPE", "A.N.P.E.|Agence Nationale Pour l'Emploi|Pôle Emploi" },
            { "TEST", "A.N.P.E.|Agence Nationale Pour l'Emploi|Pôle Emploi" }, { "Sécurité Sociale", "Sécu|SS|CPAM" },
            { "IAIDQ", "International Association for Information & Data Quality|Int. Assoc. Info & DQ" }, };

    /** Set to true to print a banner line at the beginning of each test. */
    private static final boolean SHOW_IN_CONSOLE = false;

    /**
     * Empties the index directory before each test. When a file cannot be removed, the tests switch to a sibling
     * directory (current path + "1") and the cleanup is attempted again on that one.
     */
    @Before
    public void setUp() {
        while (!clearFolder(new File(path))) {
            path = path + "1"; //$NON-NLS-1$
        }
    }

    /**
     * Deletes the direct children of the given folder, stopping at the first failure.
     *
     * @param folder the directory to empty
     * @return true when the folder does not exist or all of its children were deleted; false when a child could not be
     * deleted or when the path cannot be listed (e.g. it is not a directory)
     */
    private static boolean clearFolder(File folder) {
        if (!folder.exists()) {
            return true;
        }
        File[] children = folder.listFiles();
        if (children == null) {
            // listFiles() returns null when the path is not a directory or an I/O error occurs
            return false;
        }
        for (File child : children) {
            if (!child.delete()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Best-effort removal of a folder and of the files directly inside it. Failures are ignored on purpose.
     *
     * @param filePath path of the folder to remove
     */
    private void removePhisically(String filePath) {
        File folder = new File(filePath);
        if (!folder.exists()) {
            return;
        }
        File[] children = folder.listFiles();
        if (children != null) { // null when the path is not a directory or cannot be listed
            for (File child : children) {
                child.delete();
            }
        }
        folder.delete();
    }

    /**
     * Opens a new searcher on the index located at {@link #path}, using the analyzer of the given builder and a top
     * document limit of 5.
     *
     * @param builder the builder whose analyzer is reused
     * @return a searcher opened on the index; the caller is responsible for closing it
     * @throws IllegalStateException when the index cannot be opened, so that the calling test fails immediately
     * instead of continuing with an unusable searcher
     */
    SynonymIndexSearcher getSearcher(SynonymIndexBuilder builder) {
        SynonymIndexSearcher searcher = new SynonymIndexSearcher();
        try {
            searcher.setAnalyzer(builder.getAnalyzer());
            searcher.openIndexInFS(path);
        } catch (IOException e) {
            throw new IllegalStateException("Unable to open the index at " + path, e);
        }
        searcher.setTopDocLimit(5);
        return searcher;
    }

    /**
     * Intentionally empty: each test closes its own builder and searcher.
     */
    @Override
    public void finalize() throws Exception {
        // nothing to release here
    }

    /**
     * Inserts the reference {@link #synonyms} into the index and commits.
     *
     * @param builder the builder to insert into
     * @throws IOException when the insertion or the commit fails
     */
    void insertDocuments(SynonymIndexBuilder builder) throws IOException {
        insertDocuments(builder, synonyms);
    }

    /**
     * Inserts one document per { word, synonyms } pair and commits once at the end.
     *
     * @param builder the builder to insert into
     * @param synonyms pairs of { word, synonyms separated by '|' }
     * @throws IOException when the insertion or the commit fails
     */
    void insertDocuments(SynonymIndexBuilder builder, String[][] synonyms) throws IOException {
        for (String[] syns : synonyms) {
            builder.insertDocument(syns[0], syns[1]);
        }
        builder.commit();
    }

    /**
     * Creates a builder and initializes its index in the given directory.
     *
     * @param p path of the index directory
     * @return the initialized builder
     */
    private SynonymIndexBuilder createNewIndexBuilder(String p) {
        SynonymIndexBuilder builder = new SynonymIndexBuilder();
        builder.initIndexInFS(p);
        return builder;
    }

    /**
     * A document is added only when its word is not yet in the index.
     */
    @Test
    public void testInsertDocumentIfNotExists() throws Exception {
        printLineToConsole("\n---------------Test addDocument------------------");
        SynonymIndexBuilder builder = createNewIndexBuilder(path);
        insertDocuments(builder);

        // a new word: one more document
        builder.insertDocumentIfNotExists("ADD", "This|is|a|new|document");
        builder.commit();
        SynonymIndexSearcher searcher = getSearcher(builder);
        assertEquals(synonyms.length + 1, searcher.getNumDocs());

        // an existing word: the number of documents does not change
        builder.insertDocumentIfNotExists("ANPE", "This|is|an|existing|document");
        builder.commit();
        searcher.close();
        searcher = getSearcher(builder);
        // query the searcher BEFORE closing it
        assertEquals(synonyms.length + 1, searcher.getNumDocs());
        searcher.close();

        // another new word whose synonym already exists elsewhere: one more document
        builder.insertDocumentIfNotExists("Irish Bar Managers", "IBM");
        builder.commit();
        searcher = getSearcher(builder);
        assertEquals(synonyms.length + 2, searcher.getNumDocs());
        searcher.close();

        builder.closeIndex();
    }

    /**
     * After inserting the reference data, the index contains one document per entry.
     */
    @Test
    public void testInsertDocuments() throws Exception {
        printLineToConsole("\n---------------Test insertDocuments--------------");
        SynonymIndexBuilder builder = createNewIndexBuilder(path);
        insertDocuments(builder);

        SynonymIndexSearcher searcher = getSearcher(builder);
        assertEquals(synonyms.length, searcher.getNumDocs());

        builder.closeIndex();
        searcher.close();
    }

    /**
     * Updating an existing word replaces its synonyms; updating an unknown word does not create a document.
     */
    @Test
    public void testUpdateSynonymDocument() throws Exception {
        printLineToConsole("\n---------------Test updateDocument---------------");
        SynonymIndexBuilder builder = createNewIndexBuilder(path);
        insertDocuments(builder);

        SynonymIndexSearcher searcher = getSearcher(builder);
        assertEquals(0, searcher.searchDocumentBySynonym("updated").totalHits);

        builder.updateDocument("Sécurité Sociale", "I|have|been|updated");
        builder.commit();
        // a searcher only sees the state of the index at the time it was opened: reopen it
        searcher.close();
        searcher = getSearcher(builder);
        assertEquals(1, searcher.searchDocumentBySynonym("updated").totalHits);

        builder.updateDocument("INEXIST", "I|don't|exist");
        builder.commit();
        searcher.close();
        searcher = getSearcher(builder);
        assertEquals(0, searcher.searchDocumentBySynonym("exist").totalHits);

        builder.closeIndex();
        searcher.close();
    }

    /**
     * Checks the value returned by updateDocument (0: no match, -1: several matches, 1: updated) and the content of the
     * index afterwards, on a dedicated index containing several documents with the same word.
     */
    @Test
    public void testUpdateSynonymDocument2() throws Exception {
        printLineToConsole("\n---------------Test updateDocument2---------------");

        // --- create a new index with several similar documents
        SynonymIndexBuilder synIdxBuild = new SynonymIndexBuilder();
        String idxPath = "data/test_update";
        removePhisically(idxPath);
        synIdxBuild.deleteIndexFromFS(idxPath);
        synIdxBuild.initIndexInFS(idxPath);

        int maxDoc = 4;
        String word = "salut";
        for (int i = 0; i < maxDoc; i++) {
            synIdxBuild.insertDocument(word, "synonym|toto");
        }
        String toupdate = "The document to update.";
        synIdxBuild.insertDocument(toupdate, "this document will be updated");

        int nbDocInIndex = maxDoc + 1;
        assertEquals(nbDocInIndex, synIdxBuild.getNumDocs());
        synIdxBuild.commit();

        // --- unknown word: nothing updated, nothing inserted
        int nbUpdatedDocuments = synIdxBuild.updateDocument("unknown", "new syn");
        assertEquals("there should be no document to update", 0, nbUpdatedDocuments);
        assertEquals("The document should not be inserted here", nbDocInIndex, synIdxBuild.getNumDocs());

        // --- ambiguous word: update refused
        nbUpdatedDocuments = synIdxBuild.updateDocument(word, "new syn");
        assertEquals("no update should be done because several documents match the word " + word, -1, nbUpdatedDocuments);

        // --- unique word: updated
        nbUpdatedDocuments = synIdxBuild.updateDocument(toupdate, "a new list of 3 synonyms|test|ok");
        assertEquals("One document should be updated", 1, nbUpdatedDocuments);

        synIdxBuild.commit();
        synIdxBuild.closeIndex();

        // --- check the content of the index
        SynonymIndexSearcher search = new SynonymIndexSearcher();
        search.setTopDocLimit(maxDoc); // retrieve all possible documents
        search.openIndexInFS(idxPath);

        TopDocs salutDocs = search.searchDocumentByWord(word);
        assertEquals(maxDoc, salutDocs.totalHits);
        for (ScoreDoc scoreDoc : salutDocs.scoreDocs) {
            Document document = search.getDocument(scoreDoc.doc);
            // The order of the synonyms is not important (sizhaoliu, 08 Sep 2011): only their presence is checked.
            String[] wordValues = document.getValues(SynonymIndexSearcher.F_WORD);
            String[] synValues = document.getValues(SynonymIndexSearcher.F_SYN);
            List<String> synList = Arrays.asList(synValues);

            // expect to see "synonym" and "toto"
            assertEquals(synList.toString(), 2, synValues.length);
            assertTrue(Arrays.asList(wordValues).contains(word));
            assertTrue(synList.contains("synonym"));
            assertTrue(synList.contains("toto"));
        }

        TopDocs updatedDocs = search.searchDocumentByWord(toupdate);
        assertEquals("there should be only 1 document after the update", 1, updatedDocs.totalHits);
        for (ScoreDoc scoreDoc : updatedDocs.scoreDocs) {
            Document document = search.getDocument(scoreDoc.doc);
            // The order of the synonyms is not important (sizhaoliu, 08 Sep 2011): only their presence is checked.
            String[] wordValues = document.getValues(SynonymIndexSearcher.F_WORD);
            String[] synValues = document.getValues(SynonymIndexSearcher.F_SYN);
            List<String> synList = Arrays.asList(synValues);

            // expect to see the three synonyms given in the update
            assertEquals("there should be 3 synonyms", 3, synValues.length);
            assertTrue(Arrays.asList(wordValues).contains(toupdate));
            assertTrue(synList.contains("a new list of 3 synonyms"));
            assertTrue(synList.contains("test"));
            assertTrue(synList.contains("ok"));
        }
        search.close();
    }

    /**
     * Deleting an existing word removes exactly one document; deleting an unknown word changes nothing.
     */
    @Test
    public void testDeleteDocumentByWord() throws IOException {
        printLineToConsole("\n---------------Test deleteDocument---------------");
        SynonymIndexBuilder builder = createNewIndexBuilder(path);
        insertDocuments(builder);

        SynonymIndexSearcher searcher = getSearcher(builder);
        int docCount = searcher.getNumDocs();
        assertEquals(1, searcher.searchDocumentByWord("IAIDQ").totalHits);
        searcher.close();

        // The word to delete should be precise and case sensitive.
        // NOTE: a check that deleting "iaidq" (lower case) leaves the index untouched used to be here and is disabled.
        builder.deleteDocumentByWord("IAIDQ");
        builder.commit();
        searcher = getSearcher(builder);
        assertEquals(docCount - 1, searcher.getNumDocs());
        searcher.close();

        builder.deleteDocumentByWord("random");
        builder.commit();
        searcher = getSearcher(builder);
        assertEquals(docCount - 1, searcher.getNumDocs());
        searcher.close();

        builder.closeIndex();
    }

    /**
     * A synonym is appended only when the word exists and the synonym is not already present.
     */
    @Test
    public void testAddSynonymToWord() throws IOException {
        printLineToConsole("\n---------------Test addSynonymToWord-------------");
        SynonymIndexBuilder builder = createNewIndexBuilder(path);
        insertDocuments(builder);

        SynonymIndexSearcher searcher = getSearcher(builder);
        assertEquals(0, searcher.searchDocumentBySynonym("another").totalHits);
        int expectedSynonymCount = searcher.getSynonymCount("ANPE");

        // a new synonym: appended
        int addedSynonymToDocument = builder.addSynonymToDocument("ANPE", "Another synonym of ANPE");
        builder.commit();
        searcher.close();
        assertEquals("1 new synonym should be appended to the list.", 1, addedSynonymToDocument);
        expectedSynonymCount++;
        searcher = getSearcher(builder);
        assertEquals(1, searcher.searchDocumentBySynonym("another").totalHits);
        assertEquals(expectedSynonymCount, searcher.getSynonymCount("ANPE"));

        // an existing synonym: ignored
        addedSynonymToDocument = builder.addSynonymToDocument("ANPE", "Anpe");
        builder.commit();
        searcher.close();
        assertEquals("anpe already exists, no synonym should be appended to the list.", 0, addedSynonymToDocument);
        searcher = getSearcher(builder);
        assertEquals(expectedSynonymCount, searcher.getSynonymCount("ANPE"));

        // an unknown word: no document is created
        builder.addSynonymToDocument("ANPEEEE", "A.N.P.E");
        builder.commit();
        searcher.close();
        searcher = getSearcher(builder);
        assertEquals(0, searcher.searchDocumentByWord("ANPEEEE").totalHits);
        searcher.close();

        builder.closeIndex();
    }

    /**
     * Synonyms are removed one by one from a word; the returned value tells whether something was removed.
     */
    @Test
    public void testRemoveSynonymFromWord() throws IOException {
        printLineToConsole("\n---------------Test removeSynonymFromWord-----------");
        SynonymIndexBuilder builder = createNewIndexBuilder(path);
        insertDocuments(builder);

        SynonymIndexSearcher searcher = getSearcher(builder);
        int synonymCount = searcher.getSynonymCount("ANPE");

        // the synonym to delete should be precise and case sensitive
        int removed = builder.removeSynonymFromDocument("ANPE", "Agence Nationale Pour l'Emploi");
        assertEquals(1, removed);
        builder.commit();
        searcher.close();
        synonymCount--;
        searcher = getSearcher(builder);
        assertEquals(synonymCount, searcher.getSynonymCount("ANPE"));

        // "Anpe" is not one of the synonyms: nothing removed
        removed = builder.removeSynonymFromDocument("ANPE", "Anpe");
        assertEquals(0, removed);
        builder.commit();
        searcher.close();
        searcher = getSearcher(builder);
        assertEquals(synonymCount, searcher.getSynonymCount("ANPE"));

        removed = builder.removeSynonymFromDocument("ANPE", "A.N.P.E.");
        assertEquals(1, removed);
        removed = builder.removeSynonymFromDocument("ANPE", "A.N.P.E.");
        assertEquals("We did not commit, so we still should find a synonym to delete here", 1, removed);
        builder.commit();
        removed = builder.removeSynonymFromDocument("ANPE", "A.N.P.E.");
        assertEquals(0, removed);
        searcher.close();
        synonymCount--;
        searcher = getSearcher(builder);
        assertEquals(synonymCount, searcher.getSynonymCount("ANPE"));

        removed = builder.removeSynonymFromDocument("ANPE", "Pôle Emploi");
        assertEquals(1, removed);
        builder.commit();
        searcher.close();
        synonymCount--;
        searcher = getSearcher(builder);
        assertEquals(synonymCount, searcher.getSynonymCount("ANPE"));

        // unknown word: nothing removed
        removed = builder.removeSynonymFromDocument("ANPEEEE", "A.N.P.E");
        assertEquals(0, removed);
        searcher.close();

        builder.closeIndex();
    }

    /**
     * Deleting all documents is visible to the writer at once, but to searchers only after a commit and only to
     * searchers opened after that commit.
     */
    @Test
    public void testDeleteAllDocuments() throws IOException {
        printLineToConsole("\n---------------Test deleteAllDocuments----------");
        SynonymIndexBuilder builder = createNewIndexBuilder(path);
        insertDocuments(builder);

        builder.deleteAllDocuments();
        assertEquals(0, builder.getWriter().numDocs());

        SynonymIndexSearcher searcher = getSearcher(builder);
        assertTrue("A searcher should still see the documents as no commit has been done yet", searcher.getNumDocs() != 0);

        builder.commit();
        assertTrue(
                "The previous searcher should still see the documents as it still has a reader on the indexs before the commit has been done",
                searcher.getNumDocs() != 0);
        searcher.close();

        searcher = getSearcher(builder);
        assertTrue("A new searcher should not see the documents anymore as a commit has been done",
                searcher.getNumDocs() == 0);
        searcher.close();

        builder.closeIndex();
    }

    /**
     * The index directory exists after initialization and is gone after deleteIndexFromFS.
     */
    @Test
    public void deleteIndexFromFS() throws IOException {
        printLineToConsole("\n---------------Test deleteIndexFromFS----------");
        String indexPath = "data/index2";
        SynonymIndexBuilder synonymIndexBuilder = new SynonymIndexBuilder();
        synonymIndexBuilder.initIndexInFS(indexPath);
        File indexfile = new File(indexPath);
        assertTrue(indexfile.exists());

        // TODO test with lock?
        synonymIndexBuilder.insertDocument("salut", "toto");
        synonymIndexBuilder.commit();
        synonymIndexBuilder.closeIndex();

        synonymIndexBuilder.deleteIndexFromFS(indexPath);
        assertEquals(false, indexfile.exists());
    }

    /**
     * Calling initIndexInFS twice on the same directory must not reset the index.
     */
    @Test
    public void initIndexInFS() throws IOException {
        String indexPath = "data/index3";
        SynonymIndexBuilder synonymIndexBuilder = new SynonymIndexBuilder();
        synonymIndexBuilder.initIndexInFS(indexPath);
        synonymIndexBuilder.insertDocument("salut", "toto");
        synonymIndexBuilder.commit();
        SynonymIndexSearcher searcher = new SynonymIndexSearcher(indexPath);
        int numDocs = searcher.getNumDocs();

        // check that two calls of initIndexInFS does not reset the index.
        synonymIndexBuilder.initIndexInFS(indexPath);
        synonymIndexBuilder.insertDocument("bye", "au revoir");
        synonymIndexBuilder.commit();

        // get a new searcher because the previous one is open on the index when it contained one document less.
        SynonymIndexSearcher searcher2 = new SynonymIndexSearcher(indexPath);
        assertEquals(numDocs + 1, searcher2.getNumDocs());

        synonymIndexBuilder.closeIndex();
        searcher.close();
        searcher2.close();

        // On Windows the deletion of the index fails at this point, so the check is skipped there.
        String os = System.getProperty("os.name");
        boolean isWindows = os.startsWith("win") || os.startsWith("Win");
        if (!isWindows) {
            boolean deleted = synonymIndexBuilder.deleteIndexFromFS(indexPath);
            assertTrue(deleted);
        }
    }

    /**
     * Prints the text on the standard output when console output is enabled.
     *
     * @param text the line to print
     */
    private void printLineToConsole(String text) {
        if (SHOW_IN_CONSOLE) {
            System.out.println(text);
        }
    }
}