// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.standardization.index;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.search.TopDocs;
import org.junit.BeforeClass;
import org.junit.Test;
/**
* Unit tests for combined query.
*/
public class CombinedQueryTest {

    /** File-system location of the index that is (re)built for this test class. */
    private static final String PATH = "data/test_combined";

    /**
     * Reference/synonym pairs inserted into the index: element 0 is the word, element 1 is a
     * '|'-separated list of its synonyms.
     * <p>
     * ATTENTION: Be careful when changing this list of synonyms, they are also used in
     * SynonymIndexSearcherTest. The expected rankings in {@link #TEST_CASE_MAP} are expressed as index
     * document numbers, so adding, removing or reordering entries will most likely invalidate them.
     */
    private static final String[][] synonyms = {
            //
            { "Paris 5eme", "Paris 05 Panthéon|Paris 5|75005|some|other|synonyms" },
            { "Paris 2eme", "Paris 02 Bourse|Paris 2|75002" },
            { "Paris", "巴黎|Paryz|Parizh|Parizs|Paras|Pariz|Parigi|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T" },
            //
            { "222", "" }, { "", "222" },
            //
            { "111", "AA BB CC" }, { "222", "AA|BB CC|333" }, { "333", "AA BB|CC|DD|222" },
            //
            { "222 333", "XXX|YYY|ZZZ" }, { "222 444", "XXX|YYY|ZZZ|WWW" }, { "YYY", "222 333" }, { "YYY", "222 444 | ZZZ" },
            //
            { "XXX", "AA2|AA3|AA4|AA5|YYY" }, { "A YYY ZZZ", "ZZZ|WWW" }, { "XXX", "AA2|AA3|AA4|AA5|YYY" },
            { "A YYY ZZZ", "ZZZ|WWW" }, { "XXX", "AA2|AA3|AA4|AA5|YYY" }, { "A YYY ZZZ", "ZZZ|WWW" },
            { "XXX", "AA2|AA3|AA4|AA5|YYY" }, { "A YYY ZZZ", "ZZZ|WWW" }, { "XXX", "AA2|AA3|AA4|AA5|YYY" },
            { "A YYY ZZZ", "ZZZ|WWW" }, { "XXX", "AA2|AA3|AA4|AA5|YYY" }, { "A YYY ZZZ", "ZZZ|WWW" },
            { "XXX", "AA2|AA3|AA4|AA5|YYY" }, { "A YYY ZZZ", "ZZZ|WWW" }, { "XXX", "AA2|AA3|AA4|AA5|YYY" },
            { "A YYY ZZZ", "ZZZ|WWW" }, { "XXX", "AA2|AA3|AA4|AA5|YYY" }, { "A YYY ZZZ", "ZZZ|WWW" },
            { "XXX", "AA2|AA3|AA4|AA5|YYY" }, { "A YYY ZZZ", "ZZZ|WWW" }, { "XXX", "AA2|AA3|AA4|AA5|YYY" },
            { "A YYY ZZZ", "ZZZ|WWW" }, { "XXX", "AA2|AA3|AA4|AA5|YYY" }, };

    /**
     * Search term mapped to the expected document numbers of the leading hits, best match first.
     * Iteration order is the insertion order of the search terms.
     */
    public static final Map<String, List<Integer>> TEST_CASE_MAP = buildTestCases();

    /**
     * Builds the expectation table in a plain {@link LinkedHashMap} (no anonymous subclass), keeping
     * the insertion order of the search terms.
     *
     * @return the search term to expected ranking table
     */
    private static Map<String, List<Integer>> buildTestCases() {
        Map<String, List<Integer>> cases = new LinkedHashMap<String, List<Integer>>();
        cases.put("PARIS", Arrays.asList(2, 0, 1));
        cases.put("222", Arrays.asList(3, 5, 6, 7, 8, 9, 10));
        cases.put("333", Arrays.asList(6, 5, 7, 9));
        cases.put("Aa", Arrays.asList(5, 4, 6));
        cases.put("Bb", Arrays.asList(4, 5, 6));
        cases.put("Cc", Arrays.asList(6, 4, 5));
        cases.put("Aa Bb", Arrays.asList(6, 4, 5));
        cases.put("Bb Cc", Arrays.asList(5, 4, 6));
        cases.put("Aa Cc", Arrays.asList(4, 5, 6));
        cases.put("Aa Bb Cc", Arrays.asList(4, 5, 6));
        return cases;
    }

    /**
     * Rebuilds the index under {@link #PATH} from {@link #synonyms} once, before any test runs.
     *
     * @throws java.lang.Exception if the index cannot be initialized or populated
     */
    @BeforeClass
    public static void setUp() throws Exception {
        SynonymIndexBuilder builder = new SynonymIndexBuilder();
        // clear any existing files
        builder.deleteIndexFromFS(PATH);
        // NOTE(review): the second deletion is the one being asserted; presumably it verifies that
        // deleting an already-removed index still reports success - confirm against the builder.
        boolean deleted = builder.deleteIndexFromFS(PATH);
        // Read the error only after the asserted call so the failure message describes that call.
        assertEquals(builder.getError().getMessage(), true, deleted);
        builder.initIndexInFS(PATH);
        insertDocuments(builder);
        builder.closeIndex();
    }

    /**
     * Inserts every pair of {@link #synonyms} into the given builder, then commits.
     *
     * @param build the index builder receiving the documents
     * @throws IOException if an insertion or the commit fails
     */
    static void insertDocuments(SynonymIndexBuilder build) throws IOException {
        for (String[] syns : synonyms) {
            build.insertDocument(syns[0], syns[1]);
        }
        build.commit();
    }

    /**
     * Searches each term of {@link #TEST_CASE_MAP} and checks that the returned documents come back in
     * the expected order. Every hit is also printed to standard output for manual inspection.
     *
     * @throws IOException if the index cannot be opened, searched or closed
     */
    @Test
    public void testSearchDocumentBySynonym() throws IOException {
        SynonymIndexSearcher searcher = getSearcher();
        try {
            searcher.setTopDocLimit(10);
            for (Map.Entry<String, List<Integer>> testCase : TEST_CASE_MAP.entrySet()) {
                String wordToSearch = testCase.getKey();
                List<Integer> expectation = testCase.getValue();
                System.out.println("\n-----------Looking for <" + wordToSearch + ">-----------");
                TopDocs docs = searcher.searchDocumentBySynonym(wordToSearch);
                System.out.print(docs.totalHits + " documents found.");
                for (int i = 0; i < docs.scoreDocs.length; i++) {
                    int docNumber = docs.scoreDocs[i].doc;
                    System.out.print("\ndoc=" + docNumber + "\tscore=" + docs.scoreDocs[i].score);
                    System.out.print(" \t" + searcher.getWordByDocNumber(docNumber));
                    System.out.print("\t-> [");
                    for (String syn : searcher.getSynonymsByDocNumber(docNumber)) {
                        System.out.print(syn + "|");
                    }
                    System.out.print("]");
                    // Fail with a readable message instead of an IndexOutOfBoundsException when the
                    // search returns more hits than the expectation lists.
                    assertEquals("More documents returned than expected for <" + wordToSearch + ">", true,
                            i < expectation.size());
                    assertEquals("Unexpected document classment", expectation.get(i).intValue(), docNumber);
                }
            }
        } finally {
            // Release the index even when an assertion above fails.
            searcher.close();
        }
        // TODO check that the best matching is the exact string.
    }

    /**
     * Opens a searcher on the index stored under {@link #PATH}, with a default top-document limit of 5.
     *
     * @return a searcher opened on the test index
     * @throws IOException if the index cannot be opened; propagated so the test fails on the real
     * cause instead of continuing with an unopened searcher
     */
    private SynonymIndexSearcher getSearcher() throws IOException {
        SynonymIndexSearcher searcher = new SynonymIndexSearcher();
        searcher.openIndexInFS(PATH);
        searcher.setTopDocLimit(5);
        return searcher;
    }
}