// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.standardization.query;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.talend.dataquality.standardization.constant.PluginConstant;
/**
 * Looks up first names in a Lucene index and returns the best matching indexed name.
 * <p>
 * The main entry point is {@link #standardize(String, Map, boolean)}, which builds a boolean query from the input
 * name (an exact term clause per token plus an exact or fuzzy clause on the whole lower-cased input) and optionally
 * boosts documents matching a given country and/or gender. The {@code replaceName*} methods are convenience wrappers
 * returning the stored {@code "name"} field of the top hit.
 */
public class FirstNameStandardize {

    private static final Logger LOG = Logger.getLogger(FirstNameStandardize.class);

    /**
     * According to the Levenshtein algorithm, this value means that a distance of 1 is allowed to match a first name
     * containing 4 to 7 letters, while for those with 8 to 12 letters, 2 erroneous letters are allowed. A first name
     * between 12 and 15 letters allows a distance of 3, and so on ...
     * <p>
     * For first names containing a minus sign, e.g. Jean-Baptiste, the matching is done for Jean and Baptiste
     * separately, and the number of tokens is also considered by Lucene.
     *
     * @deprecated no longer referenced in this class; fuzzy matching is driven by {@link #setMaxEdits(int)}.
     */
    @Deprecated
    private static final float MATCHING_SIMILARITY = 0.74f;

    // FIXME this variable is only for tests
    public static final boolean SORT_WITH_COUNT = true;

    /** Maximum number of hits requested from the searcher by the standardize/fuzzy search methods. */
    private static final int MAX_HITS = 10;

    /** Maximum edit distance used for the fuzzy queries built by this instance. */
    private int maxEdits = 1;

    /** Analyzer used only by the deprecated query-parser based search. */
    private final Analyzer analyzer;

    private final IndexSearcher searcher;

    /** Number of hits collected by {@link #createTopDocsCollector()}; the search methods use {@link #MAX_HITS}. */
    private final int hitsPerPage;

    /**
     * Creates a standardizer working on the given index.
     *
     * @param indexSearcher searcher on the first name index, expected to be non-null
     * @param analyzer analyzer used to parse non-fuzzy queries in the deprecated search, expected to be non-null
     * @param hitsPerPage number of hits kept by the collector created by this class
     * @throws IOException declared for compatibility with existing callers
     */
    public FirstNameStandardize(IndexSearcher indexSearcher, Analyzer analyzer, int hitsPerPage) throws IOException {
        assert analyzer != null;
        assert indexSearcher != null;
        this.analyzer = analyzer;
        this.searcher = indexSearcher;
        this.hitsPerPage = hitsPerPage;
    }

    /**
     * Searches the name field for the given input, either with a fuzzy query or with a parsed query.
     *
     * @param input the first name to look for
     * @param fuzzyQuery true to run a fuzzy search, false to parse the input with the query parser
     * @return the matching documents; an empty array when the input is null/empty or when the fuzzy search fails
     * @throws ParseException if the input cannot be parsed as a query (non-fuzzy mode only)
     * @throws IOException if the index cannot be read (non-fuzzy mode only)
     * @deprecated use {@link #standardize(String, Map, boolean)} instead
     */
    @Deprecated
    private ScoreDoc[] standardize(String input, boolean fuzzyQuery) throws ParseException, IOException {
        if (input == null || input.length() == 0) {
            return new ScoreDoc[0];
        }
        // MOD sizhaoliu 2012-7-4 TDQ-1576 tFirstnameMatch returns no firstname when several matches exist
        // Do not use doc collector which contains an inner sort.
        if (!fuzzyQuery) {
            Query q = new QueryParser(PluginConstant.FIRST_NAME_STANDARDIZE_NAME, analyzer).parse(input);
            return searcher.search(q, MAX_HITS).scoreDocs;
        }
        try {
            return getFuzzySearch(input).scoreDocs;
        } catch (Exception e) {
            // Best effort: a failing fuzzy search is logged and reported as "no match" instead of returning null.
            LOG.error(e, e);
            return new ScoreDoc[0];
        }
    }

    /**
     * Runs a fuzzy search of the input on both the name and the alias fields and feeds the hits to the collector.
     * <p>
     * Note that, unlike the other searches of this class, the input is used as is (it is not lower-cased here) and
     * the fuzzy queries are built without the edit distance configured through {@link #setMaxEdits(int)}.
     *
     * @param input the term to search
     * @param collector the collector receiving the matching documents
     * @throws Exception if the search fails
     */
    public void getFuzzySearch(String input, TopDocsCollector<?> collector) throws Exception {
        Query q = new FuzzyQuery(new Term(PluginConstant.FIRST_NAME_STANDARDIZE_NAME, input));
        Query qalias = new FuzzyQuery(new Term(PluginConstant.FIRST_NAME_STANDARDIZE_ALIAS, input));
        BooleanQuery combinedQuery = new BooleanQuery();
        combinedQuery.add(q, BooleanClause.Occur.SHOULD);
        combinedQuery.add(qalias, BooleanClause.Occur.SHOULD);
        searcher.search(combinedQuery, collector);
    }

    /**
     * Runs a fuzzy search of the lower-cased input on the "name" field.
     *
     * @param input the first name to search, must not be null
     * @return the top matching documents
     * @throws IOException if the index cannot be read
     */
    private TopDocs getFuzzySearch(String input) throws IOException {
        // MOD sizhaoliu 2012-7-4 TDQ-1576 tFirstnameMatch returns no firstname when several matches exist
        // The 2 letter prefix requires exact match while the word to search may not be lowercased as in the index.
        // Locale.ROOT keeps the lower-casing independent from the JVM default locale (e.g. Turkish dotless i).
        Query q = new FuzzyQuery(new Term("name", input.toLowerCase(Locale.ROOT)), maxEdits);//$NON-NLS-1$
        return searcher.search(q, MAX_HITS);
    }

    /**
     * Builds a query on a single term.
     *
     * @param field the index field
     * @param text the term text
     * @param fuzzy true for a fuzzy query bounded by the configured maximum edits, false for an exact term query
     * @return the query
     */
    private Query getTermQuery(String field, String text, boolean fuzzy) {
        Term term = new Term(field, text);
        return fuzzy ? new FuzzyQuery(term, maxEdits) : new TermQuery(term);
    }

    /**
     * Splits the input into lower-cased tokens using a standard tokenizer/filter chain.
     * <p>
     * Despite its name, this method builds its own token stream and does not use the analyzer given to the
     * constructor.
     *
     * @param input the text to tokenize, must not be null
     * @return the tokens in the order they are produced, possibly empty
     * @throws IOException if the token stream cannot be consumed
     */
    private List<String> getTokensFromAnalyzer(String input) throws IOException {
        TokenStream stream = new LowerCaseFilter(new StandardFilter(new StandardTokenizer(new StringReader(input))));
        List<String> termList = new ArrayList<String>();
        try {
            CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
            // The TokenStream contract is reset() -> incrementToken()* -> end() -> close(), all invoked on the
            // outermost stream so that every filter of the chain is notified.
            stream.reset();
            while (stream.incrementToken()) {
                termList.add(termAttribute.toString());
            }
            stream.end();
        } finally {
            stream.close();
        }
        return termList;
    }

    /**
     * Searches the index for the documents best matching the given first name.
     * <p>
     * A document must match at least one name clause: an exact (boosted) match on one of the tokens of the input, or
     * an exact/fuzzy match of the whole lower-cased input on the name term field. When a non-empty country and/or
     * gender is provided, documents matching them are boosted but these clauses are not mandatory.
     *
     * @param inputName the first name to standardize
     * @param information2value optional map holding the country and gender values under the
     * {@link PluginConstant#FIRST_NAME_STANDARDIZE_COUNTRY} and {@link PluginConstant#FIRST_NAME_STANDARDIZE_GENDER}
     * keys; may be null
     * @param fuzzySearch true to allow a fuzzy match of the whole input name
     * @return the hits returned by the searcher (at most 10), or an empty array when the input name is null or empty
     * @throws IOException if the input cannot be tokenized or the index cannot be read
     */
    public ScoreDoc[] standardize(String inputName, Map<String, String> information2value, boolean fuzzySearch)
            throws IOException {
        if (inputName == null || inputName.length() == 0) {
            return new ScoreDoc[0];
        }

        String countryText = null;
        String genderText = null;
        if (information2value != null) {
            countryText = information2value.get(PluginConstant.FIRST_NAME_STANDARDIZE_COUNTRY);
            genderText = information2value.get(PluginConstant.FIRST_NAME_STANDARDIZE_GENDER);
        }

        BooleanQuery nameQueries = new BooleanQuery();
        // always add a non-fuzzy query on each token.
        for (String token : getTokensFromAnalyzer(inputName)) {
            Query termQuery = getTermQuery(PluginConstant.FIRST_NAME_STANDARDIZE_NAME, token, false);
            termQuery.setBoost(2);
            nameQueries.add(termQuery, BooleanClause.Occur.SHOULD);
        }
        // Locale.ROOT keeps the lower-casing consistent with the locale-independent LowerCaseFilter used for tokens.
        Query nameTermQuery = getTermQuery(PluginConstant.FIRST_NAME_STANDARDIZE_NAMETERM,
                inputName.toLowerCase(Locale.ROOT), fuzzySearch);
        nameQueries.add(nameTermQuery, BooleanClause.Occur.SHOULD);

        BooleanQuery combinedQuery = new BooleanQuery();
        combinedQuery.add(nameQueries, BooleanClause.Occur.MUST);
        addBoostedOptionalClause(combinedQuery, PluginConstant.FIRST_NAME_STANDARDIZE_COUNTRY, countryText);
        addBoostedOptionalClause(combinedQuery, PluginConstant.FIRST_NAME_STANDARDIZE_GENDER, genderText);

        return searcher.search(combinedQuery, MAX_HITS).scoreDocs;
    }

    /**
     * Adds an optional, boosted exact-match clause on the given field when the value is neither null nor empty.
     */
    private void addBoostedOptionalClause(BooleanQuery query, String field, String value) {
        if (value == null || "".equals(value)) {//$NON-NLS-1$
            return;
        }
        Query clause = getTermQuery(field, value, false);
        clause.setBoost(5);
        query.add(clause, BooleanClause.Occur.SHOULD);
    }

    /**
     * Creates the collector used to gather at most {@code hitsPerPage} hits.
     *
     * @return a collector sorting on the count field when {@link #SORT_WITH_COUNT} is set, a score based collector
     * otherwise
     * @throws IOException if the collector cannot be created
     */
    @SuppressWarnings("unused")
    private TopDocsCollector<?> createTopDocsCollector() throws IOException {
        // TODO the goal is to sort the result in descending order according to the "count" field
        if (SORT_WITH_COUNT) { // TODO enable this when it works correctly
            // NOTE(review): the sort below only uses the count field, with the SortField default order and without
            // score tracking, which does not match the descending order stated as the goal above -- to be confirmed.
            SortField sortfield = new SortField(PluginConstant.FIRST_NAME_STANDARDIZE_COUNT, SortField.Type.INT);
            Sort sort = new Sort(sortfield);
            return TopFieldCollector.create(sort, hitsPerPage, false, false, false, false);
        } else {
            return TopScoreDocCollector.create(hitsPerPage, false);
        }
    }

    /**
     * Returns the stored "name" field of the first hit, or an empty string when there is no hit.
     */
    private String firstMatchedName(ScoreDoc[] results) throws IOException {
        if (results.length == 0) {
            return ""; //$NON-NLS-1$
        }
        return searcher.doc(results[0].doc).get("name");//$NON-NLS-1$
    }

    /**
     * Method "replaceName".
     *
     * @param inputName a first name
     * @param fuzzyQuery true to allow a fuzzy match of the input name
     * @return the standardized first name, or an empty string when nothing matches
     * @throws IOException if the index cannot be read
     */
    public String replaceName(String inputName, boolean fuzzyQuery) throws IOException {
        return firstMatchedName(standardize(inputName, null, fuzzyQuery));
    }

    /**
     * Standardizes a first name, favoring the index entries matching the given country and gender.
     *
     * @param inputName a first name
     * @param inputCountry the country to favor, may be null or empty
     * @param inputGender the gender to favor, may be null or empty
     * @param fuzzyQuery true to allow a fuzzy match of the input name
     * @return the standardized first name, or an empty string when nothing matches
     * @throws IOException if the index cannot be read
     */
    public String replaceNameWithCountryGenderInfo(String inputName, String inputCountry, String inputGender, boolean fuzzyQuery)
            throws IOException {
        // The keys must be the ones read back by standardize(String, Map, boolean).
        Map<String, String> indexFields = new HashMap<String, String>();
        indexFields.put(PluginConstant.FIRST_NAME_STANDARDIZE_COUNTRY, inputCountry);
        indexFields.put(PluginConstant.FIRST_NAME_STANDARDIZE_GENDER, inputGender);
        return firstMatchedName(standardize(inputName, indexFields, fuzzyQuery));
    }

    /**
     * Standardizes a first name, favoring the index entries matching the given country.
     *
     * @param inputName a first name
     * @param inputCountry the country to favor, may be null or empty
     * @param fuzzyQuery true to allow a fuzzy match of the input name
     * @return the standardized first name, or an empty string when nothing matches
     * @throws IOException if the index cannot be read
     */
    public String replaceNameWithCountryInfo(String inputName, String inputCountry, boolean fuzzyQuery) throws IOException {
        Map<String, String> indexFields = new HashMap<String, String>();
        indexFields.put(PluginConstant.FIRST_NAME_STANDARDIZE_COUNTRY, inputCountry);
        return firstMatchedName(standardize(inputName, indexFields, fuzzyQuery));
    }

    /**
     * Standardizes a first name, favoring the index entries matching the given gender.
     *
     * @param inputName a first name
     * @param inputGender the gender to favor, may be null or empty
     * @param fuzzyQuery true to allow a fuzzy match of the input name
     * @return the standardized first name, or an empty string when nothing matches
     * @throws IOException if the index cannot be read
     */
    public String replaceNameWithGenderInfo(String inputName, String inputGender, boolean fuzzyQuery) throws IOException {
        Map<String, String> indexFields = new HashMap<String, String>();
        indexFields.put(PluginConstant.FIRST_NAME_STANDARDIZE_GENDER, inputGender);
        return firstMatchedName(standardize(inputName, indexFields, fuzzyQuery));
    }

    /**
     * Sets the maximum edit distance used by the fuzzy queries built after this call.
     *
     * @param maxEdits the maximum number of edits; the value is passed unchecked to the fuzzy queries
     */
    public void setMaxEdits(int maxEdits) {
        this.maxEdits = maxEdits;
    }
}