// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.standardization.index;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;
import org.talend.dataquality.standardization.i18n.Messages;
import com.talend.csv.CSVReader;
/**
 * Builds Lucene indexes on disk from CSV reference files.
 * <p>
 * Two kinds of index can be written into the directory given at construction time:
 * <ul>
 * <li>{@link #initializeIndex(String, int[])} creates one document per CSV row with the fields {@code name},
 * {@code country}, {@code gender} and {@code count};</li>
 * <li>{@link #initializeSynonymIndex(String, int[])} creates one document per CSV row with a stored {@code word}
 * field and one analyzed {@code syn} field for the word itself and for each of its synonyms.</li>
 * </ul>
 * In both cases the first CSV row is skipped as a header and the file is decoded as windows-1252.
 */
// TODO move the main method and related methods into the test project.
public class IndexBuilder {

    private static final Logger LOG = Logger.getLogger(IndexBuilder.class);

    /** Root folder used by the {@link #main(String[])} helpers for both the CSV sources and the generated indexes. */
    private static final String INDEX_ROOT = "data/indexes/";//$NON-NLS-1$

    /** Number of columns read by {@link #initializeIndex(String, int[])}: name, country, gender, count. */
    private static final int NAME_INDEX_COLUMN_COUNT = 4;

    private String directoryPath;

    private Directory index;

    /**
     * Getter for index.
     *
     * @return the Lucene directory opened by the last call to one of the {@code initialize*} methods, or
     * {@code null} when none has been called yet
     */
    public Directory getIndex() {
        return this.index;
    }

    /**
     * Creates a builder that writes its index into the given folder.
     *
     * @param directoryPath path of an existing folder that will contain the index
     */
    public IndexBuilder(String directoryPath) {
        this.directoryPath = directoryPath;
    }

    /**
     * Creates a builder without a target folder. The {@code initialize*} methods of such an instance fail with an
     * {@link IOException} because no index folder is known.
     */
    public IndexBuilder() {
        // needn't to do anything
    }

    /**
     * Indexes a comma separated CSV file of names. Rows whose country or gender is the empty string are skipped.
     *
     * @param csvFileToIndex path of the CSV file to read
     * @param columnsToBeIndexed positions in the CSV of, in this order, the name, country, gender and count columns
     * @return always {@code true} when the method completes normally
     * @throws IOException when the CSV file does not exist, when the index folder is not an existing directory or
     * when reading the file or writing the index fails
     * @throws IllegalArgumentException when fewer than four column positions are given
     */
    public boolean initializeIndex(String csvFileToIndex, int[] columnsToBeIndexed) throws IOException {
        checkPaths(csvFileToIndex);
        checkColumns(columnsToBeIndexed, NAME_INDEX_COLUMN_COUNT);

        index = new MMapDirectory(new File(directoryPath));
        IndexWriter w = openWriter();
        try {
            // read the data (this will be the input data of a component called tFirstNameStandardize)
            CSVReader csvReader = createCSVReader(csvFileToIndex, ',');
            try {
                while (csvReader.readNext()) {
                    String name = csvReader.get(columnsToBeIndexed[0]);
                    String country = csvReader.get(columnsToBeIndexed[1]);
                    String gender = csvReader.get(columnsToBeIndexed[2]);
                    String count = csvReader.get(columnsToBeIndexed[3]);
                    addDoc(w, name, country, gender, count);
                }
            } finally {
                csvReader.close();
            }
            w.commit();
        } finally {
            // always release the writer, otherwise a failure would leave it (and its lock) open
            w.close();
        }
        return true;
    }

    /**
     * Adds one name document to the index, unless its country or gender is the empty string.
     *
     * @param w writer receiving the document
     * @param name value of the analyzed and stored {@code name} field
     * @param country value of the stored, not analyzed {@code country} field
     * @param gender value of the stored, not analyzed {@code gender} field
     * @param count value of the not stored, not analyzed {@code count} field
     * @throws IOException when the writer fails to add the document
     */
    private static void addDoc(IndexWriter w, String name, String country, String gender, String count)
            throws IOException {
        if ("".equals(country) || "".equals(gender)) {//$NON-NLS-1$ //$NON-NLS-2$
            return;
        }
        Document doc = new Document();
        doc.add(new Field("name", name, Field.Store.YES, Field.Index.ANALYZED, TermVector.YES));//$NON-NLS-1$
        doc.add(new Field("country", country, Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.YES));//$NON-NLS-1$
        doc.add(new Field("gender", gender, Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.YES));//$NON-NLS-1$
        doc.add(new Field("count", count, Field.Store.NO, Field.Index.NOT_ANALYZED, TermVector.NO));//$NON-NLS-1$
        w.addDocument(doc);
    }

    /**
     * Indexes a semicolon separated CSV file of words and their synonyms. Each row produces one document holding the
     * word in a stored {@code word} field and in an analyzed {@code syn} field; when a second column position is
     * given, that column is split on {@code |} and every part is added as a further {@code syn} field.
     *
     * @param csvFileToIndex path of the CSV file to read
     * @param columnsToBeIndexed position of the word column, optionally followed by the position of the synonyms
     * column
     * @return always {@code true} when the method completes normally
     * @throws IOException when the CSV file does not exist, when the index folder is not an existing directory or
     * when reading the file or writing the index fails
     * @throws IllegalArgumentException when no column position is given
     */
    public boolean initializeSynonymIndex(String csvFileToIndex, int[] columnsToBeIndexed) throws IOException {
        checkPaths(csvFileToIndex);
        checkColumns(columnsToBeIndexed, 1);
        boolean hasSynonymColumn = columnsToBeIndexed.length > 1;

        index = FSDirectory.open(new File(directoryPath));
        IndexWriter w = openWriter();
        try {
            CSVReader csvReader = createCSVReader(csvFileToIndex, ';');
            try {
                while (csvReader.readNext()) {
                    Document doc = new Document();
                    String word = csvReader.get(columnsToBeIndexed[0]);
                    doc.add(new Field("word", word, Field.Store.YES, Field.Index.NO, TermVector.NO));//$NON-NLS-1$
                    doc.add(new Field("syn", word, Field.Store.YES, Field.Index.ANALYZED, TermVector.YES));//$NON-NLS-1$
                    if (hasSynonymColumn) {
                        String synonyms = csvReader.get(columnsToBeIndexed[1]);
                        // StringUtils.split returns null for a null input, so guard before iterating
                        String[] split = StringUtils.split(synonyms, "|");//$NON-NLS-1$
                        if (split != null) {
                            for (String str : split) {
                                doc.add(new Field("syn", str, Field.Store.YES, Field.Index.ANALYZED, TermVector.YES));//$NON-NLS-1$
                            }
                        }
                    }
                    w.addDocument(doc);
                }
            } finally {
                csvReader.close();
            }
            w.commit();
        } finally {
            // always release the writer, otherwise a failure would leave it (and its lock) open
            w.close();
        }
        return true;
    }

    /**
     * Opens a writer on the current {@link #index} directory.
     *
     * @return a new writer using a {@link StandardAnalyzer}
     * @throws IOException when the writer cannot be opened
     */
    private IndexWriter openWriter() throws IOException {
        // The same analyzer should be used for indexing and searching
        Analyzer analyzer = new StandardAnalyzer();
        // NOTE: no open mode is set on the configuration, so Lucene's default applies; with the default
        // (CREATE_OR_APPEND) documents are appended to an index that already exists in the folder.
        IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, analyzer);
        return new IndexWriter(index, config);
    }

    /**
     * Verifies that the CSV file exists and that the index folder is an existing directory.
     *
     * @param csvFileToIndex path of the CSV file to read
     * @throws IOException when either path is missing or invalid
     */
    private void checkPaths(String csvFileToIndex) throws IOException {
        boolean csvExists = csvFileToIndex != null && new File(csvFileToIndex).exists();
        boolean directoryUsable = directoryPath != null && new File(directoryPath).isDirectory();
        if (!csvExists || !directoryUsable) {
            throw new IOException(Messages.getString("IndexBuilder.error", csvFileToIndex, directoryPath));//$NON-NLS-1$
        }
    }

    /**
     * Fails fast when the caller did not provide enough column positions, instead of failing in the middle of the
     * indexing with an {@link ArrayIndexOutOfBoundsException}.
     *
     * @param columnsToBeIndexed the column positions given by the caller
     * @param minimum the number of positions the calling method reads
     * @throws IllegalArgumentException when the array is {@code null} or shorter than {@code minimum}
     */
    private static void checkColumns(int[] columnsToBeIndexed, int minimum) {
        if (columnsToBeIndexed == null || columnsToBeIndexed.length < minimum) {
            int given = columnsToBeIndexed == null ? 0 : columnsToBeIndexed.length;
            throw new IllegalArgumentException("Expected at least " + minimum + " column position(s) but got " + given); //$NON-NLS-1$ //$NON-NLS-2$
        }
    }

    /**
     * Opens the CSV file with the windows-1252 encoding and {@code "} as quote character, and skips its first row
     * (the header).
     *
     * @param csvFileToIndex path of the CSV file to read
     * @param separator character separating the columns
     * @return a reader positioned after the header row; the caller must close it
     * @throws UnsupportedEncodingException when the windows-1252 encoding is not available
     * @throws FileNotFoundException when the file cannot be opened
     * @throws IOException when reading the header row fails
     */
    private CSVReader createCSVReader(String csvFileToIndex, char separator)
            throws UnsupportedEncodingException, FileNotFoundException, IOException {
        CSVReader csvReader = new CSVReader(
                new java.io.BufferedReader(
                        new java.io.InputStreamReader(new java.io.FileInputStream(csvFileToIndex), "windows-1252")), //$NON-NLS-1$
                separator);
        boolean ready = false;
        try {
            csvReader.setQuoteChar('\"');
            csvReader.readNext();// skip header
            ready = true;
            return csvReader;
        } finally {
            if (!ready) {
                // do not leak the underlying file handle when the header cannot be read
                csvReader.close();
            }
        }
    }

    /**
     * Builds, under {@code data/indexes/}, a synonym index from the word column and the synonyms column of the given
     * source file.
     *
     * @param indexPath name of the index folder, relative to {@code data/indexes/}
     * @param sourceFile name of the CSV file, relative to {@code data/indexes/}
     */
    private static void createSynonymIndex(String indexPath, String sourceFile) {
        buildSynonymIndex(indexPath, sourceFile, new int[] { 0, 1 });
    }

    /**
     * Builds, under {@code data/indexes/}, a synonym index from the word column only of the given source file.
     *
     * @param indexPath name of the index folder, relative to {@code data/indexes/}
     * @param sourceFile name of the CSV file, relative to {@code data/indexes/}
     */
    private static void createSingleIndex(String indexPath, String sourceFile) {
        buildSynonymIndex(indexPath, sourceFile, new int[] { 0 });
    }

    /**
     * Creates the index folder when it does not exist yet and fills it from the given source file. An
     * {@link IOException} raised while building the index is logged and not propagated.
     *
     * @param indexPath name of the index folder, relative to {@code data/indexes/}
     * @param sourceFile name of the CSV file, relative to {@code data/indexes/}
     * @param columnsToIndex column positions passed to {@link #initializeSynonymIndex(String, int[])}
     */
    private static void buildSynonymIndex(String indexPath, String sourceFile, int[] columnsToIndex) {
        String indexDirectory = INDEX_ROOT + indexPath;
        File dirFile = new File(indexDirectory);
        if (!dirFile.exists()) {
            dirFile.mkdirs();
        }
        IndexBuilder ib = new IndexBuilder(indexDirectory);
        try {
            ib.initializeSynonymIndex(INDEX_ROOT + sourceFile, columnsToIndex);
        } catch (IOException e) {
            // pass the exception as throwable so that the stack trace is logged too
            LOG.error(e.getMessage(), e);
        }
    }

    /**
     * Generates the Salutory, Address, Company and StreetType indexes under {@code data/indexes/}.
     *
     * @param args not used
     * @throws IOException declared for compatibility; build failures are logged by the helpers
     */
    public static void main(String[] args) throws IOException {
        createSynonymIndex("Salutory", "SalutorySynonyms.csv");//$NON-NLS-1$ //$NON-NLS-2$
        createSynonymIndex("Address", "AddressSynonyms.csv");//$NON-NLS-1$ //$NON-NLS-2$
        createSingleIndex("Company", "CompanySynonyms.csv");//$NON-NLS-1$ //$NON-NLS-2$
        createSynonymIndex("StreetType", "StreetTypeSynonyms.csv");//$NON-NLS-1$ //$NON-NLS-2$
    }
}