// ============================================================================ // // Copyright (C) 2006-2016 Talend Inc. - www.talend.com // // This source code is available under agreement available at // %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt // // You should have received a copy of the agreement // along with this program; if not, write to Talend SA // 9 rue Pages 92150 Suresnes, France // // ============================================================================ package org.talend.dataquality.standardization.migration; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.index.CheckIndex; import org.apache.lucene.index.CheckIndex.Status; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; /** * A tool to regenerate all "out of the box" indexes with specified analyzer. The regeneration simply reads and * re-writes all detected indexes in inputPath. This class is independent from SynonymIndexBuilder class. * * @author sizhaoliu */ public class IndexMigrator { // Use standard analyzer without English stop words like "an", "was" private Analyzer analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET); // Default value points to an SVN working copy. // The provided indexes are located at "addons" folder of the studio. //private String inputPath = "/misc/repo-td/tdq-studio-ee/main/plugins/org.talend.dataquality.data.resources/data/synonym";//$NON-NLS-1$ private String inputPath = "/Volumes/Macintosh/repo-td/tdq-studio-ee/main/plugins/org.talend.dataquality.data.index/TalendGivenNames_index"; //private String inputPath = "/path/to/studio/addons/data/synonym";//$NON-NLS-1$ private String outputPath = ""; public static final String F_WORD = "word"; public static final String F_SYN = "syn"; public static final String F_WORDTERM = "wordterm"; public static final String F_SYNTERM = "synterm"; private static final boolean IS_MIGRATING_FIRSTNAME_INDEX = true; private Map<String, List<String[]>> nameMap = new HashMap<String, List<String[]>>(); private int count = 0; /** * Sets the inputPath. * * @param inputPath the inputPath to set */ public void setInputPath(String inputPath) { this.inputPath = inputPath; } /** * Sets the outputPath. * * @param outputPath the outputPath to set */ public void setOutputPath(String outputPath) { this.outputPath = outputPath; } /** * Deletes all files and sub-directories under a specified directory. * * @param dir * @return true if all deletions were successful */ private boolean deleteDir(File dir) { if (dir.isDirectory()) { String[] children = dir.list(); for (int i = 0; i < children.length; i++) { boolean success = deleteDir(new File(dir, children[i])); if (!success) { return false; } } } return dir.delete(); } /** * prepare I/O folders and call regeneration process. * * @throws java.io.IOException */ public int run() throws IOException { File inputFolder = new File(inputPath); if (!inputFolder.exists() || !inputFolder.isDirectory()) { System.err.println("The input path <" + inputPath + "> does not exist or is not a folder."); System.err.println("Usage: java -jar IndexMigrator.jar <inputPath> <outputPath(optinal)>"); return -1; } File outputFolder = new File(outputPath); if (inputFolder.equals(outputFolder)) { System.err.println("The I/O path should not be identical."); return -2; } System.out.println("Migrating all indexes in folder <" + inputPath + ">"); if ("".equals(outputPath)) { System.out.println("No output folder specified. The new index(es) will be genenrated in <" + inputPath + "_REGENERATED> folder"); outputFolder = new File(inputPath + "_REGENERATED"); } else { outputFolder = new File(outputPath); } if (outputFolder.exists() && outputFolder.isDirectory()) { System.out.println("The path <" + outputFolder + "> already exists.\nDeleting before migration..."); deleteDir(outputFolder); } return regenerate(inputFolder, outputFolder); } /** * regenerate all indexes recursively. * * @param inputFolder * @param outputFolder * @throws java.io.IOException */ private int regenerate(File inputFolder, File outputFolder) throws IOException { FSDirectory indexDir = FSDirectory.open(inputFolder); CheckIndex check = new CheckIndex(indexDir); Status status = check.checkIndex(); if (status.missingSegments) { for (File f : inputFolder.listFiles()) { if (f.isDirectory()) { File out = new File(outputFolder.getAbsolutePath() + "/" + f.getName()); out.mkdir(); regenerate(f, out); } } } else { System.out.println("REGENERATE: " + inputFolder.getAbsoluteFile()); FSDirectory outputDir = FSDirectory.open(outputFolder); IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, analyzer); IndexWriter writer = new IndexWriter(outputDir, config); // IndexWriter writer = new IndexWriter(outputDir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED); IndexReader reader = DirectoryReader.open(indexDir); // IndexSearcher searcher = new IndexSearcher(reader); // IndexSearcher searcher = new IndexSearcher(indexDir); Document doc = null; // for any other indexes, regenerate with new Analyzer, but no // changes to document. for (int i = 0; i < reader.maxDoc(); i++) { doc = reader.document(i); if (IS_MIGRATING_FIRSTNAME_INDEX) { Document newDoc = generateFirstNameDoc(doc); if (newDoc != null) { writer.addDocument(newDoc); } } else { writer.addDocument(doc); } } System.out.println("count: " + count); writer.commit(); writer.close(); outputDir.close(); // copy all other files such as "readMe.txt" for (File file : inputFolder.listFiles()) { if (file.isFile() && !isLuceneIndexFile(file)) { // copy to destination folder copyFile(file, outputFolder); } } } return 0; } private Document generateFirstNameDoc(Document doc) { String name = doc.get("name");//$NON-NLS-1$ String country = doc.get("country");//$NON-NLS-1$ String gender = doc.get("gender");//$NON-NLS-1$ List<String[]> variants = nameMap.get(name); if (variants != null) { // see if the current doc is duplicated for (String[] tuple : variants) { if ((country == null && tuple[0] == null || country != null && country.equals(tuple[0]))// && (gender == null && tuple[1] == null || gender != null && gender.equals(tuple[1]))) { return null; } } // return null; } else { variants = new ArrayList<String[]>(); } variants.add(new String[] { country, gender }); nameMap.put(name, variants); count++; // TODO Auto-generated method stub return generateDocument(name, country, gender); } /** * generate a document. * * @param word * @param synonyms * @return */ private Document generateDocument(String name, String country, String gender) { name = name.trim(); Document doc = new Document(); FieldType ft = new FieldType(); ft.setStored(true); ft.setIndexed(true); ft.setOmitNorms(true); ft.freeze(); Field wordField = new Field("name", name, ft); doc.add(wordField); Field wordTermField = new StringField("nameterm", name.toLowerCase(), Field.Store.NO); doc.add(wordTermField); if (country != null) { Field countryField = new StringField("country", country, Field.Store.YES); doc.add(countryField); } if (gender != null) { Field genderField = new StringField("gender", gender, Field.Store.YES); doc.add(genderField); } return doc; } /** * check if a file is for Lucene index. A complete list of lucene index formats can be found here: * * http://lucene.apache.org/core/old_versioned_docs/versions/3_0_1/ fileformats.html * * @param file */ private boolean isLuceneIndexFile(File file) { String fileName = file.getName(); if (fileName.startsWith("segments") || "write.lock".equals(fileName) || fileName.endsWith(".cfs") || fileName.endsWith(".fnm") || fileName.endsWith(".fdx") || fileName.endsWith(".fdt") || fileName.endsWith(".tis") || fileName.endsWith(".tii") || fileName.endsWith(".frq") || fileName.endsWith(".prx") || fileName.endsWith(".nrm") || fileName.endsWith(".tvx") || fileName.endsWith(".tvd") || fileName.endsWith(".tvf") || fileName.endsWith(".del")) { return true; } return false; } private void copyFile(File source, File targetFolder) throws IOException { if (source.isDirectory()) { if (!".svn".equals(source.getName())) { // omit SVN metadata File dir = new File(targetFolder.getAbsolutePath() + "/" + source.getName()); dir.mkdirs(); for (File f : source.listFiles()) { copyFile(f, dir); } } } else { FileInputStream fis = new FileInputStream(source); FileOutputStream fos = null; try { if (!targetFolder.exists()) { targetFolder.mkdirs(); } fos = new FileOutputStream(targetFolder + "/" + source.getName()); byte[] buf = new byte[1024]; int i = 0; while ((i = fis.read(buf)) != -1) { fos.write(buf, 0, i); } } finally { try { fis.close(); } catch (Exception e) { } try { fos.close(); } catch (Exception e) { } } } } public static void main(String[] args) throws IOException { IndexMigrator migration = new IndexMigrator(); if (args.length > 0) { String inputPath = args[0]; migration.setInputPath(inputPath); if (args.length > 1) { String outputPath = args[1]; migration.setOutputPath(outputPath); } } int status = migration.run(); System.exit(status); } }