package org.cdlib.xtf.textIndexer; /** * Copyright (c) 2006, Regents of the University of California * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the University of California nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * Acknowledgements: * * A significant amount of new and/or modified code in this module * was made possible by a grant from the Andrew W. Mellon Foundation, * as part of the Melvyl Recommender Project. 
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Vector;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.cdlib.xtf.textEngine.NativeFSDirectory;
import org.cdlib.xtf.util.Path;
import org.cdlib.xtf.util.Trace;

/**
 * This class merges the contents of two or more XTF indexes, with certain
 * caveats: the indexes must live in separate directories, and their index
 * parameters (chunk size/overlap, stop words, accent and plural maps) must
 * match exactly. All data is merged into the FIRST index specified.
 *
 * @author Martin Haye
 */
public class IndexMerge
{
  //////////////////////////////////////////////////////////////////////////////

  /** Main entry-point for the index merger. <br><br>
   *
   *  This function takes the command line arguments passed and uses them to
   *  find the indexes and merge them. Exits with status 1 on any error,
   *  0 on success.
   *
   *  @param args  command-line arguments: repeated -config/-index pairs,
   *               the first pair naming the merge target.
   */
  public static void main(String[] args)
  {
    Trace.info("IndexMerge v. 2.2");

    try
    {
      IndexerConfig cfgInfo = new IndexerConfig();
      XMLConfigParser cfgParser = new XMLConfigParser();

      int startArg = 0;
      boolean showUsage = false;

      // Make sure the XTF_HOME environment variable is specified.
      cfgInfo.xtfHomePath = System.getProperty("xtf.home");
      if (cfgInfo.xtfHomePath == null || cfgInfo.xtfHomePath.length() == 0) {
        Trace.error("Error: xtf.home property not found");
        return;
      }

      cfgInfo.xtfHomePath = Path.normalizePath(cfgInfo.xtfHomePath);
      if (!new File(cfgInfo.xtfHomePath).isDirectory()) {
        Trace.error(
          "Error: xtf.home directory \"" + cfgInfo.xtfHomePath +
          "\" does not exist or cannot be read.");
        return;
      }

      // Parse the command-line arguments.
      Vector mergePaths = new Vector();
      HashSet pathSet = new HashSet();
      while (!showUsage && startArg < args.length)
      {
        // The minimum set of arguments consists of the name of an index
        // to read and an output index. That requires four arguments; if
        // we don't get that many, we will show the usage text and bail.
        //
        if (args.length < 4)
          showUsage = true;

        // We have enough arguments, so...
        else
        {
          // Read the command line arguments until we find what we
          // need to do some work, or until we run out.
          //
          int ret = cfgInfo.readCmdLine(args, startArg);

          // If we didn't find enough command line arguments...
          if (ret == -1)
          {
            // And this is the first time we've visited the command
            // line arguments, avoid trying to doing work and just
            // display the usage text. Otherwise, we're done.
            //
            if (startArg == 0)
              showUsage = true;
            else
              break;
          } // if( ret == -1 )

          // We did find enough command line arguments, so...
          //
          else
          {
            // Make sure the configuration path is absolute
            if (!(new File(cfgInfo.cfgFilePath).isAbsolute())) {
              cfgInfo.cfgFilePath = Path.resolveRelOrAbs(cfgInfo.xtfHomePath,
                                                         cfgInfo.cfgFilePath);
            }

            // Get the configuration for the index specified by the
            // current command line arguments.
            //
            if (cfgParser.configure(cfgInfo) < 0) {
              Trace.error(
                "Error: index '" + cfgInfo.indexInfo.indexName +
                "' not found\n");
              System.exit(1);
            }
          } // else( ret != -1 )

          // Save the new start argument for the next time.
          startArg = ret;

          // The indexes should all be in different directories.
          IndexInfo idxInfo = cfgInfo.indexInfo;
          String idxPath = Path.resolveRelOrAbs(cfgInfo.xtfHomePath,
                                                idxInfo.indexPath);
          if (pathSet.contains(idxPath)) {
            Trace.error(
              "Error: indexes to be merged must be in separate directories.");
            System.exit(1);
          }
          pathSet.add(idxPath);

          // Save the index info for the merge
          mergePaths.add(idxPath);
        } // else
      } // for

      // At least two indexes should be specified.
      if (mergePaths.size() < 2)
        showUsage = true;

      // Show usage message if any problems found.
      if (showUsage)
      {
        // Do so...
        Trace.error(" usage: ");
        Trace.tab();
        Trace.error(
          "indexMerge -config <config1> -index <index1> " +
          "-config <config2> -index <inputIndex2> ...\n\n" +
          "Sample use:\n" +
          "1. Make two config files, each with different input and output directories.\n" +
          "2. textIndexer -config conf/textIndexer.conf1 -clean -noupdatespell -index default\n" +
          "3. textIndexer -config conf/textIndexer.conf2 -clean -noupdatespell -index default\n" +
          "4. indexMerge -config conf/textIndexer.conf1 -index default -config conf/textIndexer.conf2 -index default\n\n" +
          "This *experimental* command merges data from all the specified indexes into the \n" +
          "first index.\n\n");
        Trace.untab();

        // And then bail.
        System.exit(1);
      } // if( showUsage )

      // Make sure all the indexes exist, and that their parameters are at
      // least minimally compatible.
      //
      DirInfo[] dirInfos = new DirInfo[mergePaths.size()];
      boolean createTarget = false;
      for (int i = 0; i < mergePaths.size(); i++)
      {
        String idxPath = (String)mergePaths.get(i);
        if (!IndexReader.indexExists(idxPath))
        {
          // It's okay if the target index doesn't exist.
          if (i == 0) {
            createTarget = true;
            dirInfos[0] = new DirInfo(idxPath, null);
            continue;
          }
          throw new RuntimeException(
            "Error: Cannot locate index in directory '" + idxPath + "'");
        }

        Directory srcDir = NativeFSDirectory.getDirectory(idxPath);
        dirInfos[i] = readInfo(idxPath, srcDir);

        // Check for parameter compatibility. The first source index is
        // exempt when the target is being created fresh, since there is
        // then no predecessor to compare against.
        //
        if ((i == 1 && !createTarget) || (i > 1))
        {
          if (dirInfos[i].chunkOverlap != dirInfos[i - 1].chunkOverlap)
            throw new RuntimeException(
              "Error: index parameters must match exactly (chunkOverlap mismatch detected)");
          if (dirInfos[i].chunkSize != dirInfos[i - 1].chunkSize)
            throw new RuntimeException(
              "Error: index parameters must match exactly (chunkSize mismatch detected)");
          if (!dirInfos[i].stopWords.equals(dirInfos[i - 1].stopWords))
            throw new RuntimeException(
              "Error: index parameters must match exactly (stopWords mismatch detected)");
          if (!dirInfos[i].accentMapName.equals(dirInfos[i - 1].accentMapName))
            throw new RuntimeException(
              "Error: index parameters must match exactly (accentMapName mismatch detected)");
          if (!dirInfos[i].pluralMapName.equals(dirInfos[i - 1].pluralMapName))
            throw new RuntimeException(
              "Error: index parameters must match exactly (pluralMapName mismatch detected)");
        }
      } // for

      // Well, enough preparing. Let's do the job we were sent to do.
      doMerge(dirInfos, createTarget);
    } // try

    // Log any unhandled exceptions.
    catch (Exception e) {
      Trace.error("*** Last Chance Exception: " + e.getClass());
      Trace.error("             With message: " + e.getMessage());
      Trace.error("");
      e.printStackTrace(System.out);
      System.exit(1);
    }
    catch (Throwable t) {
      Trace.error("*** Last Chance Exception: " + t.getClass());
      Trace.error("             With message: " + t);
      Trace.error("");
      t.printStackTrace(System.out);
      System.exit(1);
    }

    // Exit successfully.
    System.exit(0);
  } // main()

  //////////////////////////////////////////////////////////////////////////////

  /**
   * Read the index parameters (chunk size/overlap, stop words, plural and
   * accent map names) from the special "indexInfo" document that XTF stores
   * as the first chunk of every index.
   *
   * @param path  filesystem path of the index directory
   * @param dir   Lucene directory opened on that path
   * @return      a DirInfo populated with the index's parameters
   * @throws IOException  if the index cannot be read or lacks an indexInfo doc
   */
  private static DirInfo readInfo(String path, Directory dir)
    throws IOException
  {
    IndexReader indexReader = IndexReader.open(dir);
    try
    {
      // Fetch the chunk size and overlap from the index.
      Hits match = new IndexSearcher(indexReader).search(
        new TermQuery(new Term("indexInfo", "1")));
      if (match.length() == 0)
        throw new IOException("Index missing indexInfo doc");
      assert match.id(0) == 0 : "indexInfo chunk must be first in index";
      Document doc = match.doc(0);

      // Pick out all the info we need.
      DirInfo ret = new DirInfo(path, dir);
      ret.chunkSize = Integer.parseInt(doc.get("chunkSize"));
      ret.chunkOverlap = Integer.parseInt(doc.get("chunkOvlp"));
      ret.stopWords = doc.get("stopWords");
      ret.pluralMapName = doc.get("pluralMap");
      ret.accentMapName = doc.get("accentMap");
      return ret;
    }
    finally {
      indexReader.close();
    }
  }

  //////////////////////////////////////////////////////////////////////////////

  /**
   * Merge a bunch of indexes together. dirInfos[0] is the target; all
   * subsequent entries are sources. Merges the spellcheck word lists, lazy
   * tree files, auxiliary map files, and finally the Lucene indexes
   * themselves, reporting total elapsed time at the end.
   *
   * @param dirInfos      target (element 0) followed by source index infos
   * @param createTarget  true to create the target index from scratch
   * @throws InterruptedException  if the pre-merge grace period is interrupted
   * @throws IOException           if any file or index operation fails
   */
  private static void doMerge(DirInfo[] dirInfos, boolean createTarget)
    throws InterruptedException, IOException
  {
    long startTime = System.currentTimeMillis();

    // Let the user know what's about to occur.
    Trace.info("Ready to merge data from the following index directories:");
    Trace.tab();
    for (int i = 1; i < dirInfos.length; i++)
      Trace.info(dirInfos[i].path);
    Trace.untab();
    Trace.info("into (and including data from) index directory:");
    Trace.tab();
    Trace.info(dirInfos[0].path);
    Trace.untab();

    // Give them time to abort without consequences.
    Trace.info("");
    Trace.info("Merge will begin in 5 seconds ... ");
    Thread.sleep(5000);
    Trace.info("");
    Trace.info("Merging indexes ... ");
    Trace.tab();

    // Open the writer for the target Lucene index
    IndexWriter writer = new IndexWriter(NativeFSDirectory.getDirectory(dirInfos[0].path),
                                         new StandardAnalyzer(),
                                         createTarget);

    // Merge each piece (spelling, lazy files, main indexes)
    mergeSpelling(dirInfos);
    mergeLazy(dirInfos);
    mergeAux(dirInfos);
    mergeLucene(writer, dirInfos);

    // All done. Report how long we spent.
    Trace.untab();
    long timeMsec = System.currentTimeMillis() - startTime;
    long timeSec = timeMsec / 1000;
    long timeMin = timeSec / 60;
    long timeHour = timeMin / 60;

    Trace.info("Total time: ");
    if (timeHour > 0) {
      String ending = (timeHour == 1) ? "" : "s";
      Trace.more(Trace.info, timeHour + " hour" + ending + ", ");
    }
    if (timeMin > 0) {
      String ending = ((timeMin % 60) == 1) ? "" : "s";
      Trace.more(Trace.info, (timeMin % 60) + " minute" + ending + ", ");
    }
    String ending = ((timeSec % 60) == 1) ? "" : "s";
    Trace.more(Trace.info, (timeSec % 60) + " second" + ending + ".");

    Trace.info("Merge completed successfully.");
    Trace.info("");
  } // doMerge()

  //////////////////////////////////////////////////////////////////////////////

  /**
   * Append the spellcheck word list (spellDict/newWords.txt) of each source
   * index onto the target's word list. Sources without a readable word list
   * are silently skipped. Files are read and written as UTF-8.
   *
   * @param dirInfos  target (element 0) followed by source index infos
   * @throws IOException  if a readable source list cannot be copied
   */
  private static void mergeSpelling(DirInfo[] dirInfos)
    throws IOException
  {
    // If there are none to do, skip this step.
    boolean anyToDo = false;
    for (int i = 1; i < dirInfos.length; i++) {
      String sourceDir = dirInfos[i].path;
      File sourceFile = new File(sourceDir + "spellDict/newWords.txt");
      // BUG FIX: was "!isFile() && canRead()", which is never true (a missing
      // file isn't readable), so missing word lists crashed the merge instead
      // of being skipped. Skip unless the file exists AND is readable.
      if (!(sourceFile.isFile() && sourceFile.canRead()))
        continue;
      anyToDo = true;
    }
    if (!anyToDo)
      return;

    Trace.info("Processing spellcheck word lists ... ");

    // Append each input file.
    for (int i = 1; i < dirInfos.length; i++)
    {
      String sourceDir = dirInfos[i].path;
      File sourceFile = new File(sourceDir + "spellDict/newWords.txt");
      // Same fix as above: skip sources that lack a readable word list.
      if (!(sourceFile.isFile() && sourceFile.canRead()))
        continue;

      // Open the target file (append if it already exists, else create).
      String targetDir = dirInfos[0].path;
      Path.createPath(targetDir + "spellDict");
      File targetFile = new File(targetDir + "spellDict/newWords.txt");
      PrintWriter targetWriter = new PrintWriter(
        new BufferedWriter(new OutputStreamWriter(new FileOutputStream(
          targetFile, targetFile.isFile()), "UTF-8")));

      BufferedReader sourceReader = new BufferedReader(
        new InputStreamReader(new FileInputStream(sourceFile), "UTF-8"));

      boolean eof = false;
      while (!eof)
      {
        try {
          String word = sourceReader.readLine();
          if (word == null)
            eof = true;
          else
            targetWriter.println(word);
        }
        catch (EOFException e) {
          eof = true;
        }
        catch (IOException e) {
          // Best-effort: a truncated word list shouldn't abort the merge.
          Trace.warning(
            "Warning: Exception encountered (may be due to unfinished index): " + e);
          eof = true;
        }
      }
      sourceReader.close();
      targetWriter.close();
    } // for

    Trace.more("Done.");
  } // mergeSpelling()

  //////////////////////////////////////////////////////////////////////////////

  /**
   * Copy or merge the auxiliary files (accent map, plural map, and
   * tokenizedFields.txt) from each source index into the target directory.
   * Maps are copied only if the target doesn't already have one; the
   * tokenized-fields lists are merged line-by-line into the target's list.
   *
   * @param dirInfos  target (element 0) followed by source index infos
   * @throws IOException  if a copy or merge fails
   */
  private static void mergeAux(DirInfo[] dirInfos)
    throws IOException
  {
    // If there are none to do, skip this step.
    boolean anyToDo = false;
    for (int i = 1; i < dirInfos.length; i++) {
      String sourceDir = dirInfos[i].path;
      File accentFile = new File(sourceDir + dirInfos[i].accentMapName);
      File pluralFile = new File(sourceDir + dirInfos[i].pluralMapName);
      File tokFldFile = new File(sourceDir + "tokenizedFields.txt");
      if (accentFile.canRead() || pluralFile.canRead() || tokFldFile.canRead())
        anyToDo = true;
    }
    if (!anyToDo)
      return;

    Trace.info("Processing auxiliary files ... ");

    // Copy files from each directory...
    for (int i = 1; i < dirInfos.length; i++)
    {
      File accentSrc = new File(dirInfos[i].path, dirInfos[i].accentMapName);
      File accentDst = new File(dirInfos[0].path, dirInfos[i].accentMapName);
      if (accentSrc.canRead() && !accentDst.canRead())
        Path.copyFile(accentSrc, accentDst);

      File pluralSrc = new File(dirInfos[i].path, dirInfos[i].pluralMapName);
      File pluralDst = new File(dirInfos[0].path, dirInfos[i].pluralMapName);
      if (pluralSrc.canRead() && !pluralDst.canRead())
        Path.copyFile(pluralSrc, pluralDst);

      // BUG FIX: both branches formerly called mergeTokFldFiles(src, dst),
      // which (a) appends into its FIRST argument, i.e. mutated the *source*
      // index rather than the target, and (b) threw FileNotFoundException
      // when the destination didn't exist yet. Copy when the target has no
      // list yet; otherwise merge the source's lines into the target's list.
      //
      File tokFldSrc = new File(dirInfos[i].path, "tokenizedFields.txt");
      File tokFldDst = new File(dirInfos[0].path, "tokenizedFields.txt");
      if (tokFldSrc.canRead() && !tokFldDst.canRead())
        Path.copyFile(tokFldSrc, tokFldDst);
      else if (tokFldSrc.canRead() && tokFldDst.canRead())
        mergeTokFldFiles(tokFldDst, tokFldSrc);
    } // for

    Trace.more("Done.");
  } // mergeAux()

  //////////////////////////////////////////////////////////////////////////////

  /**
   * Merge two tokenized-field lists: any line present in file2 but absent
   * from file1 is appended to file1. file1 is modified in place; file2 is
   * only read. Both files must exist and be readable.
   *
   * @param file1  the list being augmented (appended to in place)
   * @param file2  the list whose novel entries are appended to file1
   * @throws IOException  if either file cannot be read or file1 written
   */
  private static void mergeTokFldFiles(File file1, File file2)
    throws IOException
  {
    LinkedHashSet set = new LinkedHashSet();

    // Read in the first file
    BufferedReader reader = new BufferedReader(new FileReader(file1));
    String line;
    while ((line = reader.readLine()) != null)
      set.add(line);
    reader.close();

    // Add entries from the second file
    FileWriter writer = new FileWriter(file1, true /*append*/);
    reader = new BufferedReader(new FileReader(file2));
    while ((line = reader.readLine()) != null) {
      if (!set.contains(line))
        writer.append(line + "\n");
    }
    reader.close();
    writer.close();
  }

  //////////////////////////////////////////////////////////////////////////////

  /**
   * Copy the "lazy" tree files from every source index into the target's
   * lazy directory, skipping files the target already has.
   *
   * @param dirInfos  target (element 0) followed by source index infos
   * @throws IOException  if a copy fails
   */
  private static void mergeLazy(DirInfo[] dirInfos)
    throws IOException
  {
    // Get the target lazy directory.
    String targetDir = dirInfos[0].path;

    // See if there are any source directories to merge.
    boolean anyToDo = false;
    for (int i = 1; i < dirInfos.length; i++) {
      String sourceDir = dirInfos[i].path;
      File lazyDir = new File(sourceDir, "lazy");
      if (lazyDir.isDirectory())
        anyToDo = true;
    }
    if (!anyToDo)
      return;

    Trace.info("Processing lazy tree files ... ");

    // Process each source directory.
    for (int i = 1; i < dirInfos.length; i++) {
      String sourceDir = dirInfos[i].path;
      mergeLazy(new File(sourceDir, "lazy"), new File(targetDir, "lazy"));
    } // for

    Trace.more("Done.");
  } // mergeLazy()

  //////////////////////////////////////////////////////////////////////////////

  /**
   * Recursively copy a lazy file or directory tree from src to dst.
   * Existing target files are never overwritten; missing target
   * directories are created as needed.
   *
   * @param src  source file or directory
   * @param dst  corresponding target file or directory
   * @throws IOException  if a directory cannot be created or a copy fails
   */
  private static void mergeLazy(File src, File dst)
    throws IOException
  {
    // If the source is a file, copy it.
    if (src.isFile())
    {
      // If the target file already exists, don't overwrite.
      if (dst.isFile())
        return;

      // Copy away.
      Path.copyFile(src, dst);
      return;
    }

    // If the source is a directory, create the corresponding target
    // directory, and copy the files and sub-directories.
    //
    if (src.isDirectory())
    {
      if (!dst.isDirectory()) {
        if (!Path.createPath(dst.toString()))
          throw new IOException("Error creating lazy file directory '" + dst + "'");
      }

      // Process each sub-file
      String[] subFiles = src.list();
      for (int i = 0; i < subFiles.length; i++) {
        mergeLazy(new File(src, subFiles[i]), new File(dst, subFiles[i]));
      } // for
    } // if
  } // mergeLazy()

  //////////////////////////////////////////////////////////////////////////////

  /**
   * Merge all the source Lucene indexes into the target index, then
   * optimize and close the target writer.
   *
   * @param writer    open writer on the target index (closed on return)
   * @param dirInfos  target (element 0) followed by source index infos
   * @throws IOException  if the Lucene merge or optimize fails
   */
  private static void mergeLucene(IndexWriter writer, DirInfo[] dirInfos)
    throws IOException
  {
    Trace.info("Processing Lucene indexes (can be very time-consuming) ... ");
    Directory[] dirs = new Directory[dirInfos.length - 1];
    for (int i = 1; i < dirInfos.length; i++)
      dirs[i - 1] = dirInfos[i].dir;
    writer.addIndexes(dirs);
    writer.optimize();
    writer.close();
    Trace.more("Done.");
  } // mergeLucene()

  //////////////////////////////////////////////////////////////////////////////

  /** Holds the path, Lucene directory, and index parameters of one index
   *  participating in the merge.
   */
  private static class DirInfo
  {
    public DirInfo(String idxPath, Directory srcDir) {
      this.path = idxPath;
      this.dir = srcDir;
    }

    String path;           // filesystem path of the index directory
    Directory dir;         // Lucene directory (null if target is being created)
    int chunkSize;         // text chunk size used when the index was built
    int chunkOverlap;      // overlap between adjacent chunks
    String stopWords;      // stop-word list the index was built with
    String pluralMapName;  // file name of the plural map
    String accentMapName;  // file name of the accent map
  } // class DirInfo
} // class IndexMerge