package org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.IOException; import java.util.ArrayList; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.OpenBitSet; import org.apache.lucene.util.Bits; import org.apache.lucene.util.Version; /** * This tool splits input index into multiple equal parts. The method employed * here uses {@link IndexWriter#addIndexes(IndexReader[])} where the input data * comes from the input index with artificially applied deletes to the document * id-s that fall outside the selected partition. * <p>Note 1: Deletes are only applied to a buffered list of deleted docs and * don't affect the source index - this tool works also with read-only indexes. * <p>Note 2: the disadvantage of this tool is that source index needs to be * read as many times as there are parts to be created, hence the name of this * tool. */ public class MultiPassIndexSplitter { /** * Split source index into multiple parts. * @param input source index, can be read-only, can have deletions, can have * multiple segments (or multiple readers). * @param outputs list of directories where the output parts will be stored. * @param seq if true, then the source index will be split into equal * increasing ranges of document id-s. If false, source document id-s will be * assigned in a deterministic round-robin fashion to one of the output splits. * @throws IOException */ public void split(IndexReader input, Directory[] outputs, boolean seq) throws IOException { if (outputs == null || outputs.length < 2) { throw new IOException("Invalid number of outputs."); } if (input == null || input.numDocs() < 2) { throw new IOException("Not enough documents for splitting"); } int numParts = outputs.length; // wrap a potentially read-only input // this way we don't have to preserve original deletions because neither // deleteDocument(int) or undeleteAll() is applied to the wrapped input index. input = new FakeDeleteIndexReader(input); int maxDoc = input.maxDoc(); int partLen = maxDoc / numParts; for (int i = 0; i < numParts; i++) { input.undeleteAll(); if (seq) { // sequential range int lo = partLen * i; int hi = lo + partLen; // below range for (int j = 0; j < lo; j++) { input.deleteDocument(j); } // above range - last part collects all id-s that remained due to // integer rounding errors if (i < numParts - 1) { for (int j = hi; j < maxDoc; j++) { input.deleteDocument(j); } } } else { // round-robin for (int j = 0; j < maxDoc; j++) { if ((j + numParts - i) % numParts != 0) { input.deleteDocument(j); } } } IndexWriter w = new IndexWriter(outputs[i], new IndexWriterConfig( Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT)) .setOpenMode(OpenMode.CREATE)); System.err.println("Writing part " + (i + 1) + " ..."); w.addIndexes(new IndexReader[]{input}); w.close(); } System.err.println("Done."); } public static void main(String[] args) throws Exception { if (args.length < 5) { System.err.println("Usage: MultiPassIndexSplitter -out <outputDir> -num <numParts> [-seq] <inputIndex1> [<inputIndex2 ...]"); System.err.println("\tinputIndex\tpath to input index, multiple values are ok"); System.err.println("\t-out ouputDir\tpath to output directory to contain partial indexes"); System.err.println("\t-num numParts\tnumber of parts to produce"); System.err.println("\t-seq\tsequential docid-range split (default is round-robin)"); System.exit(-1); } ArrayList<IndexReader> indexes = new ArrayList<IndexReader>(); String outDir = null; int numParts = -1; boolean seq = false; for (int i = 0; i < args.length; i++) { if (args[i].equals("-out")) { outDir = args[++i]; } else if (args[i].equals("-num")) { numParts = Integer.parseInt(args[++i]); } else if (args[i].equals("-seq")) { seq = true; } else { File file = new File(args[i]); if (!file.exists() || !file.isDirectory()) { System.err.println("Invalid input path - skipping: " + file); continue; } Directory dir = FSDirectory.open(new File(args[i])); try { if (!IndexReader.indexExists(dir)) { System.err.println("Invalid input index - skipping: " + file); continue; } } catch (Exception e) { System.err.println("Invalid input index - skipping: " + file); continue; } indexes.add(IndexReader.open(dir, true)); } } if (outDir == null) { throw new Exception("Required argument missing: -out outputDir"); } if (numParts < 2) { throw new Exception("Invalid value of required argument: -num numParts"); } if (indexes.size() == 0) { throw new Exception("No input indexes to process"); } File out = new File(outDir); if (!out.mkdirs()) { throw new Exception("Can't create output directory: " + out); } Directory[] dirs = new Directory[numParts]; for (int i = 0; i < numParts; i++) { dirs[i] = FSDirectory.open(new File(out, "part-" + i)); } MultiPassIndexSplitter splitter = new MultiPassIndexSplitter(); IndexReader input; if (indexes.size() == 1) { input = indexes.get(0); } else { input = new MultiReader(indexes.toArray(new IndexReader[indexes.size()])); } splitter.split(input, dirs, seq); } /** * This class pretends that it can write deletions to the underlying index. * Instead, deletions are buffered in a bitset and overlaid with the original * list of deletions. */ public static class FakeDeleteIndexReader extends FilterIndexReader { // TODO: switch to flex api, here OpenBitSet dels; OpenBitSet oldDels = null; public FakeDeleteIndexReader(IndexReader in) { super(in); dels = new OpenBitSet(in.maxDoc()); if (in.hasDeletions()) { oldDels = new OpenBitSet(in.maxDoc()); final Bits oldDelBits = MultiFields.getDeletedDocs(in); assert oldDelBits != null; for (int i = 0; i < in.maxDoc(); i++) { if (oldDelBits.get(i)) oldDels.set(i); } dels.or(oldDels); } } @Override public int numDocs() { return in.maxDoc() - (int)dels.cardinality(); } /** * Just removes our overlaid deletions - does not undelete the original * deletions. */ @Override protected void doUndeleteAll() throws CorruptIndexException, IOException { dels = new OpenBitSet(in.maxDoc()); if (oldDels != null) { dels.or(oldDels); } } @Override protected void doDelete(int n) throws CorruptIndexException, IOException { dels.set(n); } @Override public boolean hasDeletions() { return !dels.isEmpty(); } @Override public IndexReader[] getSequentialSubReaders() { return null; } @Override public Bits getDeletedDocs() { return dels; } } }