/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.logging.Logger;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter; // javadocs
import org.apache.lucene.store.*;
import org.apache.lucene.util.Version;
/** Sort an index by document importance factor. Higher scoring documents are
* assigned smaller document numbers. Document weights are obtained from a
* specified field, which has to be single-valued and stored, with string value
* that represents a float number. Stored fields in the output index remain
* consistent, i.e. both stored fields and postings are renumbered in sync.
*
* <p><b>NOTE</b>: this tool is unaware of documents added
* atomically via {@link IndexWriter#addDocuments} or {@link
* IndexWriter#updateDocuments}, which means it can easily
* break up such document groups.
*/
public class IndexSorter {
private static final Logger LOG = Logger.getLogger(IndexSorter.class.getName());
/**
 * One buffered posting: the remapped document number together with the file
 * pointer at which that document's term frequency and positions were written
 * to the temporary buffer. Instances order by {@code newDoc}, ascending.
 */
private static class PostingMap implements Comparable<PostingMap> {
  /** Document number in the new (sorted) numbering. */
  private int newDoc;
  /** File pointer into the temporary buffer where this posting's data starts. */
  private long offset;

  /**
   * Orders postings by new document number, ascending.
   *
   * @param pm the posting to compare against
   * @return a negative value, zero or a positive value as this posting's
   *         {@code newDoc} is smaller than, equal to or larger than {@code pm}'s
   */
  public int compareTo(PostingMap pm) {
    // Explicit comparisons instead of "this.newDoc - pm.newDoc": a subtraction
    // yields the wrong sign whenever the difference overflows an int.
    if (this.newDoc < pm.newDoc) {
      return -1;
    }
    return this.newDoc == pm.newDoc ? 0 : 1;
  }
}
/**
 * A {@link TermPositions} that replays the postings of a wrapped enumerator
 * in the order of the remapped document numbers.
 *
 * <p>On every {@link #seek(TermEnum)} all postings of the term are read from
 * the wrapped enumerator, their term frequency and delta-encoded positions are
 * written to a file in a private {@link RAMDirectory}, and the per-document
 * entries are sorted by new document number. {@link #next()} then walks the
 * sorted entries and reads each posting back from that buffer.
 *
 * <p>Only term frequencies and positions are buffered; payloads of the wrapped
 * enumerator are not carried over (the payload methods always report "none").
 * {@link #seek(Term)}, {@link #read(int[], int[])} and {@link #skipTo(int)}
 * are not supported.
 */
private static class SortedTermPositions implements TermPositions {
  /** Name of the single buffer file kept in {@link #tempDir}. */
  private static final String TEMP_FILE = "temp";

  private TermPositions original;
  /** Maps an old document number to its new document number. */
  private int[] oldToNew;
  /** Number of valid entries in {@link #postingMaps} for the current term. */
  private int docFreq;
  /** Reused across terms; only grows. */
  private PostingMap[] postingMaps = new PostingMap[0];
  /** Index into {@link #postingMaps} of the current posting, -1 before the first. */
  private int pointer;
  private int freq;
  private int position;
  private final RAMDirectory tempDir = new RAMDirectory();
  private RAMOutputStream out;
  private IndexInput in;

  /**
   * @param original the enumerator whose postings are re-ordered
   * @param oldToNew mapping from old to new document numbers
   * @throws IllegalStateException if the temporary buffer cannot be created
   */
  public SortedTermPositions(TermPositions original, int[] oldToNew) {
    this.original = original;
    this.oldToNew = oldToNew;
    try {
      out = (RAMOutputStream) tempDir.createOutput(TEMP_FILE);
    } catch (IOException ioe) {
      // Without the buffer every later seek() would die with a
      // NullPointerException on "out"; fail right here with the real cause.
      throw new IllegalStateException("Error creating temporary output", ioe);
    }
  }

  /** Not supported; use {@link #seek(TermEnum)}. */
  public void seek(Term term) throws IOException {
    throw new UnsupportedOperationException();
  }

  /**
   * Positions the wrapped enumerator on the current term of {@code terms},
   * buffers all of its postings and sorts them by new document number.
   */
  public void seek(TermEnum terms) throws IOException {
    original.seek(terms);
    docFreq = terms.docFreq();
    pointer = -1;

    if (docFreq > postingMaps.length) {           // grow postingMaps
      PostingMap[] grown = new PostingMap[docFreq];
      System.arraycopy(postingMaps, 0, grown, 0, postingMaps.length);
      for (int k = postingMaps.length; k < docFreq; k++) {
        grown[k] = new PostingMap();
      }
      postingMaps = grown;
    }

    out.reset();

    int count = 0;
    while (original.next()) {
      PostingMap map = postingMaps[count++];
      map.newDoc = oldToNew[original.doc()];      // remap the doc id
      map.offset = out.getFilePointer();          // remember where this posting starts
      final int tf = original.freq();             // buffer tf & positions
      out.writeVInt(tf);
      int prevPosition = 0;
      for (int j = tf; j > 0; j--) {              // delta encode positions
        int p = original.nextPosition();
        out.writeVInt(p - prevPosition);
        prevPosition = p;
      }
    }
    out.flush();
    docFreq = count;                              // fewer postings than terms.docFreq() may be returned
    Arrays.sort(postingMaps, 0, docFreq);         // re-sort by mapped doc ids

    // Release the input opened for the previous term before opening a new one.
    if (in != null) {
      in.close();
    }
    // NOTE: this might be substantially faster if RAMInputStream were public
    // and supported a reset() operation.
    in = tempDir.openInput(TEMP_FILE);
  }

  /** Advances to the next posting in new-document order; returns false when exhausted. */
  public boolean next() throws IOException {
    pointer++;
    if (pointer >= docFreq) {
      return false;
    }
    in.seek(postingMaps[pointer].offset);
    freq = in.readVInt();
    position = 0;                                 // positions are deltas, restart per document
    return true;
  }

  /** Returns the new document number of the current posting. */
  public int doc() {
    return postingMaps[pointer].newDoc;
  }

  /** Returns the term frequency read for the current posting. */
  public int freq() {
    return freq;
  }

  /** Reads the next position delta from the buffer and returns the absolute position. */
  public int nextPosition() throws IOException {
    position += in.readVInt();
    return position;
  }

  /** Not supported. */
  public int read(int[] docs, int[] freqs) {
    throw new UnsupportedOperationException();
  }

  /** Not supported. */
  public boolean skipTo(int target) {
    throw new UnsupportedOperationException();
  }

  /** Payloads are not buffered; always returns null. */
  public byte[] getPayload(byte[] data, int offset) throws IOException {
    return null;
  }

  /** Payloads are not buffered; always returns 0. */
  public int getPayloadLength() {
    return 0;
  }

  /** Payloads are not buffered; always returns false. */
  public boolean isPayloadAvailable() {
    return false;
  }

  /** Closes the wrapped enumerator and the streams of the temporary buffer. */
  public void close() throws IOException {
    try {
      original.close();
    } finally {
      try {
        if (in != null) {
          in.close();
          in = null;
        }
      } finally {
        out.close();
      }
    }
  }
}
/**
 * A {@link FilterIndexReader} that exposes the wrapped reader's documents
 * under new document numbers, as given by an old-to-new mapping.
 *
 * <p>Stored fields, term vectors, norms and term positions are translated
 * between the two numberings. Every document number is reported as live
 * ({@link #isDeleted(int)} is always false). Mutating operations,
 * {@link #termDocs()} and {@link #norms(String)} are not supported.
 */
private static class SortingReader extends FilterIndexReader {
  /** Old document number to new document number; -1 entries are skipped. */
  private int[] oldToNew;
  /** Inverse of {@link #oldToNew}: new document number to old document number. */
  private int[] newToOld;

  /**
   * @param oldReader the reader to wrap
   * @param oldToNew  mapping from old to new document numbers; an entry of -1
   *                  means the old document has no new number
   */
  public SortingReader(IndexReader oldReader, int[] oldToNew) {
    super(oldReader);
    this.oldToNew = oldToNew;
    this.newToOld = new int[oldReader.maxDoc()];
    for (int oldDoc = 0; oldDoc < oldToNew.length; oldDoc++) {
      int newDoc = oldToNew[oldDoc];
      if (newDoc != -1) {
        newToOld[newDoc] = oldDoc;
      }
    }
  }

  /**
   * Always returns null.
   * NOTE(review): presumably so that consumers go through this reader's
   * remapping methods instead of the wrapped reader's sub-readers - confirm.
   */
  @Override
  public IndexReader[] getSequentialSubReaders() {
    return null;
  }

  /** Returns the stored fields of the document with new number {@code n}. */
  @Override
  public Document document(int n) throws IOException {
    return document(n, null);
  }

  /** Returns the selected stored fields of the document with new number {@code n}. */
  @Override
  public Document document(int n, FieldSelector fieldSelector)
      throws CorruptIndexException, IOException {
    return super.document(newToOld[n], fieldSelector);
  }

  /** Always false: this view reports every document number as live. */
  @Override
  public boolean isDeleted(int n) {
    return false;
  }

  /** Not supported; use {@link #norms(String, byte[], int)}. */
  @Override
  public byte[] norms(String f) throws IOException {
    throw new UnsupportedOperationException();
  }

  /**
   * Copies the norms of field {@code f} into {@code norms}, re-ordered to the
   * new document numbering and starting at index {@code offset}.
   *
   * @param f      field name
   * @param norms  destination array
   * @param offset index in {@code norms} that corresponds to new document 0
   */
  @Override
  public void norms(String f, byte[] norms, int offset) throws IOException {
    byte[] oldNorms = super.norms(f);
    if (oldNorms == null) {
      // NOTE(review): the wrapped reader appears to return null when the field
      // has no stored norms - confirm. Let it fill in whatever it reports for
      // that case instead of dereferencing null below.
      super.norms(f, norms, offset);
      return;
    }
    for (int oldDoc = 0; oldDoc < oldNorms.length; oldDoc++) {
      int newDoc = oldToNew[oldDoc];
      if (newDoc != -1) {
        // Honor the caller-supplied offset; it was previously ignored, which
        // wrote to the wrong slots whenever offset != 0.
        norms[offset + newDoc] = oldNorms[oldDoc];
      }
    }
  }

  /** Not supported: this reader is read-only. */
  @Override
  protected void doSetNorm(int d, String f, byte b) throws IOException {
    throw new UnsupportedOperationException();
  }

  /** Not supported; use {@link #termPositions()}. */
  @Override
  public TermDocs termDocs() throws IOException {
    throw new UnsupportedOperationException();
  }

  /** Returns a positions enumerator that yields postings in new-document order. */
  @Override
  public TermPositions termPositions() throws IOException {
    return new SortedTermPositions(super.termPositions(), oldToNew);
  }

  /** Returns the term vectors of the document with new number {@code docNumber}. */
  @Override
  public TermFreqVector[] getTermFreqVectors(int docNumber)
      throws IOException {
    return super.getTermFreqVectors(newToOld[docNumber]);
  }

  /** Not supported: this reader is read-only. */
  @Override
  protected void doDelete(int n) throws IOException {
    throw new UnsupportedOperationException();
  }
}
/**
 * Pairs an old document number with its score. Instances order by score,
 * highest first; equal scores order by old document number, ascending.
 * NaN scores order after every other score.
 */
private static class DocScore implements Comparable<DocScore> {
  /** Document number in the source index. */
  private int oldDoc;
  /** Sort key; larger values sort first. */
  private float score;

  /**
   * Compares by descending score, then by ascending {@code oldDoc}.
   *
   * @param that the entry to compare against
   * @return a negative value if this entry sorts before {@code that}, zero if
   *         both have the same score and document number, a positive value otherwise
   */
  public int compareTo(DocScore that) {
    if (this.score > that.score) {
      return -1;
    }
    if (this.score < that.score) {
      return 1;
    }
    if (this.score != that.score) {
      // Neither greater nor smaller nor equal: at least one score is NaN.
      // The previous code answered -1 in both directions here, which breaks
      // the compareTo contract and can make Arrays.sort misbehave.
      boolean thisNaN = Float.isNaN(this.score);
      boolean thatNaN = Float.isNaN(that.score);
      if (thisNaN != thatNaN) {
        return thisNaN ? 1 : -1;                  // NaN sorts last
      }
    }
    // Same score (or both NaN): fall back to the old document order.
    // Explicit comparisons rather than subtraction, which can overflow.
    if (this.oldDoc < that.oldDoc) {
      return -1;
    }
    return this.oldDoc == that.oldDoc ? 0 : 1;
  }

  @Override
  public String toString() {
    return "oldDoc=" + oldDoc + ",score=" + score;
  }
}
/** Creates a sorter. The sorter keeps no per-instance state. */
public IndexSorter() {
  super();
}
/**
 * Writes the index found in {@code input} to {@code output} with its
 * documents renumbered by the value of {@code field}, highest value first.
 *
 * @param input  directory holding the index to read
 * @param output directory the re-ordered index is added to
 * @param field  name of the field whose value is parsed as the sort score
 * @throws IOException if reading the input or writing the output fails
 */
public void sort(Directory input, Directory output, String field) throws IOException {
  LOG.info("IndexSorter: starting.");
  long start = System.currentTimeMillis();
  IndexReader reader = IndexReader.open(input, true);
  try {
    SortingReader sorter = new SortingReader(reader, oldToNew(reader, field));
    IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_31, new WhitespaceAnalyzer(Version.LUCENE_31));
    IndexWriter writer = new IndexWriter(output, cfg);
    boolean success = false;
    try {
      writer.addIndexes(new IndexReader[] { sorter });
      writer.close();
      success = true;
    } finally {
      if (!success) {
        // Do not leave the writer (and its write lock) open after a failure;
        // discard the partial work and let the original exception propagate.
        try {
          writer.rollback();
        } catch (IOException ioe) {
          LOG.warning("IndexSorter: error rolling back writer: " + ioe);
        }
      }
    }
  } finally {
    // The reader was previously never closed.
    reader.close();
  }
  long end = System.currentTimeMillis();
  LOG.info("IndexSorter: done, " + (end - start)
      + " total milliseconds");
}
/**
 * Builds the mapping from old to new document numbers.
 *
 * <p>Each document's score is the stored value of {@code field} parsed as a
 * float. Deleted documents, documents without the field and documents whose
 * value does not parse all get a score of 0. Documents are then ordered with
 * {@link DocScore#compareTo} and numbered in that order.
 *
 * @param reader the index to read scores from
 * @param field  name of the stored field holding the score
 * @return an array indexed by old document number whose entries are the new
 *         document numbers
 * @throws IOException if a document cannot be read
 */
private static int[] oldToNew(IndexReader reader, String field) throws IOException {
  final int readerMax = reader.maxDoc();
  final DocScore[] newToOld = new DocScore[readerMax];
  final FieldSelector fSel = new MapFieldSelector(field);

  for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
    float score = 0.0f;                           // default for deleted / missing / unparsable
    if (!reader.isDeleted(oldDoc)) {
      String value = reader.document(oldDoc, fSel).get(field);
      // Explicit null check and a narrow catch instead of catch (Exception):
      // the outcome for missing or malformed values is unchanged, but
      // unrelated failures are no longer silently swallowed.
      if (value != null) {
        try {
          score = Float.parseFloat(value);
        } catch (NumberFormatException nfe) {
          score = 0.0f;
        }
      }
    }
    DocScore docScore = new DocScore();
    docScore.oldDoc = oldDoc;
    docScore.score = score;
    newToOld[oldDoc] = docScore;
  }

  Arrays.sort(newToOld);

  // After sorting, the array index is the new document number.
  final int[] oldToNew = new int[readerMax];
  for (int newDoc = 0; newDoc < readerMax; newDoc++) {
    oldToNew[newToOld[newDoc].oldDoc] = newDoc;
  }
  return oldToNew;
}
/**
 * Command-line entry point: {@code IndexSorter <input> <output> <field>}.
 *
 * <p>Opens the index in the {@code input} directory, creates the
 * {@code output} directory if it does not exist, and runs
 * {@link #sort(Directory, Directory, String)}. Exits with status -1 when too
 * few arguments are given or when sorting fails.
 *
 * @param args input directory, output directory and score field name
 * @throws Exception if a directory cannot be opened or closed
 */
public static void main(String[] args) throws Exception {
  String usage = "IndexSorter <input> <output> <field>";
  if (args.length < 3) {
    System.err.println("Usage: " + usage);
    System.exit(-1);
  }

  boolean failed = false;
  Directory input = FSDirectory.open(new File(args[0]));
  try {
    File out = new File(args[1]);
    if (!out.exists()) {
      out.mkdirs();
    }
    Directory output = FSDirectory.open(out);
    try {
      String field = args[2];
      IndexSorter sorter = new IndexSorter();
      sorter.sort(input, output, field);
    } catch (Exception e) {
      // Attach the throwable so the stack trace is not lost.
      LOG.log(java.util.logging.Level.WARNING, "IndexSorter: " + e, e);
      failed = true;
    } finally {
      output.close();
    }
  } finally {
    input.close();
  }

  // Exit only after the directories are closed: System.exit() skips finally blocks.
  if (failed) {
    System.exit(-1);
  }
}
}