package org.apache.lucene.sandbox.queries;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import java.io.IOException;
/**
* Filter to remove duplicate values from search results.
* <p>
* WARNING: for this to work correctly, you may have to wrap
* your reader as it cannot current deduplicate across different
* index segments.
*
* @see SlowCompositeReaderWrapper
*/
public class DuplicateFilter extends Filter {
// TODO: make duplicate filter aware of ReaderContext such that we can
// filter duplicates across segments
/**
* KeepMode determines which document id to consider as the master, all others being
* identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
*/
public enum KeepMode {
KM_USE_FIRST_OCCURRENCE, KM_USE_LAST_OCCURRENCE
}
private KeepMode keepMode;
/**
* "Full" processing mode starts by setting all bits to false and only setting bits
* for documents that contain the given field and are identified as none-duplicates.
* <p/>
* "Fast" processing sets all bits to true then unsets all duplicate docs found for the
* given field. This approach avoids the need to read DocsEnum for terms that are seen
* to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
* faster approach , the downside is that bitsets produced will include bits set for
* documents that do not actually contain the field given.
*/
public enum ProcessingMode {
PM_FULL_VALIDATION, PM_FAST_INVALIDATION
}
private ProcessingMode processingMode;
private String fieldName;
public DuplicateFilter(String fieldName) {
this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION);
}
public DuplicateFilter(String fieldName, KeepMode keepMode, ProcessingMode processingMode) {
this.fieldName = fieldName;
this.keepMode = keepMode;
this.processingMode = processingMode;
}
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
if (processingMode == ProcessingMode.PM_FAST_INVALIDATION) {
return fastBits(context.reader(), acceptDocs);
} else {
return correctBits(context.reader(), acceptDocs);
}
}
private FixedBitSet correctBits(AtomicReader reader, Bits acceptDocs) throws IOException {
FixedBitSet bits = new FixedBitSet(reader.maxDoc()); //assume all are INvalid
Terms terms = reader.fields().terms(fieldName);
if (terms == null) {
return bits;
}
TermsEnum termsEnum = terms.iterator(null);
DocsEnum docs = null;
while (true) {
BytesRef currTerm = termsEnum.next();
if (currTerm == null) {
break;
} else {
docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE);
int doc = docs.nextDoc();
if (doc != DocIdSetIterator.NO_MORE_DOCS) {
if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
bits.set(doc);
} else {
int lastDoc = doc;
while (true) {
lastDoc = doc;
doc = docs.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
}
bits.set(lastDoc);
}
}
}
}
return bits;
}
private FixedBitSet fastBits(AtomicReader reader, Bits acceptDocs) throws IOException {
FixedBitSet bits = new FixedBitSet(reader.maxDoc());
bits.set(0, reader.maxDoc()); //assume all are valid
Terms terms = reader.fields().terms(fieldName);
if (terms == null) {
return bits;
}
TermsEnum termsEnum = terms.iterator(null);
DocsEnum docs = null;
while (true) {
BytesRef currTerm = termsEnum.next();
if (currTerm == null) {
break;
} else {
if (termsEnum.docFreq() > 1) {
// unset potential duplicates
docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE);
int doc = docs.nextDoc();
if (doc != DocIdSetIterator.NO_MORE_DOCS) {
if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
doc = docs.nextDoc();
}
}
int lastDoc = -1;
while (true) {
lastDoc = doc;
bits.clear(lastDoc);
doc = docs.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
}
if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE) {
// restore the last bit
bits.set(lastDoc);
}
}
}
}
return bits;
}
public String getFieldName() {
return fieldName;
}
public void setFieldName(String fieldName) {
this.fieldName = fieldName;
}
public KeepMode getKeepMode() {
return keepMode;
}
public void setKeepMode(KeepMode keepMode) {
this.keepMode = keepMode;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if ((obj == null) || (obj.getClass() != this.getClass())) {
return false;
}
DuplicateFilter other = (DuplicateFilter) obj;
return keepMode == other.keepMode &&
processingMode == other.processingMode &&
fieldName != null && fieldName.equals(other.fieldName);
}
@Override
public int hashCode() {
int hash = 217;
hash = 31 * hash + keepMode.hashCode();
hash = 31 * hash + processingMode.hashCode();
hash = 31 * hash + fieldName.hashCode();
return hash;
}
public ProcessingMode getProcessingMode() {
return processingMode;
}
public void setProcessingMode(ProcessingMode processingMode) {
this.processingMode = processingMode;
}
}