package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.Bits;
public class DuplicateFilter extends Filter
{
String fieldName;
/**
* KeepMode determines which document id to consider as the master, all others being
* identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
*/
int keepMode=KM_USE_FIRST_OCCURRENCE;
public static final int KM_USE_FIRST_OCCURRENCE=1;
public static final int KM_USE_LAST_OCCURRENCE=2;
/**
* "Full" processing mode starts by setting all bits to false and only setting bits
* for documents that contain the given field and are identified as none-duplicates.
* "Fast" processing sets all bits to true then unsets all duplicate docs found for the
* given field. This approach avoids the need to read TermDocs for terms that are seen
* to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
* faster approach , the downside is that bitsets produced will include bits set for
* documents that do not actually contain the field given.
*
*/
int processingMode=PM_FULL_VALIDATION;
public static final int PM_FULL_VALIDATION=1;
public static final int PM_FAST_INVALIDATION=2;
public DuplicateFilter(String fieldName)
{
this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
}
public DuplicateFilter(String fieldName, int keepMode, int processingMode)
{
this.fieldName = fieldName;
this.keepMode = keepMode;
this.processingMode = processingMode;
}
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException
{
if(processingMode==PM_FAST_INVALIDATION)
{
return fastBits(reader);
}
else
{
return correctBits(reader);
}
}
private OpenBitSet correctBits(IndexReader reader) throws IOException {
OpenBitSet bits = new OpenBitSet(reader.maxDoc()); //assume all are INvalid
final Bits delDocs = MultiFields.getDeletedDocs(reader);
Terms terms = reader.fields().terms(fieldName);
if (terms != null) {
TermsEnum termsEnum = terms.iterator();
DocsEnum docs = null;
while(true) {
BytesRef currTerm = termsEnum.next();
if (currTerm == null) {
break;
} else {
docs = termsEnum.docs(delDocs, docs);
int doc = docs.nextDoc();
if (doc != docs.NO_MORE_DOCS) {
if (keepMode == KM_USE_FIRST_OCCURRENCE) {
bits.set(doc);
} else {
int lastDoc = doc;
while (true) {
lastDoc = doc;
doc = docs.nextDoc();
if (doc == docs.NO_MORE_DOCS) {
break;
}
}
bits.set(lastDoc);
}
}
}
}
}
return bits;
}
private OpenBitSet fastBits(IndexReader reader) throws IOException
{
OpenBitSet bits=new OpenBitSet(reader.maxDoc());
bits.set(0,reader.maxDoc()); //assume all are valid
final Bits delDocs = MultiFields.getDeletedDocs(reader);
Terms terms = reader.fields().terms(fieldName);
if (terms != null) {
TermsEnum termsEnum = terms.iterator();
DocsEnum docs = null;
while(true) {
BytesRef currTerm = termsEnum.next();
if (currTerm == null) {
break;
} else {
if (termsEnum.docFreq() > 1) {
// unset potential duplicates
docs = termsEnum.docs(delDocs, docs);
int doc = docs.nextDoc();
if (doc != docs.NO_MORE_DOCS) {
if (keepMode == KM_USE_FIRST_OCCURRENCE) {
doc = docs.nextDoc();
}
}
int lastDoc = -1;
while (true) {
lastDoc = doc;
bits.clear(lastDoc);
doc = docs.nextDoc();
if (doc == docs.NO_MORE_DOCS) {
break;
}
}
if (keepMode==KM_USE_LAST_OCCURRENCE) {
// restore the last bit
bits.set(lastDoc);
}
}
}
}
}
return bits;
}
public String getFieldName()
{
return fieldName;
}
public void setFieldName(String fieldName)
{
this.fieldName = fieldName;
}
public int getKeepMode()
{
return keepMode;
}
public void setKeepMode(int keepMode)
{
this.keepMode = keepMode;
}
@Override
public boolean equals(Object obj)
{
if(this == obj)
return true;
if((obj == null) || (obj.getClass() != this.getClass()))
return false;
DuplicateFilter other = (DuplicateFilter)obj;
return keepMode == other.keepMode &&
processingMode == other.processingMode &&
(fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
}
@Override
public int hashCode()
{
int hash = 217;
hash = 31 * hash + keepMode;
hash = 31 * hash + processingMode;
hash = 31 * hash + fieldName.hashCode();
return hash;
}
public int getProcessingMode()
{
return processingMode;
}
public void setProcessingMode(int processingMode)
{
this.processingMode = processingMode;
}
}