DuplicateFilterTest.java example

Explorer
solrcene-master
package org.apache.lucene.search;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Random;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef;

/**
 * Tests {@link DuplicateFilter} against a small index in which several
 * documents share the same value in the {@link #KEY_FIELD} ("url") field.
 *
 * Every test runs the same term query (text:lucene) and checks which of the
 * documents sharing a url survive the filter.
 */
public class DuplicateFilterTest extends LuceneTestCase {
	/** Name of the field whose values the filter de-duplicates on. */
	private static final String KEY_FIELD = "url";
	private Directory directory;
	private IndexReader reader;
	/** Query shared by all tests: a term query for "lucene" in the "text" field. */
	TermQuery tq=new TermQuery(new Term("text","lucene"));
	private IndexSearcher searcher;

	/**
	 * Builds the eight-document index used by every test and opens a reader
	 * and searcher on it.
	 */
	@Override
	protected void setUp() throws Exception {
		super.setUp();
		Random random = newRandom();
		directory = newDirectory(random);
		RandomIndexWriter writer = new RandomIndexWriter(random, directory);

		//Add series of docs with filterable fields : url, text and dates  flags
		addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
		addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
		addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
		addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
		addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
		addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
		addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
		addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");

		// Until we fix LUCENE-2348, the index must
		// have only 1 segment:
		writer.optimize();

		reader = writer.getReader();
		writer.close();
		searcher =new IndexSearcher(reader);
	}

	/** Releases the reader, searcher and directory opened by {@link #setUp()}. */
	@Override
	protected void tearDown() throws Exception {
		reader.close();
		searcher.close();
		directory.close();
		super.tearDown();
	}

	/**
	 * Adds one document with stored "url", "text" and "date" fields.
	 *
	 * @param writer writer the document is added to
	 * @param url    value of the {@link #KEY_FIELD} field (indexed as a single, un-analyzed term)
	 * @param text   value of the analyzed "text" field
	 * @param date   value of the analyzed "date" field
	 * @throws IOException if the writer fails to add the document
	 */
	private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException
	{
		Document doc=new Document();
		doc.add(new Field(KEY_FIELD,url,Field.Store.YES,Field.Index.NOT_ANALYZED));
		doc.add(new Field("text",text,Field.Store.YES,Field.Index.ANALYZED));
		doc.add(new Field("date",date,Field.Store.YES,Field.Index.ANALYZED));
		writer.addDocument(doc);
	}

	/**
	 * Loads the stored url of every hit and fails if any url occurs twice.
	 *
	 * @param hits search results to inspect
	 * @return the distinct urls seen across {@code hits}
	 * @throws IOException if a stored document cannot be loaded
	 */
	private HashSet<String> assertNoDuplicateUrls(ScoreDoc[] hits) throws IOException
	{
		HashSet<String> urls=new HashSet<String>();
		for(ScoreDoc hit : hits)
		{
			String url=searcher.doc(hit.doc).get(KEY_FIELD);
			// HashSet.add returns false when the url has been seen before
			assertTrue("No duplicate urls should be returned",urls.add(url));
		}
		return urls;
	}

	/**
	 * Opens an enumeration over the documents whose {@link #KEY_FIELD} is {@code url}.
	 *
	 * @param url exact url term to look up
	 * @return a non-null enumeration for that term
	 * @throws IOException if the index cannot be read
	 */
	private DocsEnum docsForUrl(String url) throws IOException
	{
		DocsEnum td = MultiFields.getTermDocsEnum(reader,
				MultiFields.getDeletedDocs(reader),
				KEY_FIELD,
				new BytesRef(url));
		// Fail with a readable message instead of a NullPointerException in the caller
		assertNotNull("No postings found for url "+url,td);
		return td;
	}

	/** A filter built with only the field name must never let two hits share a url. */
	public void testDefaultFilter() throws Throwable
	{
		DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
		ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
		// Without this check the duplicate assertion passes vacuously on an empty result
		assertTrue("Filtered searching should have found some matches",hits.length>0);
		assertNoDuplicateUrls(hits);
	}

	/** Sanity check: without any filter the query does return duplicate urls. */
	public void testNoFilter() throws Throwable
	{
		HashSet<String> results=new HashSet<String>();
		ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs;
		assertTrue("Default searching should have found some matches",hits.length>0);
		boolean dupsFound=false;
		for(ScoreDoc hit : hits)
		{
			String url=searcher.doc(hit.doc).get(KEY_FIELD);
			if(!results.add(url))
			{
				dupsFound=true;
			}
		}
		assertTrue("Default searching should have found duplicate urls",dupsFound);
	}

	/** With PM_FAST_INVALIDATION the query must return exactly one hit for each of the two urls. */
	public void testFastFilter() throws Throwable
	{
		DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
		df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
		ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
		assertTrue("Filtered searching should have found some matches",hits.length>0);
		HashSet<String> results=assertNoDuplicateUrls(hits);
		assertEquals("Two urls found",2, results.size());
	}

	/** With KM_USE_LAST_OCCURRENCE every hit must be the last document enumerated for its url. */
	public void testKeepsLastFilter() throws Throwable
	{
		DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
		df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
		ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
		assertTrue("Filtered searching should have found some matches",hits.length>0);
		for(ScoreDoc hit : hits)
		{
			String url=searcher.doc(hit.doc).get(KEY_FIELD);
			DocsEnum td = docsForUrl(url);
			// Start from an impossible doc id so an empty enumeration cannot
			// accidentally agree with a hit on document 0
			int lastDoc=-1;
			while(td.nextDoc() != DocsEnum.NO_MORE_DOCS)
			{
				lastDoc=td.docID();
			}
			assertEquals("Duplicate urls should return last doc",lastDoc, hit.doc);
		}
	}

	/** With KM_USE_FIRST_OCCURRENCE every hit must be the first document enumerated for its url. */
	public void testKeepsFirstFilter() throws Throwable
	{
		DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
		df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
		ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
		assertTrue("Filtered searching should have found some matches",hits.length>0);
		for(ScoreDoc hit : hits)
		{
			String url=searcher.doc(hit.doc).get(KEY_FIELD);
			DocsEnum td = docsForUrl(url);
			int firstDoc=td.nextDoc();
			// The hit itself carries this url, so the enumeration cannot be empty
			assertTrue("No documents enumerated for url "+url,firstDoc != DocsEnum.NO_MORE_DOCS);
			assertEquals("Duplicate urls should return first doc",firstDoc, hit.doc);
		}
	}
}