/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.indexer;

import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MD5Hash;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.util.NutchConfiguration;

import junit.framework.TestCase;

public class TestDeleteDuplicates extends TestCase {
  Configuration conf;
  FileSystem fs;
  Path root;
  Path index1;
  Path index2;
  Path index3;
  Path index4;
  Path index5;

  public void setUp() throws Exception {
    conf = NutchConfiguration.create();
    conf.set("fs.default.name", "file:///");
    fs = FileSystem.get(conf);
    root = new Path("build/test/dedup2-test-" + new Random().nextInt());
    // create test indexes
    index1 = createIndex("index1", true, 1.0f, 10L, false);
    index2 = createIndex("index2", false, 2.0f, 20L, true);
    index3 = createIndex("index3", true, 1.0f, 10L, true);
    index4 = createSingleDocIndex("index4", 1.0f, 10L);
    index5 = createSingleDocIndex("index5", 1.0f, 20L);
  }
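  /**
   * Creates a two-document test index under <code>root/name/part-0000</code>.
   * When <code>hashDup</code> is true the two documents share the same digest
   * but have different urls; otherwise they share the same url but have
   * different digests. <code>incFirst</code> selects which of the two
   * documents receives the boost increment <code>inc</code>; the second
   * document always gets the newer timestamp (<code>time + 1</code>).
   */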
  private Path createIndex(String name, boolean hashDup, float inc, long time,
      boolean incFirst) throws Exception {
    Path idx = new Path(root, name);
    Path sub = new Path(idx, "part-0000");
    Directory dir = FSDirectory.getDirectory(sub.toString());
    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf),
        true, MaxFieldLength.UNLIMITED);
    Document doc = makeDoc(name, MD5Hash.digest("1").toString(),
        "http://www.example.com/1", 1.0f + (incFirst ? inc : 0.0f), time);
    writer.addDocument(doc);
    if (hashDup) {
      // same digest, different url
      doc = makeDoc(name, MD5Hash.digest("1").toString(),
          "http://www.example.com/2", 1.0f + (!incFirst ? inc : 0.0f), time + 1);
    } else {
      // same url, different digest
      doc = makeDoc(name, MD5Hash.digest("2").toString(),
          "http://www.example.com/1", 1.0f + (!incFirst ? inc : 0.0f), time + 1);
    }
    writer.addDocument(doc);
    writer.close();
    return idx;
  }

  private Path createSingleDocIndex(String name, float inc, long time)
      throws Exception {
    Path idx = new Path(root, name);
    Path sub = new Path(idx, "part-0000");
    Directory dir = FSDirectory.getDirectory(sub.toString());
    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf),
        true, MaxFieldLength.UNLIMITED);
    Document doc = makeDoc(name, MD5Hash.digest("1").toString(),
        "http://www.example.com/1", 1.0f + inc, time + 1);
    writer.addDocument(doc);
    writer.close();
    return idx;
  }

  private Document makeDoc(String segment, String digest, String url,
      float boost, long time) {
    Document doc = new Document();
    doc.add(new Field("segment", segment, Field.Store.YES, Field.Index.NO));
    doc.add(new Field("digest", digest, Field.Store.YES, Field.Index.NO));
    doc.add(new Field("url", url, Field.Store.YES, Field.Index.TOKENIZED));
    doc.setBoost(boost);
    doc.add(new Field("boost", "" + boost, Field.Store.YES, Field.Index.NO));
    doc.add(new Field("tstamp",
        DateTools.timeToString(time, Resolution.MILLISECOND),
        Field.Store.YES, Field.Index.NO));
    return doc;
  }

  public void tearDown() throws Exception {
    fs.delete(root, true);
  }
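  /**
   * Runs {@link DeleteDuplicates} over a single index containing two
   * documents with identical digests, and asserts that exactly one document
   * survives, namely the one with the given url.
   */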
  private void hashDuplicatesHelper(Path index, String url) throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index });
    FsDirectory dir = new FsDirectory(fs, new Path(index, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", 1, reader.numDocs());
    for (int i = 0; i < reader.maxDoc(); i++) {
      if (reader.isDeleted(i)) {
        System.out.println("-doc " + i + " deleted");
        continue;
      }
      Document doc = reader.document(i);
      // make sure we got the right one
      assertEquals("check url", url, doc.get("url"));
      System.out.println(doc);
    }
    reader.close();
  }

  public void testHashDuplicates() throws Exception {
    // in index1 the second doc has the higher boost, in index3 the first
    hashDuplicatesHelper(index1, "http://www.example.com/2");
    hashDuplicatesHelper(index3, "http://www.example.com/1");
  }

  public void testUrlDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", 1, reader.numDocs());
    // of the two docs with the same url, the more recent one should survive
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
      if (reader.isDeleted(i)) {
        System.out.println("-doc " + i + " deleted");
        continue;
      }
      Document doc = reader.document(i);
      // make sure we got the right one
      assertEquals("check hash", hash.toString(), doc.get("digest"));
      System.out.println(doc);
    }
    reader.close();
  }

  public void testMixedDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index1, index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", 1, reader.numDocs());
    for (int i = 0; i < reader.maxDoc(); i++) {
      if (reader.isDeleted(i)) {
        System.out.println("-doc " + i + " deleted");
        continue;
      }
      Document doc = reader.document(i);
      // make sure we got the right one
      assertEquals("check url", "http://www.example.com/2", doc.get("url"));
      System.out.println(doc);
    }
    reader.close();
    dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    reader = IndexReader.open(dir);
    assertEquals("only one doc left", 1, reader.numDocs());
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
      if (reader.isDeleted(i)) {
        System.out.println("-doc " + i + " deleted");
        continue;
      }
      Document doc = reader.document(i);
      // make sure we got the right one
      assertEquals("check hash", hash.toString(), doc.get("digest"));
      System.out.println(doc);
    }
    reader.close();
  }

  public void testRededuplicate() throws Exception {
    // running dedup a second time over already deduplicated indexes
    // must not fail
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index4, index5 });
    dedup.dedup(new Path[] { index4, index5 });
  }
}