/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
/**
* Delete duplicate documents in a set of Lucene indexes.
* Duplicates have either the same contents (via MD5 hash) or the same URL.
*
* This tool uses the following algorithm:
*
* <ul>
* <li><b>Phase 1 - remove URL duplicates:</b><br/>
* In this phase documents with the same URL
* are compared, and only the most recent document is retained -
* all other URL duplicates are scheduled for deletion.</li>
* <li><b>Phase 2 - remove content duplicates:</b><br/>
* In this phase documents with the same content hash are compared. If
* property "dedup.keep.highest.score" is set to true (default) then only
* the document with the highest score is retained. If this property is set
* to false, only the document with the shortest URL is retained - all other
* content duplicates are scheduled for deletion.</li>
* <li><b>Phase 3 - delete documents:</b><br/>
* In this phase documents scheduled for deletion are marked as deleted in
* Lucene index(es).</li>
* </ul>
*
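* <p>Programmatic usage (a minimal sketch; the index paths below are only
* examples):</p>
* <pre>
*   Configuration conf = NutchConfiguration.create();
*   DeleteDuplicates dedup = new DeleteDuplicates(conf);
*   dedup.dedup(new Path[] {
*     new Path("crawl/indexes/part-00000"),
*     new Path("crawl/indexes/part-00001")
*   });
* </pre>
* <p>The same can be run from the command line, passing one or more index
* directories as arguments (see {@link #run(String[])}).</p>
*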
* @author Andrzej Bialecki
*/
public class DeleteDuplicates extends Configured
implements Tool,
           Mapper<WritableComparable, Writable, Text, IntWritable>,
           Reducer<Text, IntWritable, WritableComparable, Writable>,
           OutputFormat<WritableComparable, Writable> {
private static final Log LOG = LogFactory.getLog(DeleteDuplicates.class);
// Algorithm:
//
// 1. map indexes -> <url, IndexDoc(md5, url, time, urlLen, index, doc)>
//    reduce by url, marking all but the most recent document for deletion,
//    and emit <md5, IndexDoc>
//
// 2. map phase 1 output -> <md5, IndexDoc>
//    partition by md5
//    reduce, marking all but the document with the highest score (or the
//    shortest url) for deletion, and emit <url, IndexDoc>
//
// 3. map marked IndexDocs -> <index, doc id>
//    reduce by index, deleting the listed documents from each Lucene index
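/**
* A single document entry carried between the dedup jobs: its URL, score,
* timestamp and content hash, plus the index it came from, its document id
* within that index, and a flag saying whether it should be kept.
* Instances are compared first on the keep flag and content hash, then on
* time, URL length and score (see {@link #compareTo(Object)}).
*/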
public static class IndexDoc implements WritableComparable {
private Text url = new Text();
private int urlLen; // derived from url in readFields(); not serialized
private float score;
private long time;
private MD5Hash hash = new MD5Hash();
private Text index = new Text(); // the segment index
private int doc; // within the index
private boolean keep = true; // keep or discard
public String toString() {
return "[url=" + url + ",score=" + score + ",time=" + time
+ ",hash=" + hash + ",index=" + index + ",doc=" + doc
+ ",keep=" + keep + "]";
}
public void write(DataOutput out) throws IOException {
url.write(out);
out.writeFloat(score);
out.writeLong(time);
hash.write(out);
index.write(out);
out.writeInt(doc);
out.writeBoolean(keep);
}
public void readFields(DataInput in) throws IOException {
url.readFields(in);
urlLen = url.getLength();
score = in.readFloat();
time = in.readLong();
hash.readFields(in);
index.readFields(in);
doc = in.readInt();
keep = in.readBoolean();
}
public int compareTo(Object o) {
IndexDoc that = (IndexDoc)o;
if (this.keep != that.keep) {
return this.keep ? 1 : -1;
} else if (!this.hash.equals(that.hash)) { // order first by hash
return this.hash.compareTo(that.hash);
} else if (this.time != that.time) { // prefer more recent docs
return this.time > that.time ? 1 : -1;
} else if (this.urlLen != that.urlLen) { // prefer shorter urls
return this.urlLen - that.urlLen;
} else if (this.score != that.score) { // prefer higher scores
return this.score > that.score ? 1 : -1;
} else { // tie-break on doc id so equal docs compare as equal
return this.doc - that.doc;
}
}
public boolean equals(Object o) {
if (!(o instanceof IndexDoc)) return false;
IndexDoc that = (IndexDoc)o;
return this.keep == that.keep
&& this.hash.equals(that.hash)
&& this.time == that.time
&& this.score == that.score
&& this.urlLen == that.urlLen
&& this.index.equals(that.index)
&& this.doc == that.doc;
}
public int hashCode() {
return this.hash.hashCode() ^ this.index.hashCode() ^ this.doc;
}
}
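/**
* InputFormat that treats each Lucene index directory as a single split and
* iterates over its non-deleted documents, producing one &lt;url, IndexDoc&gt;
* pair per document.
*/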
public static class InputFormat extends FileInputFormat<Text, IndexDoc> {
private static final long INDEX_LENGTH = Integer.MAX_VALUE;
/** Return each index as a split. */
public InputSplit[] getSplits(JobConf job, int numSplits)
throws IOException {
FileStatus[] files = listStatus(job);
InputSplit[] splits = new InputSplit[files.length];
for (int i = 0; i < files.length; i++) {
FileStatus cur = files[i];
splits[i] = new FileSplit(cur.getPath(), 0, INDEX_LENGTH, (String[])null);
}
return splits;
}
public class DDRecordReader implements RecordReader<Text, IndexDoc> {
private IndexReader indexReader;
private int maxDoc = 0;
private int doc = 0;
private Text index;
public DDRecordReader(FileSplit split, JobConf job,
Text index) throws IOException {
try {
indexReader = IndexReader.open(new FsDirectory(FileSystem.get(job), split.getPath(), false, job));
maxDoc = indexReader.maxDoc();
} catch (IOException ioe) {
LOG.warn("Can't open index at " + split + ", skipping. (" + ioe.getMessage() + ")");
indexReader = null;
}
this.index = index;
}
public boolean next(Text key, IndexDoc indexDoc)
throws IOException {
// skip empty indexes
if (indexReader == null || maxDoc <= 0)
return false;
// skip deleted documents
while (doc < maxDoc && indexReader.isDeleted(doc)) doc++;
if (doc >= maxDoc)
return false;
Document document = indexReader.document(doc);
// fill in key
key.set(document.get("url"));
// fill in value
indexDoc.keep = true;
indexDoc.url.set(document.get("url"));
indexDoc.hash.setDigest(document.get("digest"));
indexDoc.score = Float.parseFloat(document.get("boost"));
try {
indexDoc.time = DateTools.stringToTime(document.get("tstamp"));
} catch (Exception e) {
// try to figure out the time from segment name
try {
String segname = document.get("segment");
indexDoc.time = new SimpleDateFormat("yyyyMMddHHmmss").parse(segname).getTime();
// make it unique
indexDoc.time += doc;
} catch (Exception e1) {
// use current time
indexDoc.time = System.currentTimeMillis();
}
}
indexDoc.index = index;
indexDoc.doc = doc;
doc++;
return true;
}
public long getPos() throws IOException {
return maxDoc == 0 ? 0 : (doc*INDEX_LENGTH)/maxDoc;
}
public void close() throws IOException {
if (indexReader != null) indexReader.close();
}
public Text createKey() {
return new Text();
}
public IndexDoc createValue() {
return new IndexDoc();
}
public float getProgress() throws IOException {
return maxDoc == 0 ? 0.0f : (float)doc / (float)maxDoc;
}
}
/** Return a record reader that iterates over the documents of the index split. */
public RecordReader<Text, IndexDoc> getRecordReader(InputSplit split,
JobConf job,
Reporter reporter) throws IOException {
FileSplit fsplit = (FileSplit)split;
Text index = new Text(fsplit.getPath().toString());
reporter.setStatus(index.toString());
return new DDRecordReader(fsplit, job, index);
}
}
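/** Partition phase 2 records by their MD5 content hash so that all documents with the same hash meet in a single reducer. */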
public static class HashPartitioner implements Partitioner<MD5Hash, Writable> {
public void configure(JobConf job) {}
public void close() {}
public int getPartition(MD5Hash key, Writable value,
int numReduceTasks) {
int hashCode = key.hashCode();
return (hashCode & Integer.MAX_VALUE) % numReduceTasks;
}
}
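/**
* Phase 1 reducer: for each URL keep only the most recent document, mark all
* older duplicates for deletion, and re-key the output by content hash for
* phase 2.
*/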
public static class UrlsReducer implements Reducer<Text, IndexDoc, MD5Hash, IndexDoc> {
public void configure(JobConf job) {}
public void close() {}
private IndexDoc latest = new IndexDoc();
public void reduce(Text key, Iterator<IndexDoc> values,
OutputCollector<MD5Hash, IndexDoc> output, Reporter reporter) throws IOException {
WritableUtils.cloneInto(latest, values.next());
while (values.hasNext()) {
IndexDoc value = values.next();
if (value.time > latest.time) {
// discard current and use more recent
latest.keep = false;
LOG.debug("-discard " + latest + ", keep " + value);
output.collect(latest.hash, latest);
WritableUtils.cloneInto(latest, value);
} else {
// discard
value.keep = false;
LOG.debug("-discard " + value + ", keep " + latest);
output.collect(value.hash, value);
}
}
// keep the latest
latest.keep = true;
output.collect(latest.hash, latest);
}
}
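/**
* Phase 2 reducer: for each content hash keep only the highest-scored
* document (or, when "dedup.keep.highest.score" is false, the document with
* the shortest URL) and mark all other content duplicates for deletion.
*/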
public static class HashReducer implements Reducer<MD5Hash, IndexDoc, Text, IndexDoc> {
boolean byScore;
public void configure(JobConf job) {
byScore = job.getBoolean("dedup.keep.highest.score", true);
}
public void close() {}
private IndexDoc highest = new IndexDoc();
public void reduce(MD5Hash key, Iterator<IndexDoc> values,
OutputCollector<Text, IndexDoc> output, Reporter reporter)
throws IOException {
boolean highestSet = false;
while (values.hasNext()) {
IndexDoc value = values.next();
// skip already deleted
if (!value.keep) {
LOG.debug("-discard " + value + " (already marked)");
output.collect(value.url, value);
continue;
}
if (!highestSet) {
WritableUtils.cloneInto(highest, value);
highestSet = true;
continue;
}
IndexDoc toDelete = null, toKeep = null;
boolean metric = byScore ? (value.score > highest.score) :
(value.urlLen < highest.urlLen);
if (metric) {
toDelete = highest;
toKeep = value;
} else {
toDelete = value;
toKeep = highest;
}
if (LOG.isDebugEnabled()) {
LOG.debug("-discard " + toDelete + ", keep " + toKeep);
}
toDelete.keep = false;
output.collect(toDelete.url, toDelete);
WritableUtils.cloneInto(highest, toKeep);
}
LOG.debug("-keep " + highest);
// no need to add this - in phase 2 we only process docs to delete them
// highest.keep = true;
// output.collect(key, highest);
}
}
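// Phase 3: this class itself serves as the mapper, reducer and output format
// of the deletion job. The mapper turns IndexDocs marked for deletion into
// <index, doc id> pairs, and the reducer deletes those documents from each
// Lucene index.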
private FileSystem fs;
public void configure(JobConf job) {
setConf(job);
}
public void setConf(Configuration conf) {
super.setConf(conf);
try {
if(conf != null) fs = FileSystem.get(conf);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public void close() {}
/** Map [*,IndexDoc] pairs to [index,doc] pairs, but only for documents marked for deletion. */
public void map(WritableComparable key, Writable value,
OutputCollector<Text, IntWritable> output, Reporter reporter)
throws IOException {
IndexDoc indexDoc = (IndexDoc)value;
// don't delete these
if (indexDoc.keep) return;
// delete all others
output.collect(indexDoc.index, new IntWritable(indexDoc.doc));
}
/** Delete docs named in values from index named in key. */
public void reduce(Text key, Iterator<IntWritable> values,
OutputCollector<WritableComparable, Writable> output, Reporter reporter)
throws IOException {
Path index = new Path(key.toString());
IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()));
try {
while (values.hasNext()) {
IntWritable value = values.next();
LOG.debug("-delete " + index + " doc=" + value);
reader.deleteDocument(value.get());
}
} finally {
reader.close();
}
}
/** Return a RecordWriter that rejects all writes; the deletion job produces no output records. */
public RecordWriter<WritableComparable, Writable> getRecordWriter(final FileSystem fs,
final JobConf job,
final String name,
final Progressable progress) throws IOException {
return new RecordWriter<WritableComparable, Writable>() {
public void write(WritableComparable key, Writable value)
throws IOException {
throw new UnsupportedOperationException();
}
public void close(Reporter reporter) throws IOException {}
};
}
public DeleteDuplicates() {
}
public DeleteDuplicates(Configuration conf) {
setConf(conf);
}
public void checkOutputSpecs(FileSystem fs, JobConf job) {}
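/**
* Run the three dedup jobs over the given Lucene index directories: dedup by
* URL, dedup by content hash, and finally deletion of the marked documents
* from the indexes. Intermediate results go to temporary "dedup-urls-*" and
* "dedup-hash-*" directories, which are removed once they are no longer
* needed.
*/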
public void dedup(Path[] indexDirs)
throws IOException {
if (LOG.isInfoEnabled()) { LOG.info("Dedup: starting"); }
Path outDir1 =
new Path("dedup-urls-"+
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf job = new NutchJob(getConf());
for (int i = 0; i < indexDirs.length; i++) {
if (LOG.isInfoEnabled()) {
LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
}
FileInputFormat.addInputPath(job, indexDirs[i]);
}
job.setJobName("dedup 1: urls by time");
job.setInputFormat(InputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IndexDoc.class);
job.setReducerClass(UrlsReducer.class);
FileOutputFormat.setOutputPath(job, outDir1);
job.setOutputKeyClass(MD5Hash.class);
job.setOutputValueClass(IndexDoc.class);
job.setOutputFormat(SequenceFileOutputFormat.class);
JobClient.runJob(job);
Path outDir2 =
new Path("dedup-hash-"+
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
job = new NutchJob(getConf());
job.setJobName("dedup 2: content by hash");
FileInputFormat.addInputPath(job, outDir1);
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapOutputKeyClass(MD5Hash.class);
job.setMapOutputValueClass(IndexDoc.class);
job.setPartitionerClass(HashPartitioner.class);
job.setSpeculativeExecution(false);
job.setReducerClass(HashReducer.class);
FileOutputFormat.setOutputPath(job, outDir2);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IndexDoc.class);
job.setOutputFormat(SequenceFileOutputFormat.class);
JobClient.runJob(job);
// remove outDir1 - no longer needed
fs.delete(outDir1, true);
job = new NutchJob(getConf());
job.setJobName("dedup 3: delete from index(es)");
FileInputFormat.addInputPath(job, outDir2);
job.setInputFormat(SequenceFileInputFormat.class);
//job.setInputKeyClass(Text.class);
//job.setInputValueClass(IndexDoc.class);
job.setInt("io.file.buffer.size", 4096);
job.setMapperClass(DeleteDuplicates.class);
job.setReducerClass(DeleteDuplicates.class);
job.setOutputFormat(DeleteDuplicates.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
JobClient.runJob(job);
fs.delete(outDir2, true);
if (LOG.isInfoEnabled()) { LOG.info("Dedup: done"); }
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new DeleteDuplicates(), args);
System.exit(res);
}
public int run(String[] args) throws Exception {
if (args.length < 1) {
System.err.println("Usage: DeleteDuplicates <indexes> ...");
return -1;
}
Path[] indexes = new Path[args.length];
for (int i = 0; i < args.length; i++) {
indexes[i] = new Path(args[i]);
}
try {
dedup(indexes);
return 0;
} catch (Exception e) {
LOG.fatal("DeleteDuplicates: " + StringUtils.stringifyException(e));
return -1;
}
}
}