package ivory.pwsim;

import ivory.core.RetrievalEnvironment;
import ivory.core.data.index.Posting;
import ivory.core.data.index.PostingsList;
import ivory.core.data.index.PostingsReader;
import ivory.core.data.stat.DocLengthTable;
import ivory.core.data.stat.DocLengthTable2B;
import ivory.pwsim.score.ScoringModel;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import tl.lin.data.map.HMapIFW;
import tl.lin.data.map.MapIF;
import edu.umd.cloud9.util.PowerTool;

/**
 * <p>
 * Computes pairwise document similarity given a document-sorted inverted
 * index. This implementation is based on the algorithms described in the
 * following papers:
 * </p>
 *
 * <ul>
 *
 * <li>Tamer Elsayed, Jimmy Lin, and Douglas Oard. <b><a
 * href="http://www.aclweb.org/anthology/P/P08/P08-2067.pdf">Pairwise Document
 * Similarity in Large Collections with MapReduce.</a></b> Proceedings of the
 * 46th Annual Meeting of the Association for Computational Linguistics (ACL
 * 2008), Companion Volume, pages 265-268, June 2008, Columbus, Ohio.</li>
 *
 * <li>Jimmy Lin. <b><a
 * href="http://portal.acm.org/citation.cfm?id=1571941.1571970">Brute Force and
 * Indexed Approaches to Pairwise Document Similarity Comparisons with
 * MapReduce.</a></b> Proceedings of the 32nd Annual International ACM SIGIR
 * Conference on Research and Development in Information Retrieval (SIGIR 2009),
 * pages 155-162, July 2009, Boston, Massachusetts.</li>
 *
 * </ul>
 *
 * @author Tamer Elsayed
 * @author Jimmy Lin
 */
public class PCP extends PowerTool {
  private static final Logger sLogger = Logger.getLogger(PCP.class);
  static {
    sLogger.setLevel(Level.INFO);
  }
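  /*
   * Dataflow, as implied by the types and job configuration below. Input:
   * SequenceFiles of (IntWritable, PostingsList) pairs read from the index's
   * postings directory, where each key identifies a term. Output: one
   * SequenceFile directory per block, holding (IntWritable docno, HMapIFW)
   * pairs, where each map associates similar docnos with similarity scores.
   */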
  private static class MyMapper extends MapReduceBase implements
      Mapper<IntWritable, PostingsList, IntWritable, HMapIFW> {

    // Table containing the lengths of all documents, used in computing similarity.
    private DocLengthTable mDocLengthTable;

    // Similarity measure.
    private ScoringModel mModel;

    // Threshold for filtering out common terms that don't contribute much to
    // similarity scores.
    private int dfCut;

    // Starting row (in the similarity matrix) to be computed.
    private int mBlockStart;

    // Ending row (in the similarity matrix) to be computed.
    private int mBlockEnd;

    // Collection size.
    private int mCollectionDocCount;

    public void configure(JobConf job) {
      mCollectionDocCount = job.getInt("Ivory.CollectionDocumentCount", -1);

      try {
        if (job.get("mapred.job.tracker").equals("local")) {
          FileSystem fs = FileSystem.getLocal(job);
          RetrievalEnvironment re = new RetrievalEnvironment(job.get("Ivory.IndexPath"), fs);
          Path path = re.getDoclengthsData();
          sLogger.debug("Reading doclengths: " + path);
          mDocLengthTable = new DocLengthTable2B(path, fs);
        } else {
          Path[] localFiles = DistributedCache.getLocalCacheFiles(job);
          mDocLengthTable = new DocLengthTable2B(localFiles[0], FileSystem.getLocal(job));
        }
      } catch (Exception e) {
        throw new RuntimeException("Error initializing DocLengthTable!", e);
      }

      dfCut = job.getInt("Ivory.DfCut", -1);
      mBlockStart = job.getInt("Ivory.BlockStart", -1);
      mBlockEnd = job.getInt("Ivory.BlockEnd", -1);

      if (dfCut <= 0 || mBlockStart < 0 || mBlockEnd <= 0)
        throw new RuntimeException("Invalid config parameter(s): dfCut=" + dfCut
            + ", blockStart=" + mBlockStart + ", blockEnd=" + mBlockEnd);

      try {
        mModel = (ScoringModel) Class.forName(job.get("Ivory.ScoringModel")).newInstance();
      } catch (Exception e) {
        throw new RuntimeException("Mappers failed to initialize!", e);
      }

      // These only need to be set once for the entire collection.
      mModel.setDocCount(mDocLengthTable.getDocCount());
      mModel.setAvgDocLength(mDocLengthTable.getAvgDocLength());
    }

    private final Posting e1 = new Posting();
    private final Posting e2 = new Posting();
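    /*
     * The map() method below emits partial similarity scores: given the
     * postings list of a term t, every pair of documents (di, dj) that both
     * contain t contributes a partial score w_t(di, dj), computed by the
     * ScoringModel from the two term frequencies and the two document
     * lengths. The reducer sums these per-term contributions, so that
     *
     *   sim(di, dj) = sum over all terms t shared by di and dj of w_t(di, dj)
     *
     * Terms with document frequency above dfCut are skipped entirely, which
     * trades a small loss in score mass for a large reduction in the number
     * of document pairs considered.
     */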
    public void map(IntWritable key, PostingsList postings,
        OutputCollector<IntWritable, HMapIFW> output, Reporter reporter) throws IOException {
      sLogger.debug("Collection doc count: " + mCollectionDocCount);
      postings.setCollectionDocumentCount(mCollectionDocCount);

      PostingsReader reader1 = postings.getPostingsReader();

      // Skip terms that are too common (df above the cutoff).
      if (reader1.getNumberOfPostings() > dfCut)
        return;

      // Set per postings list.
      mModel.setDF(reader1.getNumberOfPostings());

      // Perform PCP.
      while (reader1.nextPosting(e1)) {
        // Here's a hidden dependency: how we do the blocking depends on how
        // the postings are sorted. If the postings are sorted in ascending
        // docno, then we can break out of the loop after we've gone past the
        // block bounds (as in the code below). Otherwise (say, if postings
        // are sorted by tf), we have to go through all postings.
        //
        // -- Jimmy, 2008/09/03

        if (e1.getDocno() < mBlockStart)
          continue;
        if (e1.getDocno() >= mBlockEnd)
          break;

        HMapIFW map = new HMapIFW();
        sLogger.debug(key + ": " + e1);

        PostingsReader reader2 = postings.getPostingsReader();
        while (reader2.nextPosting(e2)) {
          sLogger.debug(key + ": " + e1 + ", " + e2);
          if (e1.getDocno() == e2.getDocno())
            continue;

          // Compute the partial similarity score for this pair of documents.
          float weight = mModel.computeScore(e1.getTf(), e2.getTf(),
              mDocLengthTable.getDocLength(e1.getDocno()),
              mDocLengthTable.getDocLength(e2.getDocno()));

          map.put(e2.getDocno(), weight);
        }
        output.collect(new IntWritable(e1.getDocno()), map);
      }
    }
  }

  private static class MyReducer extends MapReduceBase implements
      Reducer<IntWritable, HMapIFW, IntWritable, HMapIFW> {
    private final HMapIFW map = new HMapIFW();
    private final HMapIFW newMap = new HMapIFW();
    private int topN = -1;

    public void configure(JobConf job) {
      topN = job.getInt("Ivory.TopN", -1);
    }

    public void reduce(IntWritable doc, Iterator<HMapIFW> values,
        OutputCollector<IntWritable, HMapIFW> output, Reporter reporter) throws IOException {
      map.clear();
      while (values.hasNext()) {
        map.plus(values.next());
      }

      newMap.clear();
      if (topN > 0) {
        // Keep only the top N most similar documents.
        int i = 0;
        for (MapIF.Entry e : map.getEntriesSortedByValue()) {
          if (i >= topN)
            break;
          newMap.put(e.getKey(), e.getValue());
          i++;
        }
      } else {
        for (MapIF.Entry e : map.getEntriesSortedByValue())
          newMap.put(e.getKey(), e.getValue());
      }

      // Note: the output map is not sorted, but it includes only the top N
      // entries if Ivory.TopN is set.
      output.collect(doc, newMap);
    }
  }
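  /*
   * The driver below splits the computation of the similarity matrix into
   * blocks of rows and launches one MapReduce job per block; each mapper
   * emits scores only for rows whose docnos fall in the half-open interval
   * [Ivory.BlockStart, Ivory.BlockEnd). As a worked example: with
   * numDocs = 1000 and Ivory.BlockSize = 300, four jobs are launched,
   * covering docno ranges [0, 300), [300, 600), [600, 900), and [900, 1000),
   * with per-block output written under block0 ... block3.
   */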
  public PCP(Configuration conf) {
    super(conf);
  }

  public static final String[] RequiredParameters = { "Ivory.IndexPath",
      "Ivory.OutputPath", "Ivory.NumMapTasks", "Ivory.NumReduceTasks",
      "Ivory.ScoringModel", "Ivory.DfCut", "Ivory.BlockSize", "Ivory.TopN" };

  public String[] getRequiredParameters() {
    return RequiredParameters;
  }

  public int runTool() throws Exception {
    String indexPath = getConf().get("Ivory.IndexPath");
    String outputPath = getConf().get("Ivory.OutputPath");

    int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = getConf().getInt("Ivory.NumReduceTasks", 0);

    int dfCut = getConf().getInt("Ivory.DfCut", -1);
    int blockSize = getConf().getInt("Ivory.BlockSize", -1);
    int topN = getConf().getInt("Ivory.TopN", -1);

    FileSystem fs = FileSystem.get(getConf());
    RetrievalEnvironment re = new RetrievalEnvironment(indexPath, fs);

    String collectionName = re.readCollectionName();
    int numDocs = re.readCollectionDocumentCount();
    Path docLengthPath = re.getDoclengthsData();

    String scoringModel = getConf().get("Ivory.ScoringModel");

    sLogger.info("Characteristics of the collection:");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info("Characteristics of the job:");
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + reduceTasks);
    sLogger.info(" - DfCut: " + dfCut);
    sLogger.info(" - BlockSize: " + blockSize);
    sLogger.info(" - ScoringModel: " + scoringModel);
    sLogger.info(" - TopN: " + topN);
    sLogger.info(" - OutputPath: " + outputPath);

    getConf().setInt("Ivory.CollectionDocumentCount", numDocs);

    if (fs.exists(new Path(outputPath))) {
      sLogger.info("PCP output path already exists!");
      return 0;
    }

    int numBlocks = numDocs / blockSize + 1;

    for (int i = 0; i < numBlocks; i++) {
      int start = blockSize * i;
      int end = i == numBlocks - 1 ? numDocs : blockSize * (i + 1);

      JobConf conf = new JobConf(getConf(), PCP.class);
      DistributedCache.addCacheFile(docLengthPath.toUri(), conf);

      sLogger.info("block " + i + ": " + start + "-" + end);

      conf.setInt("Ivory.BlockStart", start);
      conf.setInt("Ivory.BlockEnd", end);

      conf.setJobName("PCP:" + collectionName + "-dfCut=" + dfCut
          + (topN > 0 ? "-topN" + topN : "-all") + ":Block #" + i);

      conf.setNumMapTasks(mapTasks);
      conf.setNumReduceTasks(reduceTasks);

      String currentOutputPath = outputPath + "/block" + i;

      FileInputFormat.setInputPaths(conf, new Path(re.getPostingsDirectory()));
      FileOutputFormat.setOutputPath(conf, new Path(currentOutputPath));

      conf.setInputFormat(SequenceFileInputFormat.class);
      conf.setOutputKeyClass(IntWritable.class);
      conf.setOutputValueClass(HMapIFW.class);
      conf.setOutputFormat(SequenceFileOutputFormat.class);

      conf.setMapperClass(MyMapper.class);
      conf.setCombinerClass(IdentityReducer.class);
      conf.setReducerClass(MyReducer.class);

      JobClient.runJob(conf);
    }

    return 0;
  }
}
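/*
 * A minimal driver sketch (not part of the original tool) showing how PCP
 * might be configured and launched; the paths, task counts, cutoffs, and the
 * scoring model class name below are illustrative assumptions, not prescribed
 * defaults.
 */
class PCPExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("Ivory.IndexPath", "/ivory/index");    // hypothetical index location
    conf.set("Ivory.OutputPath", "/ivory/pcp");     // hypothetical output location
    conf.setInt("Ivory.NumMapTasks", 100);
    conf.setInt("Ivory.NumReduceTasks", 20);
    conf.set("Ivory.ScoringModel", "ivory.pwsim.score.TfIdf"); // assumed model class name
    conf.setInt("Ivory.DfCut", 10000);      // skip terms appearing in more than 10k docs
    conf.setInt("Ivory.BlockSize", 250000); // rows of the similarity matrix per job
    conf.setInt("Ivory.TopN", 100);         // keep the 100 most similar docs per document

    // run() is inherited from PowerTool; it checks that the required
    // parameters are present and then invokes runTool().
    new PCP(conf).run();
  }
}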