/**
 * Software License, Version 1.0
 *
 * Copyright 2003 The Trustees of Indiana University. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1) All redistributions of source code must retain the above copyright notice,
 *    the list of authors in the original source code, this list of conditions and
 *    the disclaimer listed in this license;
 * 2) All redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the disclaimer listed in this license in
 *    the documentation and/or other materials provided with the distribution;
 * 3) Any documentation included with all redistributions must include the
 *    following acknowledgement:
 *
 *    "This product includes software developed by the Community Grids Lab. For
 *    further information contact the Community Grids Lab at
 *    http://communitygrids.iu.edu/."
 *
 *    Alternatively, this acknowledgement may appear in the software itself, and
 *    wherever such third-party acknowledgments normally appear.
 *
 * 4) The name Indiana University or Community Grids Lab or NaradaBrokering
 *    shall not be used to endorse or promote products derived from this software
 *    without prior written permission from Indiana University. For written
 *    permission, please contact the Advanced Research and Technology Institute
 *    ("ARTI") at 351 West 10th Street, Indianapolis, Indiana 46202.
 * 5) Products derived from this software may not be called NaradaBrokering,
 *    nor may Indiana University or Community Grids Lab or NaradaBrokering appear
 *    in their name, without prior written permission of ARTI.
 *
 * Indiana University provides no reassurances that the source code provided
 * does not infringe the patent or any other intellectual property rights of
 * any other entity. Indiana University disclaims any liability to any
 * recipient for claims brought by any other entity based on infringement of
 * intellectual property rights or otherwise.
 *
 * LICENSEE UNDERSTANDS THAT SOFTWARE IS PROVIDED "AS IS" FOR WHICH NO
 * WARRANTIES AS TO CAPABILITIES OR ACCURACY ARE MADE. INDIANA UNIVERSITY GIVES
 * NO WARRANTIES AND MAKES NO REPRESENTATION THAT SOFTWARE IS FREE OF
 * INFRINGEMENT OF THIRD PARTY PATENT, COPYRIGHT, OR OTHER PROPRIETARY RIGHTS.
 * INDIANA UNIVERSITY MAKES NO WARRANTIES THAT SOFTWARE IS FREE FROM "BUGS",
 * "VIRUSES", "TROJAN HORSES", "TRAP DOORS", "WORMS", OR OTHER HARMFUL CODE.
 * LICENSEE ASSUMES THE ENTIRE RISK AS TO THE PERFORMANCE OF SOFTWARE AND/OR
 * ASSOCIATED MATERIALS, AND TO THE PERFORMANCE AND VALIDITY OF INFORMATION
 * GENERATED USING SOFTWARE.
 */
package edu.indiana.soic.ts.mapreduce.pwd;

import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

import edu.indiana.soic.ts.utils.TSConfiguration;
import edu.indiana.soic.ts.utils.Utils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class PairWiseDistance {
    private static final Logger LOG = LoggerFactory.getLogger(PairWiseDistance.class);

    private int blockSize;
    private String distFunc;
    private String interDistDir;
    private String distDir;
    private String vectDir;

    public static void main(String[] args) throws Exception {
        PairWiseDistance pwd = new PairWiseDistance();
        pwd.configure(args);
        pwd.submitJob();
    }

    public int execJob(Configuration conf, String sequenceFileFullPath,
                       String sequenceFile, String distDir) throws Exception {
        /* input parameters */
        LOG.info(sequenceFileFullPath);
        Job job = new Job(conf, "Pairwise-calc-" + sequenceFile);

        /* create the base dir for this job; delete and recreate it if it exists */
        Path hdMainDir = new Path(distDir + "/" + sequenceFile);
        FileSystem fs = FileSystem.get(conf);
        fs.delete(hdMainDir, true);

        Path hdInputDir = new Path(hdMainDir, "data");
        if (!fs.mkdirs(hdInputDir)) {
            throw new IOException("Mkdirs failed to create " + hdInputDir.toString());
        }
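
        // The input vectors are grouped into ceil(noOfSequences / blockSize) row/column
        // divisions. Because the pairwise distance matrix is symmetric, only
        // noOfDivisions * (noOfDivisions + 1) / 2 blocks need to be computed: one block
        // from each mirror pair plus the diagonal (see distributeData() for the selection).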
        int noOfSequences = getNoOfSequences(sequenceFileFullPath, fs);
        int noOfDivisions = (int) Math.ceil(noOfSequences / (double) blockSize);
        int noOfBlocks = (noOfDivisions * (noOfDivisions + 1)) / 2;
        LOG.info("No of divisions :" + noOfDivisions + "\nNo of blocks :" + noOfBlocks
                + "\nBlock size :" + blockSize);

        // Retrieve the configuration from the job to set the properties.
        // Setting properties on the original conf does not work (possible Hadoop bug).
        Configuration jobConf = job.getConfiguration();

        // Input dir in HDFS. Create this inside the newly created job base dir.
        Path inputDir = new Path(hdMainDir, "input");
        if (!fs.mkdirs(inputDir)) {
            throw new IOException("Mkdirs failed to create " + inputDir.toString());
        }

        long dataPartitionStartTime = System.nanoTime();
        partitionData(sequenceFileFullPath, noOfSequences, blockSize, fs,
                noOfDivisions, jobConf, inputDir);
        distributeData(blockSize, conf, fs, hdInputDir, noOfDivisions);
        long dataPartTime = (System.nanoTime() - dataPartitionStartTime) / 1000000;
        LOG.info("Data Partition & Scatter Completed in (ms):" + dataPartTime);

        // Output dir in HDFS
        Path hdOutDir = new Path(hdMainDir, "out");

        jobConf.setInt(Constants.BLOCK_SIZE, blockSize);
        jobConf.setInt(Constants.NO_OF_DIVISIONS, noOfDivisions);
        jobConf.setInt(Constants.NO_OF_SEQUENCES, noOfSequences);
        jobConf.set(Constants.DIST_FUNC, distFunc);

        job.setJarByClass(PairWiseDistance.class);
        job.setMapperClass(SWGMap.class);
        job.setReducerClass(SWGReduce.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(SWGWritable.class);
        FileInputFormat.setInputPaths(job, hdInputDir);
        FileOutputFormat.setOutputPath(job, hdOutDir);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setNumReduceTasks(noOfDivisions);

        long startTime = System.currentTimeMillis();
        int exitStatus = job.waitForCompletion(true) ? 0 : 1;
        double executionTime = (System.currentTimeMillis() - startTime) / 1000.0;
        LOG.info("Job Finished in " + executionTime + " seconds");
        LOG.info("# #seq\t#blockS\tTtime\tinput\tdataDistTime\toutput\n"
                + noOfSequences + "\t" + noOfBlocks + "\t" + executionTime + "\t"
                + sequenceFileFullPath + "\t" + dataPartTime + "\t" + hdMainDir);
        return exitStatus;
    }

    public void configure(String[] args) {
        String configFile = Utils.getConfigurationFile(args);
        TSConfiguration tsConfiguration = new TSConfiguration(configFile);
        Map tsConf = tsConfiguration.getConf();
        this.blockSize = (int) tsConf.get(TSConfiguration.MATRIX_BLOCK_SIZE);
        this.distFunc = (String) tsConf.get(TSConfiguration.DISTANCE_FUNCTION);
        this.interDistDir = tsConfiguration.getInterMediateDistanceDir();
        this.distDir = tsConfiguration.getDistDir();
        this.vectDir = tsConfiguration.getVectorDir();
    }

    public void submitJob() throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] status = fs.listStatus(new Path(vectDir));
        for (int i = 0; i < status.length; i++) {
            String sequenceFile = status[i].getPath().getName();
            String sequenceFileFullPath = vectDir + "/" + sequenceFile;
            try {
                execJob(conf, sequenceFileFullPath, sequenceFile, interDistDir);
                concatOutput(conf, sequenceFile, interDistDir, distDir);
            } catch (Exception e) {
                String message = "Failed to execute PWD calculation:"
                        + sequenceFileFullPath + " " + interDistDir;
                LOG.error(message, e);
                throw new RuntimeException(message, e);
            }
        }
    }

    private class OutFile implements Comparable<OutFile> {
        int no;
        String file;

        public OutFile(int no, String file) {
            this.no = no;
            this.file = file;
        }

        @Override
        public int compareTo(OutFile o) {
            return o.no - this.no;
        }
    }
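
    /**
     * Concatenates the partial distance files written by the reducers (files named
     * row_[blockNumber]_*) under the intermediate output directory into a single
     * distance file at distDir/sequenceFile.
     */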
    public void concatOutput(Configuration conf, String sequenceFile,
                             String distDirIntermediate, String distDir) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        Path outDir = new Path(distDirIntermediate + "/" + sequenceFile + "/out");
        FileStatus[] status = fs.listStatus(outDir);
        List<OutFile> outFiles = new ArrayList<OutFile>();
        for (int i = 0; i < status.length; i++) {
            String name = status[i].getPath().getName();
            String[] split = name.split("_");
            if (split.length > 2 && split[0].equals("row")) {
                OutFile o = new OutFile(Integer.parseInt(split[1]), name);
                outFiles.add(o);
            }
        }
        Collections.sort(outFiles);

        String destFile = distDir + "/" + sequenceFile;
        Path outFile = new Path(destFile);
        FSDataOutputStream outputStream = fs.create(outFile);
        for (OutFile o : outFiles) {
            Path inFile = new Path(outDir, o.file);
            FSDataInputStream inputStream = fs.open(inFile);
            IOUtils.copy(inputStream, outputStream);
            inputStream.close();
        }
        outputStream.flush();
        outputStream.close();
    }
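
    /**
     * Writes the metadata for each selected block to a separate SequenceFile so that
     * Hadoop creates one map task per block. Only one block of each symmetric
     * (row, column) / (column, row) pair is written, alternating between the upper and
     * lower triangle by the parity of row + column so the selected blocks are spread
     * roughly evenly across rows; diagonal blocks are always included.
     */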
    private void distributeData(int blockSize, Configuration conf, FileSystem fs,
                                Path hdInputDir, int noOfDivisions) throws IOException {
        // Writing block metadata for each block to a separate file so that
        // Hadoop will create separate map tasks for each block.
        // Key   : block number
        // Value : row#column#isDiagonal#base_file_name
        // TODO : find a better way to do this.
        for (int row = 0; row < noOfDivisions; row++) {
            for (int column = 0; column < noOfDivisions; column++) {
                // Use the load balancing algorithm to select the blocks.
                // Include the diagonal blocks as they are blocks, not individual pairs.
                if (((row >= column) & ((row + column) % 2 == 0))
                        | ((row <= column) & ((row + column) % 2 == 1))) {
                    Path vFile = new Path(hdInputDir, "data_file_" + row + "_" + column);
                    SequenceFile.Writer vWriter = SequenceFile.createWriter(fs, conf, vFile,
                            LongWritable.class, Text.class, CompressionType.NONE);
                    boolean isDiagonal = (row == column);
                    String value = row + Constants.BREAK + column + Constants.BREAK
                            + isDiagonal + Constants.BREAK + Constants.HDFS_SEQ_FILENAME;
                    vWriter.append(new LongWritable(row * blockSize + column), new Text(value));
                    vWriter.close();
                }
            }
        }
    }

    private int getNoOfSequences(String sequenceFile, FileSystem fs)
            throws FileNotFoundException, IOException, URISyntaxException {
        // Counts the number of lines (one vector per line) in the input file.
        Path path = new Path(sequenceFile);
        int count = 0;
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fs.open(path)));
        while (bufferedReader.readLine() != null) {
            count++;
        }
        bufferedReader.close();
        return count;
    }

    private void partitionData(String sequenceFile, int noOfSequences, int blockSize,
                               FileSystem fs, int noOfDivisions, Configuration jobConf,
                               Path inputDir)
            throws FileNotFoundException, IOException, URISyntaxException {
        // Break the sequence file into parts based on the block size. Store the
        // parts in HDFS and add them to the Hadoop distributed cache.
        Path path = new Path(sequenceFile);
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fs.open(path)));
        LOG.info("noOfDivisions : " + noOfDivisions);
        LOG.info("blockSize : " + blockSize);
        for (int partNo = 0; partNo < noOfDivisions; partNo++) {
            String filePartName = Constants.HDFS_SEQ_FILENAME + "_" + partNo;
            Path inputFilePart = new Path(inputDir, filePartName);
            OutputStream partOutStream = fs.create(inputFilePart);
            BufferedWriter bufferedWriter = new BufferedWriter(
                    new OutputStreamWriter(partOutStream));
            for (int sequenceIndex = 0;
                 (sequenceIndex < blockSize)
                         && (sequenceIndex + (partNo * blockSize) < noOfSequences);
                 sequenceIndex++) {
                String line = bufferedReader.readLine();
                if (line == null) {
                    throw new IOException("Cannot read the sequence from input file.");
                }
                // write the sequence name
                bufferedWriter.write(line);
                bufferedWriter.newLine();
            }
            bufferedWriter.flush();
            bufferedWriter.close();
            // Add the part file to the Hadoop distributed cache
            URI cFileURI = new URI(inputFilePart.toUri() + "#" + filePartName);
            DistributedCache.addCacheFile(cFileURI, jobConf);
            DistributedCache.createSymlink(jobConf);
        }
        bufferedReader.close();
    }
}