/**
 * Software License, Version 1.0
 *
 * Copyright 2003 The Trustees of Indiana University. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1) All redistributions of source code must retain the above copyright notice,
 *    the list of authors in the original source code, this list of conditions and
 *    the disclaimer listed in this license;
 * 2) All redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the disclaimer listed in this license in
 *    the documentation and/or other materials provided with the distribution;
 * 3) Any documentation included with all redistributions must include the
 *    following acknowledgement:
 *
 *    "This product includes software developed by the Community Grids Lab. For
 *    further information contact the Community Grids Lab at
 *    http://communitygrids.iu.edu/."
 *
 *    Alternatively, this acknowledgement may appear in the software itself, and
 *    wherever such third-party acknowledgments normally appear.
 *
 * 4) The name Indiana University or Community Grids Lab or NaradaBrokering
 *    shall not be used to endorse or promote products derived from this software
 *    without prior written permission from Indiana University. For written
 *    permission, please contact the Advanced Research and Technology Institute
 *    ("ARTI") at 351 West 10th Street, Indianapolis, Indiana 46202.
 * 5) Products derived from this software may not be called NaradaBrokering,
 *    nor may Indiana University or Community Grids Lab or NaradaBrokering appear
 *    in their name, without prior written permission of ARTI.
 *
 * Indiana University provides no reassurances that the source code provided
 * does not infringe the patent or any other intellectual property rights of
 * any other entity. Indiana University disclaims any liability to any
 * recipient for claims brought by any other entity based on infringement of
 * intellectual property rights or otherwise.
 *
 * LICENSEE UNDERSTANDS THAT SOFTWARE IS PROVIDED "AS IS" FOR WHICH NO
 * WARRANTIES AS TO CAPABILITIES OR ACCURACY ARE MADE. INDIANA UNIVERSITY GIVES
 * NO WARRANTIES AND MAKES NO REPRESENTATION THAT SOFTWARE IS FREE OF
 * INFRINGEMENT OF THIRD PARTY PATENT, COPYRIGHT, OR OTHER PROPRIETARY RIGHTS.
 * INDIANA UNIVERSITY MAKES NO WARRANTIES THAT SOFTWARE IS FREE FROM "BUGS",
 * "VIRUSES", "TROJAN HORSES", "TRAP DOORS", "WORMS", OR OTHER HARMFUL CODE.
 * LICENSEE ASSUMES THE ENTIRE RISK AS TO THE PERFORMANCE OF SOFTWARE AND/OR
 * ASSOCIATED MATERIALS, AND TO THE PERFORMANCE AND VALIDITY OF INFORMATION
 * GENERATED USING SOFTWARE.
 */
package edu.indiana.soic.ts.mapreduce.pwd;

import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

import edu.indiana.soic.ts.utils.TSConfiguration;
import edu.indiana.soic.ts.utils.Utils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class PairWiseDistance {
    private static final Logger LOG = LoggerFactory.getLogger(PairWiseDistance.class);

    private int blockSize;
    private String distFunc;
    private String interDistDir;
    private String distDir;
    private String vectDir;

    public static void main(String[] args) throws Exception {
        PairWiseDistance pwd = new PairWiseDistance();
        pwd.configure(args);
        pwd.submitJob();
    }

    public int execJob(Configuration conf, String sequenceFileFullPath,
                       String sequenceFile, String distDir) throws Exception {
        /* input parameters */
        LOG.info(sequenceFileFullPath);
        Job job = new Job(conf, "Pairwise-calc-" + sequenceFile);

        /* create the base dir for this job; delete and recreate it if it exists */
        Path hdMainDir = new Path(distDir + "/" + sequenceFile);
        FileSystem fs = FileSystem.get(conf);
        fs.delete(hdMainDir, true);

        Path hdInputDir = new Path(hdMainDir, "data");
        if (!fs.mkdirs(hdInputDir)) {
            throw new IOException("Mkdirs failed to create " + hdInputDir.toString());
        }
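
        // The input vectors are grouped into ceil(noOfSequences / blockSize) row/column
        // divisions. Because the pairwise distance matrix is symmetric, only
        // noOfDivisions * (noOfDivisions + 1) / 2 blocks need to be computed: one block
        // from each mirror pair plus the diagonal (see distributeData() for the selection).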
        int noOfSequences = getNoOfSequences(sequenceFileFullPath, fs);
        int noOfDivisions = (int) Math.ceil(noOfSequences / (double) blockSize);
        int noOfBlocks = (noOfDivisions * (noOfDivisions + 1)) / 2;
        LOG.info("No of divisions :" + noOfDivisions + "\nNo of blocks :" + noOfBlocks
                + "\nBlock size :" + blockSize);

        // Retrieve the configuration from the job to set the properties.
        // Setting properties on the original conf does not work (possible Hadoop bug).
        Configuration jobConf = job.getConfiguration();

        // Input dir in HDFS. Create this inside the newly created job base dir.
        Path inputDir = new Path(hdMainDir, "input");
        if (!fs.mkdirs(inputDir)) {
            throw new IOException("Mkdirs failed to create " + inputDir.toString());
        }

        long dataPartitionStartTime = System.nanoTime();
        partitionData(sequenceFileFullPath, noOfSequences, blockSize, fs,
                noOfDivisions, jobConf, inputDir);
        distributeData(blockSize, conf, fs, hdInputDir, noOfDivisions);
        long dataPartTime = (System.nanoTime() - dataPartitionStartTime) / 1000000;
        LOG.info("Data Partition & Scatter Completed in (ms):" + dataPartTime);

        // Output dir in HDFS
        Path hdOutDir = new Path(hdMainDir, "out");

        jobConf.setInt(Constants.BLOCK_SIZE, blockSize);
        jobConf.setInt(Constants.NO_OF_DIVISIONS, noOfDivisions);
        jobConf.setInt(Constants.NO_OF_SEQUENCES, noOfSequences);
        jobConf.set(Constants.DIST_FUNC, distFunc);

        job.setJarByClass(PairWiseDistance.class);
        job.setMapperClass(SWGMap.class);
        job.setReducerClass(SWGReduce.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(SWGWritable.class);
        FileInputFormat.setInputPaths(job, hdInputDir);
        FileOutputFormat.setOutputPath(job, hdOutDir);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setNumReduceTasks(noOfDivisions);

        long startTime = System.currentTimeMillis();
        int exitStatus = job.waitForCompletion(true) ? 0 : 1;
        double executionTime = (System.currentTimeMillis() - startTime) / 1000.0;
        LOG.info("Job Finished in " + executionTime + " seconds");
        LOG.info("# #seq\t#blockS\tTtime\tinput\tdataDistTime\toutput\n"
                + noOfSequences + "\t" + noOfBlocks + "\t" + executionTime + "\t"
                + sequenceFileFullPath + "\t" + dataPartTime + "\t" + hdMainDir);
        return exitStatus;
    }

    public void configure(String[] args) {
        String configFile = Utils.getConfigurationFile(args);
        TSConfiguration tsConfiguration = new TSConfiguration(configFile);
        Map tsConf = tsConfiguration.getConf();
        this.blockSize = (int) tsConf.get(TSConfiguration.MATRIX_BLOCK_SIZE);
        this.distFunc = (String) tsConf.get(TSConfiguration.DISTANCE_FUNCTION);
        this.interDistDir = tsConfiguration.getInterMediateDistanceDir();
        this.distDir = tsConfiguration.getDistDir();
        this.vectDir = tsConfiguration.getVectorDir();
    }

    public void submitJob() throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] status = fs.listStatus(new Path(vectDir));
        for (int i = 0; i < status.length; i++) {
            String sequenceFile = status[i].getPath().getName();
            String sequenceFileFullPath = vectDir + "/" + sequenceFile;
            try {
                execJob(conf, sequenceFileFullPath, sequenceFile, interDistDir);
                concatOutput(conf, sequenceFile, interDistDir, distDir);
            } catch (Exception e) {
                String message = "Failed to execute PWD calculation:"
                        + sequenceFileFullPath + " " + interDistDir;
                LOG.error(message, e);
                throw new RuntimeException(message, e);
            }
        }
    }

    private class OutFile implements Comparable<OutFile> {
        int no;
        String file;

        public OutFile(int no, String file) {
            this.no = no;
            this.file = file;
        }

        @Override
        public int compareTo(OutFile o) {
            return o.no - this.no;
        }
    }
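
    /**
     * Concatenates the partial distance files written by the reducers (files named
     * row_[blockNumber]_*) under the intermediate output directory into a single
     * distance file at distDir/sequenceFile.
     */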
    public void concatOutput(Configuration conf, String sequenceFile,
                             String distDirIntermediate, String distDir) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        Path outDir = new Path(distDirIntermediate + "/" + sequenceFile + "/out");
        FileStatus[] status = fs.listStatus(outDir);
        List<OutFile> outFiles = new ArrayList<OutFile>();
        for (int i = 0; i < status.length; i++) {
            String name = status[i].getPath().getName();
            String[] split = name.split("_");
            if (split.length > 2 && split[0].equals("row")) {
                OutFile o = new OutFile(Integer.parseInt(split[1]), name);
                outFiles.add(o);
            }
        }
        Collections.sort(outFiles);

        String destFile = distDir + "/" + sequenceFile;
        Path outFile = new Path(destFile);
        FSDataOutputStream outputStream = fs.create(outFile);
        for (OutFile o : outFiles) {
            Path inFile = new Path(outDir, o.file);
            FSDataInputStream inputStream = fs.open(inFile);
            IOUtils.copy(inputStream, outputStream);
            inputStream.close();
        }
        outputStream.flush();
        outputStream.close();
    }
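
    /**
     * Writes the metadata for each selected block to a separate SequenceFile so that
     * Hadoop creates one map task per block. Only one block of each symmetric
     * (row, column) / (column, row) pair is written, alternating between the upper and
     * lower triangle by the parity of row + column so the selected blocks are spread
     * roughly evenly across rows; diagonal blocks are always included.
     */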
    private void distributeData(int blockSize, Configuration conf, FileSystem fs,
                                Path hdInputDir, int noOfDivisions) throws IOException {
        // Writing block metadata for each block to a separate file so that
        // Hadoop will create separate map tasks for each block.
        // Key   : block number
        // Value : row#column#isDiagonal#base_file_name
        // TODO : find a better way to do this.
        for (int row = 0; row < noOfDivisions; row++) {
            for (int column = 0; column < noOfDivisions; column++) {
                // Use the load balancing algorithm to select the blocks.
                // Include the diagonal blocks as they are blocks, not individual pairs.
                if (((row >= column) & ((row + column) % 2 == 0))
                        | ((row <= column) & ((row + column) % 2 == 1))) {
                    Path vFile = new Path(hdInputDir, "data_file_" + row + "_" + column);
                    SequenceFile.Writer vWriter = SequenceFile.createWriter(fs, conf, vFile,
                            LongWritable.class, Text.class, CompressionType.NONE);
                    boolean isDiagonal = (row == column);
                    String value = row + Constants.BREAK + column + Constants.BREAK
                            + isDiagonal + Constants.BREAK + Constants.HDFS_SEQ_FILENAME;
                    vWriter.append(new LongWritable(row * blockSize + column), new Text(value));
                    vWriter.close();
                }
            }
        }
    }

    private int getNoOfSequences(String sequenceFile, FileSystem fs)
            throws FileNotFoundException, IOException, URISyntaxException {
        // Counts the number of lines (one vector per line) in the input file.
        Path path = new Path(sequenceFile);
        int count = 0;
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fs.open(path)));
        while (bufferedReader.readLine() != null) {
            count++;
        }
        bufferedReader.close();
        return count;
    }

    private void partitionData(String sequenceFile, int noOfSequences, int blockSize,
                               FileSystem fs, int noOfDivisions, Configuration jobConf,
                               Path inputDir)
            throws FileNotFoundException, IOException, URISyntaxException {
        // Break the sequence file into parts based on the block size. Store the
        // parts in HDFS and add them to the Hadoop distributed cache.
        Path path = new Path(sequenceFile);
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fs.open(path)));
        LOG.info("noOfDivisions : " + noOfDivisions);
        LOG.info("blockSize : " + blockSize);
        for (int partNo = 0; partNo < noOfDivisions; partNo++) {
            String filePartName = Constants.HDFS_SEQ_FILENAME + "_" + partNo;
            Path inputFilePart = new Path(inputDir, filePartName);
            OutputStream partOutStream = fs.create(inputFilePart);
            BufferedWriter bufferedWriter = new BufferedWriter(
                    new OutputStreamWriter(partOutStream));
            for (int sequenceIndex = 0;
                 (sequenceIndex < blockSize)
                         && (sequenceIndex + (partNo * blockSize) < noOfSequences);
                 sequenceIndex++) {
                String line = bufferedReader.readLine();
                if (line == null) {
                    throw new IOException("Cannot read the sequence from input file.");
                }
                // write the sequence name
                bufferedWriter.write(line);
                bufferedWriter.newLine();
            }
            bufferedWriter.flush();
            bufferedWriter.close();
            // Add the part file to the Hadoop distributed cache
            URI cFileURI = new URI(inputFilePart.toUri() + "#" + filePartName);
            DistributedCache.addCacheFile(cFileURI, jobConf);
            DistributedCache.createSymlink(jobConf);
        }
        bufferedReader.close();
    }
}