TsvSort.java example

Explorer

seal-master
- src
  - it
    - crs4
      - seal
        common
        AbstractSamMapping.java
        AbstractTaggedMapping.java
        AlignFlags.java
        AlignOp.java
        BamInputFormat.java
        BwaRefAnnotation.java
        ClusterUtils.java
        ConfigFileParser.java
        ContextAdapter.java
        CutString.java
        CutText.java
        FormatException.java
        FormatNameMap.java
        GroupByLocationComparator.java
        IMRContext.java
        LineReader.java
        MdOp.java
        OutputStreamFactory.java
        ReadPair.java
        SamInputFormat.java
        SealToolParser.java
        SealToolRunner.java
        SequenceId.java
        SequenceIdLocationPartitioner.java
        TestContext.java
        TextSamMapping.java
        UnknownItemException.java
        Utils.java
        WritableMapping.java
        demux
        BarcodeLookup.java
        Demux.java
        DemuxMapper.java
        DemuxOptionParser.java
        DemuxOutputFormat.java
        DemuxReducer.java
        DemuxUtils.java
        SampleSheet.java
        TwoOneThreeSortComparator.java
        prq
        PairReadsQSeq.java
        PairReadsQSeqMapper.java
        PairReadsQSeqReducer.java
        PrqOptionParser.java
        PrqOutputFormat.java
        read_sort
        FastaChecksummer.java
        MergeAlignments.java
        ReadSort.java
        ReadSortOptionParser.java
        recab
        ArrayListVariantTable.java
        ArrayVariantTable.java
        BytePacking.java
        Covariate.java
        CycleCovariate.java
        DinucCovariate.java
        HashSetVariantTable.java
        ObservationCount.java
        PackedSequence.java
        QualityCovariate.java
        ReadGroupCovariate.java
        RecabTable.java
        RecabTableCombiner.java
        RecabTableMapper.java
        RecabTableOptionParser.java
        RecabTableReducer.java
        RodFileVariantReader.java
        VariantReader.java
        VariantRegion.java
        VariantTable.java
        VcfVariantReader.java
        tsv_sort
        TextSampler.java
        TextValueOutputFormat.java
        TsvInputFormat.java
        TsvSort.java
        TsvSortOptionParser.java
        usort
        USort.java
        USortOptionParser.java
- tests
  - it
    - crs4
      - seal
        common
        TestAbstractSamMapping.java
        TestAlignFlags.java
        TestAlignOp.java
        TestBamInputFormat.java
        TestBwaRefAnnotation.java
        TestConfigFileParser.java
        TestCutString.java
        TestCutText.java
        TestGroupByLocationComparator.java
        TestLineReader.java
        TestMdOp.java
        TestSamInputFormat.java
        TestSealToolParser.java
        TestSequenceId.java
        TestSequenceIdLocationPartitioner.java
        TestTextSamMapping.java
        TestUtils.java
        TestWritableMapping.java
        demux
        TestBarcodeLookup.java
        TestDemuxMapper.java
        TestDemuxReducer.java
        TestSampleSheet.java
        TestTwoOneThreeSortComparator.java
        prq
        TestPairReadsQseqMapper.java
        TestPairReadsQseqReducer.java
        TestPrqOutputFormat.java
        read_sort
        TestFastaChecksummer.java
        recab
        BenchmarkVariantTable.java
        TestArrayListVariantTable.java
        TestArrayVariantTable.java
        TestBytePacking.java
        TestCycleCovariate.java
        TestDinucCovariate.java
        TestHashSetVariantTable.java
        TestObservationCount.java
        TestPackedSequence.java
        TestQualityCovariate.java
        TestReadGroupCovariate.java
        TestRecabTableCombiner.java
        TestRecabTableMapper.java
        TestRecabTableReducer.java
        TestRodFileVariantReader.java
        TestVcfVariantReader.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// based on TeraSort from the Hadoop examples

package it.crs4.seal.tsv_sort;

import java.io.IOException;
import java.io.PrintStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Generates the sampled split points, launches the job, and waits for it to
 * finish.
 *
 * The driver writes a partition file next to the output directory, ships it to
 * the tasks through the distributed cache under the symlink name
 * {@link #PARTITION_SYMLINK}, and configures {@link TotalOrderPartitioner} so
 * that keys are routed to reducers according to the split points it contains.
 * The partition file is removed when the job ends, whether or not it succeeded.
 */
public class TsvSort extends Configured implements Tool {
	private static final Log LOG = LogFactory.getLog(TsvSort.class);

	/** Name of the symlink through which tasks read the partition file. */
	static final String PARTITION_SYMLINK = "_partition.lst";

	/**
	 * A partitioner that splits text keys into roughly equal partitions
	 * in a global sorted order.
	 *
	 * The split points are loaded in {@link #setConf(Configuration)} and indexed
	 * with a two-level trie keyed on the leading bytes of each key.
	 */
	static class TotalOrderPartitioner extends Partitioner<Text,Text> implements Configurable {
		private TrieNode trie;
		private Text[] splitPoints;
		private Configuration conf;

		/**
		 * A generic trie node.  The level is the number of key bytes that have
		 * already been consumed on the way to this node.
		 */
		static abstract class TrieNode {
			private final int level;
			TrieNode(int level) {
				this.level = level;
			}
			/** Returns the index of the partition the key belongs to. */
			abstract int findPartition(Text key);
			/** Writes a human-readable dump of this subtree to the stream. */
			abstract void print(PrintStream strm) throws IOException;
			int getLevel() {
				return level;
			}
		}

		/**
		 * An inner trie node that contains 256 children based on the next
		 * character.
		 */
		static class InnerTrieNode extends TrieNode {
			private final TrieNode[] child = new TrieNode[256];

			InnerTrieNode(int level) {
				super(level);
			}

			/**
			 * Descends into the child selected by the key byte at this node's
			 * level.  Keys too short to have such a byte go to child 0.
			 */
			@Override
			int findPartition(Text key) {
				int level = getLevel();
				if (key.getLength() <= level) {
					return child[0].findPartition(key);
				}
				// Java bytes are signed: mask to 0..255 so that bytes >= 0x80
				// do not turn into a negative array index.
				return child[key.getBytes()[level] & 0xff].findPartition(key);
			}

			void setChild(int idx, TrieNode child) {
				this.child[idx] = child;
			}

			@Override
			void print(PrintStream strm) throws IOException {
				// cover every slot, including the last one (255)
				for(int ch = 0; ch < child.length; ++ch) {
					for(int i = 0; i < 2*getLevel(); ++i) {
						strm.print(' ');
					}
					strm.print(ch);
					strm.println(" ->");
					if (child[ch] != null) {
						child[ch].print(strm);
					}
				}
			}
		}

		/**
		 * A leaf trie node that does string compares to figure out where the given
		 * key belongs between lower..upper.
		 */
		static class LeafTrieNode extends TrieNode {
			int lower;
			int upper;
			Text[] splitPoints;
			LeafTrieNode(int level, Text[] splitPoints, int lower, int upper) {
				super(level);
				this.splitPoints = splitPoints;
				this.lower = lower;
				this.upper = upper;
			}

			/**
			 * Returns the index of the first split point in [lower, upper) that
			 * is greater than or equal to the key, or upper if there is none.
			 */
			@Override
			int findPartition(Text key) {
				for(int i=lower; i<upper; ++i) {
					if (splitPoints[i].compareTo(key) >= 0) {
						return i;
					}
				}
				return upper;
			}

			@Override
			void print(PrintStream strm) throws IOException {
				for(int i = 0; i < 2*getLevel(); ++i) {
					strm.print(' ');
				}
				strm.print(lower);
				strm.print(", ");
				strm.println(upper);
			}
		}


		/**
		 * Read the cut points from the given sequence file.
		 * @param fs the file system
		 * @param p the path to read
		 * @param conf the config
		 * @return the strings to split the partitions on
		 * @throws IOException if the file cannot be opened or read
		 */
		private static Text[] readPartitions(FileSystem fs, Path p, Configuration conf) throws IOException {
			SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, conf);
			try {
				List<Text> parts = new ArrayList<Text>();
				Text key = new Text();
				NullWritable value = NullWritable.get();
				while (reader.next(key, value)) {
					parts.add(key);
					// the reader fills the object it is given, so each stored
					// key needs its own instance
					key = new Text();
				}
				return parts.toArray(new Text[parts.size()]);
			} finally {
				// release the reader even when next() fails part-way through
				reader.close();
			}
		}

		/**
		 * Given a sorted set of cut points, build a trie that will find the correct
		 * partition quickly.
		 * @param splits the list of cut points
		 * @param lower the lower bound of partitions 0..numPartitions-1
		 * @param upper the upper bound of partitions 0..numPartitions-1
		 * @param prefix the prefix that we have already checked against
		 * @param maxDepth the maximum depth we will build a trie for
		 * @return the trie node that will divide the splits correctly
		 */
		private static TrieNode buildTrie(Text[] splits, int lower, int upper,
		                                  Text prefix, int maxDepth) {
			int depth = prefix.getLength();
			if (depth >= maxDepth || lower == upper) {
				return new LeafTrieNode(depth, splits, lower, upper);
			}
			InnerTrieNode result = new InnerTrieNode(depth);
			Text trial = new Text(prefix);
			// append an extra byte on to the prefix
			trial.append(new byte[1], 0, 1);
			int currentBound = lower;
			for(int ch = 0; ch < 255; ++ch) {
				// probe with the next byte value to find where the split points
				// belonging to child ch end
				trial.getBytes()[depth] = (byte) (ch + 1);
				lower = currentBound;
				while (currentBound < upper) {
					if (splits[currentBound].compareTo(trial) >= 0) {
						break;
					}
					currentBound += 1;
				}
				// restore the byte so the recursion sees the prefix of child ch
				trial.getBytes()[depth] = (byte) ch;
				result.child[ch] = buildTrie(splits, lower, currentBound, trial, maxDepth);
			}
			// pick up the rest: the last child is the one for byte value 255,
			// so the prefix handed to the recursion must end in that byte
			trial.getBytes()[depth] = (byte) 255;
			result.child[255] = buildTrie(splits, currentBound, upper, trial, maxDepth);
			return result;
		}

		public Configuration getConf() {
			return conf;
		}

		/**
		 * Stores the configuration and loads the split points from the file
		 * named {@link TsvSort#PARTITION_SYMLINK} on the local file system,
		 * then builds the lookup trie (two levels deep) over them.
		 * @throws IllegalArgumentException if the partition file cannot be read
		 */
		public void setConf(Configuration conf) {
			this.conf = conf;
			try {
				FileSystem fs = FileSystem.getLocal(conf);
				Path partFile = new Path(TsvSort.PARTITION_SYMLINK);
				splitPoints = readPartitions(fs, partFile, conf);
				trie = buildTrie(splitPoints, 0, splitPoints.length, new Text(), 2);
			} catch (IOException ie) {
				throw new IllegalArgumentException("can't read partitions file", ie);
			}
		}

		public TotalOrderPartitioner() {
		}

		/**
		 * Looks the key up in the trie.  The result depends only on the loaded
		 * split points; numPartitions is not consulted.
		 */
		@Override
		public int getPartition(Text key, Text value, int numPartitions) {
			return trie.findPartition(key);
		}
	}

	/**
	 * Parses the command line, writes the partition file, runs the sort job and
	 * removes the partition file afterwards.
	 * @param args command line arguments, interpreted by TsvSortOptionParser
	 * @return 0 if the job completed successfully, 1 otherwise
	 */
	public int run(String[] args) throws Exception {
		LOG.info("starting");

		TsvSortOptionParser parser = new TsvSortOptionParser();
		parser.parse(getConf(), args);

		LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

		Job job = new Job(getConf());

		job.setJobName("TsvSort " + parser.getInputPaths().get(0));
		job.setJarByClass(TsvSort.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		job.setInputFormatClass(TsvInputFormat.class);
		job.setOutputFormatClass(TextValueOutputFormat.class);
		job.setPartitionerClass(TotalOrderPartitioner.class);

		// output path
		FileOutputFormat.setOutputPath(job, parser.getOutputPath());

		FileSystem fs = parser.getOutputPath().getFileSystem(job.getConfiguration());
		/*
		 * Pick a random name for the partition file in the same directory as the
		 * output path.  So, TsvSort /user/me/input /user/me/output
		 * results in the partition file being placed in /user/me/_partition.lst.12340921387402174
		 *
		 * Why not place it directly in the input path?
		 *
		 *   We wouldn't be able to run two sorts on the same data at the same time.
		 *   We've received complaints about this in the past, so it has been a
		 *   limit in practice.
		 *
		 * Why not place it directly in the output path?
		 *
		 *   We'd have to create the output path before the output format did.
		 *   For this to work we'd have to disable the FileOutputFormat's default check
		 *   that verifies that the output directory doesn't exist.  This means that we'd
		 *   need some other way to ensure that we're not writing to the same path where
		 *   some other job wrote.
		 */
		Path partitionFile;
		Random rnd = new Random();
		do {
			// Clear the sign bit rather than calling Math.abs, which returns a
			// negative number for Long.MIN_VALUE.
			long suffix = rnd.nextLong() & Long.MAX_VALUE;
			partitionFile = new Path(parser.getOutputPath().getParent(), String.format("_partition.lst.%012d", suffix));
		} while (fs.exists(partitionFile)); // this is still subject to a race condition between it and another instance of this program
		partitionFile = partitionFile.makeQualified(fs);
		LOG.info("partition file path: " + partitionFile);

		// the URI fragment is the symlink name under which tasks see the file
		URI partitionUri = new URI(partitionFile.toString() + "#" + PARTITION_SYMLINK);
		LOG.debug("partitionUri for distributed cache: " + partitionUri);

		// input paths
		for (Path p: parser.getInputPaths())
			TsvInputFormat.addInputPath(job, p);

		try {
			// Sampling happens inside the try so that a partially written
			// partition file is also cleaned up if it fails.
			LOG.info("sampling input");
			TextSampler.writePartitionFile(new TsvInputFormat(), job, partitionFile);
			LOG.info("created partitions");

			DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
			DistributedCache.createSymlink(job.getConfiguration());

			int retcode = job.waitForCompletion(true) ? 0 : 1;
			LOG.info("done");
			return retcode;
		}
		finally {
			LOG.debug("deleting partition file " + partitionFile);
			fs.delete(partitionFile, false);
		}
	}

	/**
	 * Command line entry point: runs the tool through ToolRunner and exits with
	 * its return code.
	 * @param args command line arguments passed on to {@link #run(String[])}
	 */
	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new Configuration(), new TsvSort(), args);
		System.exit(res);
	}
}