package skywriting.examples.terasort; /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.DataOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; import skywriting.examples.grep.Text; /** * An input format that reads the first 10 characters of each line as the key * and the rest of the line as the value. Both key and value are represented * as Text. */ public class TeraInputFormat { static final String PARTITION_FILENAME = "_partition.lst"; static final String SAMPLE_SIZE = "terasort.partitions.sample"; //private static JobConf lastConf = null; //private static InputSplit[] lastResult = null; static class TextSampler implements IndexedSortable { private ArrayList<Text> records = new ArrayList<Text>(); public int compare(int i, int j) { Text left = records.get(i); Text right = records.get(j); return left.compareTo(right); } public void swap(int i, int j) { Text left = records.get(i); Text right = records.get(j); records.set(j, left); records.set(i, right); } public void addKey(Text key) { records.add(new Text(key)); } /** * Find the split points for a given sample. The sample keys are sorted * and down sampled to find even split points for the partitions. The * returned keys should be the start of their respective partitions. * @param numPartitions the desired number of partitions * @return an array of size numPartitions - 1 that holds the split points */ Text[] createPartitions(int numPartitions) { int numRecords = records.size(); System.out.println("Making " + numPartitions + " from " + numRecords + " records"); if (numPartitions > numRecords) { throw new IllegalArgumentException ("Requested more partitions than input keys (" + numPartitions + " > " + numRecords + ")"); } new QuickSort().sort(this, 0, records.size()); float stepSize = numRecords / (float) numPartitions; System.out.println("Step size is " + stepSize); Text[] result = new Text[numPartitions-1]; for(int i=1; i < numPartitions; ++i) { result[i-1] = records.get(Math.round(stepSize * i)); } return result; } } /** * Use the input splits to take samples of the input and generate sample * keys. By default reads 100,000 keys from 10 locations in the input, sorts * them and picks N-1 keys to generate N equally sized partitions. * @param conf the job to sample * @param partFile where to write the output file to * @throws IOException if something goes wrong */ public static void writePartitionFile(OutputStream partFile, InputStream inputFile, int inputBytes, int partitions) throws IOException { TeraInputFormat inFormat = new TeraInputFormat(); TextSampler sampler = new TextSampler(); Text key = new Text(); Text value = new Text(); long sampleSize = 100000; //conf.getLong(SAMPLE_SIZE, 100000); //InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks()); // I assume you've already split the input data, feeding each split to a mapper // and the whole thing to me for sampling. int samples = Math.min(10, partitions); int filePosition = 0; int bytesPerSample = inputBytes / samples; long recordsPerSample = sampleSize / samples; //int sampleStep = splits.length / samples; long records = 0; // take N samples from different parts of the input for(int i=0; i < samples; ++i) { if(filePosition < (i * bytesPerSample)) { inputFile.skip((i * bytesPerSample) - filePosition); } RecordReader<Text,Text> reader = inFormat.getRecordReader(inputFile, (inputBytes / samples)); while (reader.next(key, value)) { sampler.addKey(key); records += 1; filePosition += (key.getLength() + value.getLength() + 3); if ((i+1) * recordsPerSample <= records) { break; } } } /*FileSystem outFs = partFile.getFileSystem(conf); if (outFs.exists(partFile)) { outFs.delete(partFile, false); }*/ /*SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class, NullWritable.class);*/ /*NullWritable nullValue = NullWritable.get();*/ DataOutputStream dos = new DataOutputStream(partFile); for(Text split : sampler.createPartitions(partitions)) { System.out.println("Writing a partition"); split.write(dos); } dos.close(); } static class TeraRecordReader implements RecordReader<Text,Text> { private LineRecordReader in; private LongWritable junk = new LongWritable(); private Text line = new Text(); private static int KEY_LENGTH = 10; public TeraRecordReader(InputStream input, int startByte, int endByte) throws IOException { in = new LineRecordReader(input, startByte, endByte); } public TeraRecordReader(InputStream input) throws IOException { this(input, 0, Integer.MAX_VALUE); } public void close() throws IOException { in.close(); } public Text createKey() { return new Text(); } public Text createValue() { return new Text(); } public long getPos() throws IOException { return in.getPos(); } public float getProgress() throws IOException { return in.getProgress(); } public boolean next(Text key, Text value) throws IOException { if (in.next(junk, line)) { if (line.getLength() < KEY_LENGTH) { key.set(line); value.clear(); } else { byte[] bytes = line.getBytes(); key.set(bytes, 0, KEY_LENGTH); value.set(bytes, KEY_LENGTH, line.getLength() - KEY_LENGTH); } return true; } else { return false; } } } public RecordReader<Text, Text> getRecordReader(InputStream input, int length) throws IOException { return new TeraRecordReader(input, 1, length + 1); // Hack to line-align } }