TeraInputFormat.java example

Explorer

ciel-java-master
- bindings
  - src
    - main
      - java
        com
        asgow
        ciel
        executor
        Ciel.java
        Java2Executor.java
        SoftCache.java
        io
        CielInputStream.java
        CielOutputStream.java
        references
        CielFuture.java
        CompletedReference.java
        ConcreteReference.java
        DummyWritableReference.java
        FutureReference.java
        JsonReferenceType.java
        Netloc.java
        Reference.java
        StreamReference.java
        SweetheartReference.java
        ValueReference.java
        WritableReference.java
        rpc
        EnvRpcTest.java
        JsonPipeRpc.java
        ReferenceUnavailableException.java
        ShutdownException.java
        WorkerRpc.java
        simple
        JarTaskLoader.java
        Task.java
        tasks
        ConstantNumOutputsTask.java
        FirstClassJavaTask.java
        FirstClassJavaTaskInformation.java
        JsonTaskInformation.java
        SingleOutputTask.java
        StdinoutTaskInformation.java
        TaskInformation.java
- examples
  - Grep
    - src
      - skywriting
        examples
        grep
        AbstractOutputCollector.java
        BinaryComparable.java
        Combiner.java
        DataInputBuffer.java
        GrepMapper.java
        GrepReducer1.java
        GrepReducer2.java
        IdentityCombiner.java
        IncrementerCombiner.java
        IntWritable.java
        OutputCollector.java
        PartialHashOutputCollector.java
        PrintHelper.java
        RawComparator.java
        ReflectionUtils.java
        SetInsertCombiner.java
        SortedPartialHashOutputCollector.java
        Text.java
        Writable.java
        WritableComparable.java
        WritableComparator.java
  - Mandelbrot
    - src
      - java
        skywriting
        examples
        mandelbrot
        Mandelbrot.java
        Stitch.java
  - Pi
    - src
      - java
        skywriting
        examples
        pi
        PiMapper.java
        PiReducer.java
  - TeraSort
    - src
      - skywriting
        examples
        terasort
        BinaryComparable.java
        DataInputBuffer.java
        HeapSort.java
        IndexedSortable.java
        LineReader.java
        LineRecordReader.java
        LongWritable.java
        Merger.java
        PriorityQueue.java
        QuickSort.java
        RawComparator.java
        RecordReader.java
        ReflectionUtils.java
        SWTeraBucketer.java
        SWTeraDriver.java
        SWTeraMerger.java
        SWTeraPreBucketer.java
        SWTeraSampler.java
        TeraInputFormat.java
        Text.java
        TextPairIterator.java
        TotalOrderPartitioner.java
        Writable.java
        WritableComparable.java
        WritableComparator.java
  - WordCount
    - src
      - skywriting
        examples
        wordcount
        BinaryComparable.java
        Combiner.java
        DataInputBuffer.java
        IncrementerCombiner.java
        IntWritable.java
        OutputCollector.java
        PartialHashOutputCollector.java
        PrintHelper.java
        RawComparator.java
        ReflectionUtils.java
        Text.java
        WordCountMapper.java
        WordCountReducer.java
        Writable.java
        WritableComparable.java
        WritableComparator.java
        WritableUtils.java
  - kmeans
    - src
      - java
        skywriting
        examples
        kmeans
        DummyRPC.java
        KMeansDataGenerator.java
        KMeansDataPartitioner.java
        KMeansHead.java
        KMeansInitTask.java
        KMeansMapper.java
        KMeansMapperResult.java
        KMeansReducer.java
  - skydoop
    - src
      - java
        uk
        co
        mrry
        mercator
        mapreduce
        SWInputSplit.java
        SWLineRecordReader.java
        SWLineRecordWriter.java
        SWMapEntryPoint.java
        SWMapperOutputCollector.java
        SWOutputCollector.java
        SWReduceEntryPoint.java
        SWReduceInputMerger.java
  - skyhout
    - src
      - java
        skywriting
        examples
        skyhout
        common
        AbstractOutputCollector.java
        ClosableOutputCollector.java
        Combiner.java
        CombinerReducer.java
        DirectOutputCollector.java
        FakeSeekable.java
        IdentityCombiner.java
        IntArrayWritable.java
        LineRecordFileMapDriver.java
        MapDriver.java
        Mapper.java
        NullCombiner.java
        PartialHashOutputCollector.java
        SequenceFileMapDriver.java
        SequenceFileUtils.java
        SetInsertCombiner.java
        SkyhoutTask.java
        SkywritingTaskFileSystem.java
        SortedInputReduceDriver.java
        SortedInputSequenceFileOutputReduceDriver.java
        SortedInputTextOutputReduceDriver.java
        SortedPartitionedOutputCollector.java
        StringArrayWritable.java
        ZipDriver.java
        Zipper.java
        input
        CachingSequenceFileReader.java
        Reader.java
        SequenceFileReader.java
        VectorInputParserTask.java
        kmeans
        KMeansCombiner.java
        KMeansDataGenerator.java
        KMeansHead.java
        KMeansInitTask.java
        KMeansMapTask.java
        KMeansReduceTask.java
        KMeansReducerCombiner.java
        KMeansSeedGenerator.java
        RandomClusterSequenceFileGenerator.java
        RandomVectorSequenceFileGenerator.java
        linalg
        ConjugateGradientReduceTask.java
        MatrixVectorMultiplyTask.java
        PowerIterationReduceTask.java
        VectorMerger.java
        pagerank
        IntListCombiner.java
        PageRankCombinerReducer.java
        PageRankInitMergeTask.java
        PageRankInitTask.java
        PageRankInitialScoreTask.java
        PageRankReduceTask.java
        PageRankSortMapTask.java
        PageRankSortReduceTask.java
        PageRankZipTask.java
  - src
    - main
      - java
        com
        asgow
        ciel
        examples
        HelloWorld.java
        smithwaterman
        PartitionInputString.java
        SmithWaterman.java
        SmithWatermanBlockTask.java
        SmithWatermanResultTask.java
        ZeroInputStream.java

package skywriting.examples.terasort;
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;

import skywriting.examples.grep.Text;

/**
 * An input format that reads the first 10 bytes of each line as the key
 * and the rest of the line as the value. Both key and value are represented
 * as Text.
 *
 * <p>Besides producing record readers, this class can sample an input stream
 * and write out the split points used to partition the key space (see
 * {@link #writePartitionFile}).
 */
public class TeraInputFormat {

  static final String PARTITION_FILENAME = "_partition.lst";
  static final String SAMPLE_SIZE = "terasort.partitions.sample";

  /** Total number of keys sampled across all sample locations. */
  private static final long DEFAULT_SAMPLE_SIZE = 100000;

  /** Upper bound on the number of locations in the input that are sampled. */
  private static final int MAX_SAMPLE_LOCATIONS = 10;

  /**
   * Collects sample keys and derives partition split points from them.
   * Implements {@link IndexedSortable} so the collected keys can be sorted
   * in place by {@link QuickSort}.
   */
  static class TextSampler implements IndexedSortable {
    private final ArrayList<Text> records = new ArrayList<Text>();

    /** Compares the keys stored at positions {@code i} and {@code j}. */
    public int compare(int i, int j) {
      return records.get(i).compareTo(records.get(j));
    }

    /** Exchanges the keys stored at positions {@code i} and {@code j}. */
    public void swap(int i, int j) {
      Text atI = records.get(i);
      records.set(i, records.get(j));
      records.set(j, atI);
    }

    /**
     * Adds a copy of the given key to the sample. A copy is taken because
     * callers reuse the same Text instance for every record they read.
     * @param key the key to remember
     */
    public void addKey(Text key) {
      records.add(new Text(key));
    }

    /**
     * Find the split points for a given sample. The sample keys are sorted
     * and down sampled to find even split points for the partitions. The
     * returned keys should be the start of their respective partitions.
     * @param numPartitions the desired number of partitions; must be at
     *        least 1 and no larger than the number of sampled keys
     * @return an array of size numPartitions - 1 that holds the split points
     * @throws IllegalArgumentException if numPartitions is less than 1 or
     *         greater than the number of sampled keys
     */
    Text[] createPartitions(int numPartitions) {
      int numRecords = records.size();
      System.out.println("Making " + numPartitions + " from " + numRecords + 
                         " records");
      if (numPartitions < 1) {
        // Previously surfaced as a NegativeArraySizeException further down.
        throw new IllegalArgumentException
          ("Number of partitions must be at least 1 (got " + numPartitions +
           ")");
      }
      if (numPartitions > numRecords) {
        throw new IllegalArgumentException
          ("Requested more partitions than input keys (" + numPartitions +
           " > " + numRecords + ")");
      }
      new QuickSort().sort(this, 0, numRecords);
      float stepSize = numRecords / (float) numPartitions;
      System.out.println("Step size is " + stepSize);
      Text[] result = new Text[numPartitions - 1];
      for (int i = 1; i < numPartitions; ++i) {
        // Clamp: float rounding on very large samples must never index past
        // the last record.
        int index = Math.min(Math.round(stepSize * i), numRecords - 1);
        result[i - 1] = records.get(index);
      }
      return result;
    }
  }

  /**
   * Samples the input and writes the partition split points. Reads up to
   * 100,000 keys from up to 10 evenly spaced locations in the input (fewer
   * locations when fewer partitions are requested), sorts them and picks
   * N-1 keys to generate N equally sized partitions. Each split key is
   * written to {@code partFile} via {@code Text.write}, and {@code partFile}
   * is closed once writing has started, whether or not it succeeds.
   *
   * <p>The caller is expected to pass the whole (unsplit) input here.
   *
   * @param partFile where to write the split points to; closed by this method
   * @param inputFile the input to sample; read sequentially, not closed
   * @param inputBytes the total length of the input in bytes
   * @param partitions the number of partitions to generate; must be at least 1
   * @throws IOException if reading the input or writing the output fails
   * @throws IllegalArgumentException if partitions is less than 1, or if
   *         fewer keys than partitions could be sampled
   */
  public static void writePartitionFile(OutputStream partFile, InputStream inputFile, int inputBytes, int partitions) throws IOException {
    if (partitions < 1) {
      // Previously a divide-by-zero or NegativeArraySizeException.
      throw new IllegalArgumentException
        ("Number of partitions must be at least 1 (got " + partitions + ")");
    }
    TeraInputFormat inFormat = new TeraInputFormat();
    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int samples = Math.min(MAX_SAMPLE_LOCATIONS, partitions);
    int bytesPerSample = inputBytes / samples;
    long recordsPerSample = DEFAULT_SAMPLE_SIZE / samples;
    // Estimated absolute offset of the next unread byte of inputFile.
    long filePosition = 0;
    long records = 0;
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
      long sampleStart = (long) i * bytesPerSample;
      if (filePosition < sampleStart) {
        // Advance the estimate by what was really skipped; otherwise every
        // later sample would seek too far into the input.
        filePosition += skipFully(inputFile, sampleStart - filePosition);
      }

      // The reader is deliberately not closed: closing it would presumably
      // close inputFile, which is still needed for the remaining samples.
      RecordReader<Text,Text> reader =
        inFormat.getRecordReader(inputFile, bytesPerSample);
      while (reader.next(key, value)) {
        sampler.addKey(key);
        records += 1;
        // NOTE(review): the +3 presumably accounts for per-line bytes that
        // are not part of key/value (line terminator etc.) -- confirm. If
        // the record reader buffers its reads, the stream may already be
        // further along than this estimate.
        filePosition += (key.getLength() + value.getLength() + 3);
        if ((i + 1) * recordsPerSample <= records) {
          break;
        }
      }
    }

    DataOutputStream dos = new DataOutputStream(partFile);
    try {
      for (Text split : sampler.createPartitions(partitions)) {
        System.out.println("Writing a partition");
        split.write(dos);
      }
    } finally {
      // Release the output even when partitioning or writing fails.
      dos.close();
    }
  }

  /**
   * Skips up to {@code count} bytes of {@code in}, retrying when
   * {@link InputStream#skip} skips fewer bytes than requested. Stops early
   * only at end of stream.
   * @param in the stream to advance
   * @param count the number of bytes to skip
   * @return the number of bytes actually skipped
   * @throws IOException if the underlying stream fails
   */
  private static long skipFully(InputStream in, long count) throws IOException {
    long remaining = count;
    while (remaining > 0) {
      long skipped = in.skip(remaining);
      if (skipped > 0) {
        remaining -= skipped;
      } else if (in.read() >= 0) {
        // skip() made no progress but the stream is not exhausted; consume
        // a single byte so progress is guaranteed.
        remaining -= 1;
      } else {
        break; // end of stream
      }
    }
    return count - remaining;
  }

  /**
   * Record reader that splits each line produced by a
   * {@link LineRecordReader} into a key (the first {@code KEY_LENGTH} bytes)
   * and a value (the remaining bytes).
   */
  static class TeraRecordReader implements RecordReader<Text,Text> {
    private static final int KEY_LENGTH = 10;

    private final LineRecordReader in;
    // Receives the line reader's key, which is not used here.
    private final LongWritable junk = new LongWritable();
    private final Text line = new Text();

    /**
     * Creates a reader over the given byte range of the input.
     * @param input the stream to read lines from
     * @param startByte passed through to the LineRecordReader as its start
     * @param endByte passed through to the LineRecordReader as its end
     * @throws IOException if the underlying line reader cannot be created
     */
    public TeraRecordReader(InputStream input, int startByte, int endByte) throws IOException {
      in = new LineRecordReader(input, startByte, endByte);
    }

    /**
     * Creates a reader over the range 0 to Integer.MAX_VALUE of the input.
     * @param input the stream to read lines from
     * @throws IOException if the underlying line reader cannot be created
     */
    public TeraRecordReader(InputStream input) throws IOException {
      this(input, 0, Integer.MAX_VALUE);
    }

    /** Closes the underlying line reader. */
    public void close() throws IOException {
      in.close();
    }

    /** @return a new, empty key object */
    public Text createKey() {
      return new Text();
    }

    /** @return a new, empty value object */
    public Text createValue() {
      return new Text();
    }

    /** @return the position reported by the underlying line reader */
    public long getPos() throws IOException {
      return in.getPos();
    }

    /** @return the progress reported by the underlying line reader */
    public float getProgress() throws IOException {
      return in.getProgress();
    }

    /**
     * Reads the next line and splits it into key and value. Lines shorter
     * than {@code KEY_LENGTH} bytes become the key in full, with an empty
     * value.
     * @param key receives the key of the next record
     * @param value receives the value of the next record
     * @return true if a record was read, false if the input is exhausted
     * @throws IOException if the underlying line reader fails
     */
    public boolean next(Text key, Text value) throws IOException {
      if (!in.next(junk, line)) {
        return false;
      }
      int lineLength = line.getLength();
      if (lineLength < KEY_LENGTH) {
        key.set(line);
        value.clear();
      } else {
        byte[] bytes = line.getBytes();
        key.set(bytes, 0, KEY_LENGTH);
        value.set(bytes, KEY_LENGTH, lineLength - KEY_LENGTH);
      }
      return true;
    }
  }

  /**
   * Creates a record reader over the next {@code length} bytes of the input.
   * The range handed to the reader is shifted by one byte (1 to length + 1)
   * as a hack to line-align; presumably a non-zero start makes the
   * LineRecordReader discard the first, possibly partial, line -- confirm
   * against LineRecordReader.
   * @param input the stream to read records from
   * @param length the number of bytes the reader should cover
   * @return a reader producing (key, value) Text pairs
   * @throws IOException if the reader cannot be created
   */
  public RecordReader<Text, Text> 
      getRecordReader(InputStream input, int length) throws IOException {
    // Guard the +1 so that Integer.MAX_VALUE does not wrap to a negative end.
    int endByte = (length == Integer.MAX_VALUE) ? length : length + 1;
    return new TeraRecordReader(input, 1, endByte); // Hack to line-align
  }

}