/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.knittingboar.io;

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import junit.framework.TestCase;

public class TestInputRecordsSplit extends TestCase {

  private static final Log LOG = LogFactory
      .getLog(TestInputRecordsSplit.class.getName());

  private static int MAX_LENGTH = 1000;

  private static JobConf defaultConf = new JobConf();
  private static FileSystem localFs = null;

  static {
    try {
      defaultConf.set("fs.defaultFS", "file:///");
      localFs = FileSystem.getLocal(defaultConf);
    } catch (IOException e) {
      throw new RuntimeException("init failure", e);
    }
  }

  private static Path workDir = new Path(new Path(System.getProperty(
      "test.build.data", "/tmp")), "TestInputRecordsSplit")
      .makeQualified(localFs);

  /**
   * Create an InputRecordsSplit and then read some records
   *
   * - make sure we maintain split discipline
   *
   * @throws IOException
   */
  public void testReadSplitViaInputRecordsSplit() throws IOException {

    // InputRecordsSplit(JobConf jobConf, InputSplit split)
    // needs to get a jobConf from somewhere, under the hood
    // needs a split calculated from the aforementioned jobConf

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit.txt");

    int tmp_file_size = 2000;

    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024)
        + "MB");

    // write a test file with a known number of records
    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
      for (int i = 0; i < tmp_file_size; i++) {
        writer.write("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
        writer.write("\n");
      }
    } finally {
      writer.close();
    }

    System.out.println("file write complete, wrote " + tmp_file_size + " recs");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    System.out.println("> setting splits for: " + workDir);

    // localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);
    LOG.info("requested " + numSplits + " splits, splitting: got = "
        + splits.length);

    System.out.println("---- debug splits --------- ");

    // InputSplit test_split = null;
    int total_read = 0;

    for (int x = 0; x < splits.length; x++) {

      System.out.println("> Split [" + x + "]: " + splits[x].getLength());

      int count = 0;
      InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
      while (custom_reader.next(value)) {
        count++;
      }

      System.out.println("read: " + count + " records for split " + x);
      total_read += count;

    } // for each split

    System.out.println("--------- total read across all splits: " + total_read);
    assertEquals(tmp_file_size, total_read);
  }
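  /*
   * A note on the class under test: InputRecordsSplit is not defined in this
   * file. From its usage at the call sites in these tests, it appears to wrap
   * a line-oriented record reader over a single InputSplit, with next(Text)
   * advancing one record at a time and returning false at the end of the
   * split, and ResetToStartOfSplit() seeking back to the split's first
   * record. This is an assumption read off the tests themselves, not a
   * description of the actual implementation.
   */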
  /*
  public void testRCV1Splits() throws IOException {

    String file_rcv1 = "/Users/jpatterson/Downloads/rcv1/rcv1.train.vw";
    System.out.println("testRCV1Splits >> " + file_rcv1);

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(file_rcv1);

    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);
    LOG.info("requested " + numSplits + " splits, splitting: got = "
        + splits.length);

    System.out.println("---- debug splits --------- ");

    // InputSplit test_split = null;
    int total_read = 0;

    for (int x = 0; x < splits.length; x++) {

      System.out.println("> Split [" + x + "]: " + splits[x].toString()
          + ", len:" + splits[x].getLength());

      int count = 0;
      InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
      while (custom_reader.next(value)) {
        count++;
      }

      System.out.println("read: " + count + " records for split " + x);
      total_read += count;

    } // for each split

    System.out.println("total read across all splits: " + total_read);
  }
  */

  public void testReadSplitViaInputRecordsSplit_SplitReset() throws IOException {

    // InputRecordsSplit(JobConf jobConf, InputSplit split)
    // needs to get a jobConf from somewhere, under the hood
    // needs a split calculated from the aforementioned jobConf

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit_SplitReset");

    int tmp_file_size = 2000;

    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024)
        + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
      for (int i = 0; i < tmp_file_size; i++) {
        writer.write("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
        writer.write("\n");
      }
    } finally {
      writer.close();
    }

    System.out.println("file write complete, wrote " + tmp_file_size + " recs");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    // localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);
    LOG.info("requested " + numSplits + " splits, splitting: got = "
        + splits.length);

    System.out.println("---- testReadSplitViaInputRecordsSplit_SplitReset: debug splits --------- ");

    int total_read = 0;

    System.out.println("> Split [0]: " + splits[0].getLength());

    int count = 0;
    InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[0]);
    while (custom_reader.next(value)) {
      count++;
    }

    System.out.println("read: " + count + " records for split " + 0);
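    // Re-read the same split from the start; if ResetToStartOfSplit() honors
    // split boundaries as assumed above, the second pass should yield exactly
    // the same record count as the first.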
    int count_reset = 0;
    custom_reader.ResetToStartOfSplit();
    while (custom_reader.next(value)) {
      count_reset++;
    }

    System.out.println("read: " + count_reset
        + " records for split after reset " + 0);

    assertEquals(count, count_reset);

  }

}