/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.orc;

import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertTrue;
import static junit.framework.Assert.assertFalse;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hive.common.util.HiveTestUtils;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;

public class TestNewInputOutputFormat {

  Path workDir = new Path(System.getProperty("test.tmp.dir",
      "target" + File.separator + "test" + File.separator + "tmp"));

  Configuration conf;
  FileSystem localFs;

  @Before
  public void setup() throws Exception {
    conf = new Configuration();
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "local");
    localFs = FileSystem.get(conf);
  }

  @Rule
  public TestName testCaseName = new TestName();

  public static class OrcTestMapper1 extends
      Mapper<Object, Writable, Text, Text> {
    @Override
    public void map(Object key, Writable value, Context context)
        throws IOException, InterruptedException {
      context.write(null, new Text(value.toString()));
    }
  }

  @Test // Test regular inputformat
  public void testNewInputFormat() throws Exception {
    Job job = new Job(conf, "orc test");
    job.setInputFormatClass(OrcNewInputFormat.class);
    job.setJarByClass(TestNewInputOutputFormat.class);
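    // Map-only job: with no reducers, the mapper's Text output is written
    // by the default TextOutputFormat, one ORC row's toString() per line.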
    job.setMapperClass(OrcTestMapper1.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(HiveTestUtils
        .getFileFromClasspath("orc-file-11-format.orc")));
    Path outputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".txt");
    localFs.delete(outputPath, true);
    FileOutputFormat.setOutputPath(job, outputPath);
    boolean result = job.waitForCompletion(true);
    assertTrue(result);

    Path outputFilePath = new Path(outputPath, "part-m-00000");
    assertTrue(localFs.exists(outputFilePath));
    BufferedReader reader = new BufferedReader(
        new InputStreamReader(localFs.open(outputFilePath)));
    int count = 0;
    String line;
    String lastLine = null;
    while ((line = reader.readLine()) != null) {
      count++;
      lastLine = line;
    }
    reader.close();
    // All 7500 rows of orc-file-11-format.orc should come through, ending
    // with a known last row.
    assertEquals(count, 7500);
    assertEquals(lastLine, "{true, 100, 2048, 65536,"
        + " 9223372036854775807, 2.0, -5.0"
        + ", , bye, {[{1, bye}, {2, sigh}]}, [{100000000, cat},"
        + " {-100000, in}, {1234, hat}],"
        + " {chani={5, chani}, mauddib={1, mauddib}},"
        + " 2000-03-12 15:00:01, 12345678.6547457}");
    localFs.delete(outputPath, true);
  }

  public static class OrcTestMapper2 extends
      Mapper<Object, Text, Object, Writable> {
    private final TypeInfo typeInfo = TypeInfoUtils
        .getTypeInfoFromTypeString("struct<a:int,b:string>");
    private final ObjectInspector oip = TypeInfoUtils
        .getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
    private final OrcSerde serde = new OrcSerde();
    private Writable row;

    @Override
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      // Parse each "int,string" line into a struct<a:int,b:string> and
      // serialize it to a writable ORC row.
      String[] items = value.toString().split(",");
      List<Object> struct = new ArrayList<Object>(2);
      struct.add(0, Integer.parseInt(items[0]));
      struct.add(1, items[1]);
      row = serde.serialize(struct, oip);
      context.write(null, row);
    }
  }

  @Test // Test regular outputformat
  public void testNewOutputFormat() throws Exception {
    int rownum = 1000;

    Path inputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".txt");
    Path outputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".orc");
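    // Clear any stale output, then seed the job with rownum random
    // "int,uuid" CSV lines, remembering the first row so it can be checked
    // against the ORC output below.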
    localFs.delete(outputPath, true);
    PrintWriter pw = new PrintWriter(
        new OutputStreamWriter(localFs.create(inputPath)));
    Random r = new Random(1000L);
    boolean firstRow = true;
    int firstIntValue = 0;
    String firstStringValue = null;
    for (int i = 0; i < rownum; i++) {
      int intValue = r.nextInt();
      String stringValue = UUID.randomUUID().toString();
      if (firstRow) {
        firstRow = false;
        firstIntValue = intValue;
        firstStringValue = stringValue;
      }
      pw.println(intValue + "," + stringValue);
    }
    pw.close();

    Job job = new Job(conf, "orc test");
    job.setOutputFormatClass(OrcNewOutputFormat.class);
    job.setJarByClass(TestNewInputOutputFormat.class);
    job.setMapperClass(OrcTestMapper2.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Writable.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    boolean result = job.waitForCompletion(true);
    assertTrue(result);

    Path outputFilePath = new Path(outputPath, "part-m-00000");
    assertTrue(localFs.exists(outputFilePath));
    Reader reader = OrcFile.createReader(outputFilePath,
        OrcFile.readerOptions(conf).filesystem(localFs));
    assertTrue(reader.getNumberOfRows() == rownum);
    assertEquals(reader.getCompression(), CompressionKind.ZLIB);
    StructObjectInspector soi =
        (StructObjectInspector) reader.getObjectInspector();
    StructTypeInfo ti =
        (StructTypeInfo) TypeInfoUtils.getTypeInfoFromObjectInspector(soi);
    assertEquals(((PrimitiveTypeInfo) ti.getAllStructFieldTypeInfos().get(0))
        .getPrimitiveCategory(),
        PrimitiveObjectInspector.PrimitiveCategory.INT);
    assertEquals(((PrimitiveTypeInfo) ti.getAllStructFieldTypeInfos().get(1))
        .getPrimitiveCategory(),
        PrimitiveObjectInspector.PrimitiveCategory.STRING);

    RecordReader rows = reader.rows();
    Object row = rows.next(null);
    IntWritable intWritable = (IntWritable) soi.getStructFieldData(row,
        soi.getAllStructFieldRefs().get(0));
    Text text = (Text) soi.getStructFieldData(row,
        soi.getAllStructFieldRefs().get(1));
    assertEquals(intWritable.get(), firstIntValue);
    assertEquals(text.toString(), firstStringValue);
    localFs.delete(outputPath, true);
  }

  @Test // Test outputformat with compression
  public void testNewOutputFormatWithCompression() throws Exception {
    conf.set("hive.exec.orc.default.compress", "SNAPPY");

    Path inputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".txt");
    Path outputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".orc");
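    // Two rows are enough here; what matters is that the SNAPPY default set
    // on the conf above makes it through to the ORC writer.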
    localFs.delete(outputPath, true);
    PrintWriter pw = new PrintWriter(
        new OutputStreamWriter(localFs.create(inputPath)));
    pw.println("1,hello");
    pw.println("2,world");
    pw.close();

    Job job = new Job(conf, "orc test");
    job.setOutputFormatClass(OrcNewOutputFormat.class);
    job.setJarByClass(TestNewInputOutputFormat.class);
    job.setMapperClass(OrcTestMapper2.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(OrcSerdeRow.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    boolean result = job.waitForCompletion(true);
    assertTrue(result);

    Path outputFilePath = new Path(outputPath, "part-m-00000");
    Reader reader = OrcFile.createReader(outputFilePath,
        OrcFile.readerOptions(conf).filesystem(localFs));
    assertEquals(reader.getCompression(), CompressionKind.SNAPPY);
    localFs.delete(outputPath, true);
  }

  public static class OrcTestMapper3 extends
      Mapper<Object, Text, IntWritable, Text> {
    @Override
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      // Key each line by its number of words.
      String[] items = value.toString().split("\\s+");
      context.write(new IntWritable(items.length), value);
    }
  }

  public static class OrcTestReducer3 extends
      Reducer<IntWritable, Text, NullWritable, Writable> {
    final static TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(
        "struct<length:int,count:int,list:array"
        + "<struct<lastword:string,lastwordlength:int>>,"
        + "wordcounts:map<string,int>>");
    private final ObjectInspector oip = TypeInfoUtils
        .getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
    private final OrcSerde serde = new OrcSerde();
    private Writable row;

    @Override
    public void reduce(IntWritable key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      // Collect the last word of every line in the group and count each
      // word's occurrences across the group.
      List<String> lastwords = new ArrayList<String>();
      Map<String, Integer> wordCounts = new HashMap<String, Integer>();
      int count = 0;
      for (Text val : values) {
        String[] items = val.toString().toLowerCase().split("\\s+");
        lastwords.add(items[items.length - 1]);
        for (String item : items) {
          if (wordCounts.containsKey(item)) {
            wordCounts.put(item, wordCounts.get(item) + 1);
          } else {
            wordCounts.put(item, 1);
          }
        }
        count++;
      }
      List<Object> struct = new ArrayList<Object>(4);
      struct.add(0, key.get());
      struct.add(1, count);
      List<List<Object>> lastWordInfoList = new ArrayList<List<Object>>();
      Collections.sort(lastwords);
      for (String word : lastwords) {
        List<Object> info = new ArrayList<Object>(2);
        info.add(0, word);
        info.add(1, word.length());
        lastWordInfoList.add(info);
      }
      struct.add(2, lastWordInfoList);
      struct.add(3, wordCounts);
      row = serde.serialize(struct, oip);
      context.write(NullWritable.get(), row);
    }
  }

  @SuppressWarnings("unchecked")
  @Test // Test outputformat with complex data type, and with reduce
  public void testNewOutputFormatComplex() throws Exception {
    Path inputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".txt");
    Path outputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".orc");
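    // The input is a short poem; the mapper keys each line by its word
    // count and the reducer folds each group into the nested struct type
    // declared in OrcTestReducer3.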
    localFs.delete(outputPath, true);
    PrintWriter pw = new PrintWriter(
        new OutputStreamWriter(localFs.create(inputPath)));
    pw.println("I have eaten");
    pw.println("the plums");
    pw.println("that were in");
    pw.println("the icebox");
    pw.println("and which");
    pw.println("you were probably");
    pw.println("saving");
    pw.println("for breakfast");
    pw.println("Forgive me");
    pw.println("they were delicious");
    pw.println("so sweet");
    pw.println("and so cold");
    pw.close();

    Job job = new Job(conf, "orc test");
    job.setOutputFormatClass(OrcNewOutputFormat.class);
    job.setJarByClass(TestNewInputOutputFormat.class);
    job.setMapperClass(OrcTestMapper3.class);
    job.setReducerClass(OrcTestReducer3.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(OrcSerdeRow.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    boolean result = job.waitForCompletion(true);
    assertTrue(result);

    Path outputFilePath = new Path(outputPath, "part-r-00000");
    Reader reader = OrcFile.createReader(outputFilePath,
        OrcFile.readerOptions(conf).filesystem(localFs));
    RecordReader rows = reader.rows();
    ObjectInspector orcOi = reader.getObjectInspector();
    ObjectInspector stoi = TypeInfoUtils
        .getStandardJavaObjectInspectorFromTypeInfo(OrcTestReducer3.typeInfo);
    ObjectInspectorConverters.Converter converter = ObjectInspectorConverters
        .getConverter(orcOi, stoi);

    Object row = rows.next(null);
    List<Object> converted = (List<Object>) converter.convert(row);
    assertEquals(1, converted.get(0));
    assertEquals(1, converted.get(1));
    List<Object> list = (List<Object>) converted.get(2);
    assertEquals(list.size(), 1);
    assertEquals("saving", ((List<Object>) list.get(0)).get(0));
    assertEquals(6, ((List<Object>) list.get(0)).get(1));
    Map<String, Integer> map = (Map<String, Integer>) converted.get(3);
    assertEquals(map.size(), 1);
    assertEquals(map.get("saving"), new Integer(1));

    row = rows.next(null);
    converted = (List<Object>) converter.convert(row);
    assertEquals(2, converted.get(0));
    assertEquals(6, converted.get(1));
    list = (List<Object>) converted.get(2);
    assertEquals(list.size(), 6);
    assertEquals("breakfast", ((List<Object>) list.get(0)).get(0));
    assertEquals(9, ((List<Object>) list.get(0)).get(1));
    map = (Map<String, Integer>) converted.get(3);
    assertEquals(map.size(), 11);
    assertEquals(map.get("the"), new Integer(2));

    row = rows.next(null);
    converted = (List<Object>) converter.convert(row);
    assertEquals(3, converted.get(0));
    assertEquals(5, converted.get(1));
    list = (List<Object>) converted.get(2);
    assertEquals(list.size(), 5);
    assertEquals("cold", ((List<Object>) list.get(0)).get(0));
    assertEquals(4, ((List<Object>) list.get(0)).get(1));
    map = (Map<String, Integer>) converted.get(3);
    assertEquals(map.size(), 13);
    assertEquals(map.get("were"), new Integer(3));

    assertFalse(rows.hasNext());
    localFs.delete(outputPath, true);
  }

  @Test // Test inputformat with column prune
  public void testNewInputFormatPruning() throws Exception {
    conf.set("hive.io.file.read.all.columns", "false");
    conf.set("hive.io.file.readcolumn.ids", "1,3");
    Job job = new Job(conf, "orc test");
    job.setInputFormatClass(OrcNewInputFormat.class);
    job.setJarByClass(TestNewInputOutputFormat.class);
    job.setMapperClass(OrcTestMapper1.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(HiveTestUtils
        .getFileFromClasspath("orc-file-11-format.orc")));
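    // Only columns 1 and 3 were requested above, so every other field of
    // the first row should be read back as null.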
    Path outputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".txt");
    localFs.delete(outputPath, true);
    FileOutputFormat.setOutputPath(job, outputPath);
    boolean result = job.waitForCompletion(true);
    assertTrue(result);

    Path outputFilePath = new Path(outputPath, "part-m-00000");
    BufferedReader reader = new BufferedReader(
        new InputStreamReader(localFs.open(outputFilePath)));
    String line = reader.readLine();
    assertEquals(line, "{null, 1, null, 65536, null, null, null, "
        + "null, null, null, null, null, null, null}");
    localFs.delete(outputPath, true);
  }
}