/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io.parquet;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TimeZone;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper;
import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetTableUtils;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import com.google.common.collect.Lists;

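/**
 * Tests that a Hive filter expression pushed down through the job
 * configuration prunes Parquet row groups whose column statistics
 * cannot satisfy the predicate.
 */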
public class TestParquetRowGroupFilter extends AbstractTestParquetDirect {

  JobConf conf;
  String columnNames;
  String columnTypes;

  @Before
  public void initConf() throws Exception {
    conf = new JobConf();
  }
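
  /**
   * Writes a single row group with intCol = 0..99, then checks that the row
   * group is kept when the predicate can match (intCol > 50) and pruned when
   * it cannot (intCol > 100).
   */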
  @Test
  public void testRowGroupFilterTakeEffect() throws Exception {
    // define a schema with a single int column
    columnNames = "intCol";
    columnTypes = "int";
    StructObjectInspector inspector = getObjectInspector(columnNames, columnTypes);
    MessageType fileSchema = MessageTypeParser.parseMessageType(
        "message hive_schema {\n"
            + "  optional int32 intCol;\n"
            + "}\n"
    );
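
    // project the single column and describe the Hive schema for the reader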
    conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "intCol");
    conf.set("columns", "intCol");
    conf.set("columns.types", "int");

    // write a Parquet file holding the values 0..99 in a single row group
    Path testPath = writeDirect("RowGroupFilterTakeEffect", fileSchema,
        new DirectWriter() {
          @Override
          public void write(RecordConsumer consumer) {
            for (int i = 0; i < 100; i++) {
              consumer.startMessage();
              consumer.startField("intCol", 0);
              consumer.addInteger(i);
              consumer.endField("intCol", 0);
              consumer.endMessage();
            }
          }
        });

    // build the filter predicate intCol > 50 and push it down through the job conf
    GenericUDF udf = new GenericUDFOPGreaterThan();
    List<ExprNodeDesc> children = Lists.newArrayList();
    ExprNodeColumnDesc columnDesc = new ExprNodeColumnDesc(Integer.class, "intCol", "T", false);
    ExprNodeConstantDesc constantDesc = new ExprNodeConstantDesc(50);
    children.add(columnDesc);
    children.add(constantDesc);
    ExprNodeGenericFuncDesc genericFuncDesc = new ExprNodeGenericFuncDesc(inspector, udf, children);
    String searchArgumentStr = SerializationUtilities.serializeExpression(genericFuncDesc);
    conf.set(TableScanDesc.FILTER_EXPR_CONF_STR, searchArgumentStr);
    conf.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getDefault().getID());
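
    // open a reader with the pushed-down filter; getFiltedBlocks() returns the
    // row groups remaining after filtering, and values 51..99 match intCol > 50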
    ParquetRecordReaderWrapper recordReader = (ParquetRecordReaderWrapper)
        new MapredParquetInputFormat().getRecordReader(
            new FileSplit(testPath, 0, fileLength(testPath), (String[]) null), conf, null);
    Assert.assertEquals("row group is not filtered correctly", 1, recordReader.getFiltedBlocks().size());

    // repeat with intCol > 100: no value can match, so the row group is pruned
    constantDesc = new ExprNodeConstantDesc(100);
    children.set(1, constantDesc);
    genericFuncDesc = new ExprNodeGenericFuncDesc(inspector, udf, children);
    searchArgumentStr = SerializationUtilities.serializeExpression(genericFuncDesc);
    conf.set(TableScanDesc.FILTER_EXPR_CONF_STR, searchArgumentStr);

    recordReader = (ParquetRecordReaderWrapper)
        new MapredParquetInputFormat().getRecordReader(
            new FileSplit(testPath, 0, fileLength(testPath), (String[]) null), conf, null);
    Assert.assertEquals("row group is not filtered correctly", 0, recordReader.getFiltedBlocks().size());
  }
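
  /**
   * Builds an ObjectInspector for a struct row described by comma-separated
   * column name and type strings.
   */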
  private ArrayWritableObjectInspector getObjectInspector(final String columnNames, final String columnTypes) {
    List<TypeInfo> columnTypeList = createHiveTypeInfoFrom(columnTypes);
    List<String> columnNameList = createHiveColumnsFrom(columnNames);
    StructTypeInfo rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNameList, columnTypeList);
    return new ArrayWritableObjectInspector(rowTypeInfo);
  }
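
  /** Splits a comma-separated column name string; an empty string yields an empty list. */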
  private List<String> createHiveColumnsFrom(final String columnNamesStr) {
    List<String> columnNames;
    if (columnNamesStr.length() == 0) {
      columnNames = new ArrayList<String>();
    } else {
      columnNames = Arrays.asList(columnNamesStr.split(","));
    }
    return columnNames;
  }
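
  /** Parses a Hive type string (e.g. "int") into TypeInfo objects; an empty string yields an empty list. */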
  private List<TypeInfo> createHiveTypeInfoFrom(final String columnsTypeStr) {
    List<TypeInfo> columnTypes;
    if (columnsTypeStr.length() == 0) {
      columnTypes = new ArrayList<TypeInfo>();
    } else {
      columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnsTypeStr);
    }
    return columnTypes;
  }
}