/**
 * Copyright 2011-2017 Asakusa Framework Team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.asakusafw.directio.hive.parquet;

import java.util.Arrays;

import org.apache.hadoop.hive.common.type.HiveBaseChar;
import org.apache.hadoop.io.Text;

import com.asakusafw.runtime.value.StringOption;
import com.asakusafw.runtime.value.ValueOption;

import parquet.column.Dictionary;
import parquet.io.api.Binary;
import parquet.io.api.RecordConsumer;
import parquet.schema.OriginalType;
import parquet.schema.PrimitiveType.PrimitiveTypeName;
import parquet.schema.Type;
import parquet.schema.Types;

/**
 * {@link ParquetValueDriver} for character strings with a length limit.
 * @since 0.7.2
 */
public class LimitedStringValueDriver implements ParquetValueDriver {

    private final int length;

    /**
     * Creates a new instance.
     * @param length the maximum length (in Unicode code points)
     */
    public LimitedStringValueDriver(int length) {
        this.length = length;
    }

    @Override
    public Type getType(String name) {
        return Types.optional(PrimitiveTypeName.BINARY)
                .as(OriginalType.UTF8)
                .named(name);
    }

    @Override
    public ValueConverter getConverter() {
        return new ToStringOption();
    }

    @Override
    public ValueWriter getWriter() {
        return new FromStringOption(length);
    }

    static final class FromStringOption implements ValueWriter {

        private final int limit;

        FromStringOption(int length) {
            this.limit = length;
        }

        @Override
        public void write(Object value, RecordConsumer consumer) {
            StringOption option = (StringOption) value;
            Text text = option.get();
            byte[] bytes = text.getBytes();
            int length = text.getLength();
            if (length > limit) {
                // the byte length exceeds the limit, so the code point count may also exceed it;
                // HiveBaseChar.getPaddedValue() truncates over-long values to the limit
                // (and pads shorter values with trailing spaces, following Hive CHAR semantics)
                String stripped = HiveBaseChar.getPaddedValue(text.toString(), limit);
                consumer.addBinary(Binary.fromString(stripped));
            } else {
                // the code point count never exceeds the UTF-8 byte length, so the value
                // already fits within the limit and the raw bytes can be written as-is
                consumer.addBinary(Binary.fromByteArray(bytes, 0, length));
            }
        }
    }

    static final class ToStringOption extends ValueConverter {

        private Text[] dict;

        private StringOption target;

        @Override
        public void set(ValueOption<?> value) {
            this.target = (StringOption) value;
        }

        @Override
        public boolean hasDictionarySupport() {
            return true;
        }

        @Override
        public void setDictionary(Dictionary dictionary) {
            // decode each dictionary entry once into a reusable Text buffer,
            // so that addValueFromDictionary() becomes a simple array lookup
            Text[] buf = prepareDictionaryBuffer(dictionary);
            for (int id = 0, max = dictionary.getMaxId(); id <= max; id++) {
                Text text = new Text();
                Binary binary = dictionary.decodeToBinary(id);
                setBinary(text, binary);
                buf[id] = text;
            }
        }

        @SuppressWarnings("deprecation")
        @Override
        public void addValueFromDictionary(int dictionaryId) {
            // StringOption.modify(Text) is deprecated, hence the suppression above
            target.modify(dict[dictionaryId]);
        }

        @Override
        public void addBinary(Binary value) {
            // make the option present, then fill its internal Text buffer directly
            target.reset();
            setBinary(target.get(), value);
        }

        private Text[] prepareDictionaryBuffer(Dictionary dictionary) {
            int size = dictionary.getMaxId() + 1;
            if (this.dict == null || this.dict.length < size) {
                // grow with some headroom to avoid re-allocating for every new dictionary
                int capacity = (int) (size * 1.2) + 1;
                this.dict = new Text[capacity];
            } else {
                Arrays.fill(this.dict, null);
            }
            return this.dict;
        }

        private void setBinary(Text text, Binary binary) {
            // TODO check its length?
            text.set(binary.getBytes(), 0, binary.length());
        }
    }
}
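
// A minimal usage sketch (not part of the original source): it shows how the
// three driver methods fit together when this class is used directly. The
// field name "name" and the limit of 10 code points are hypothetical values
// chosen only for illustration; in practice the surrounding Direct I/O Hive
// machinery resolves and invokes ParquetValueDriver instances itself.
//
//   ParquetValueDriver driver = new LimitedStringValueDriver(10);
//   Type fieldType = driver.getType("name");          // optional BINARY (UTF8) field
//   ValueWriter writer = driver.getWriter();          // enforces the 10 code point limit
//   ValueConverter converter = driver.getConverter(); // restores StringOption values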