ReadSupport.java example

Explorer
pbase-master
/* 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package parquet.hadoop.api;

import java.util.Map;

import org.apache.hadoop.conf.Configuration;

import parquet.io.api.RecordMaterializer;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

/**
 * Abstraction used by the {@link parquet.hadoop.ParquetInputFormat} to materialize records
 *
 * @param <T> the type of the materialized record
 * @author Julien Le Dem
 */
abstract public class ReadSupport<T> {

    /**
     * configuration key for a parquet read projection schema
     */
    public static final String PARQUET_READ_SCHEMA = "parquet.read.schema";

    /**
     * attempts to validate and construct a {@link MessageType} from a read projection schema
     *
     * @param fileMessageType         the typed schema of the source
     * @param partialReadSchemaString the requested projection schema
     * @return the typed schema that should be used to read
     */
    public static MessageType getSchemaForRead(MessageType fileMessageType, String partialReadSchemaString) {
        if (partialReadSchemaString == null)
            return fileMessageType;
        MessageType requestedMessageType = MessageTypeParser.parseMessageType(partialReadSchemaString);
        return getSchemaForRead(fileMessageType, requestedMessageType);
    }

    public static MessageType getSchemaForRead(MessageType fileMessageType, MessageType projectedMessageType) {
        fileMessageType.checkContains(projectedMessageType);
        return projectedMessageType;
    }

    /**
     * called in {@link org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)} in the front end
     *
     * @param configuration    the job configuration
     * @param keyValueMetaData the app specific metadata from the file
     * @param fileSchema       the schema of the file
     * @return the readContext that defines how to read the file
     * @deprecated override {@link ReadSupport#init(InitContext)} instead
     */
    @Deprecated
    public ReadContext init(
            Configuration configuration,
            Map<String, String> keyValueMetaData,
            MessageType fileSchema) {
        throw new UnsupportedOperationException("Override init(InitContext)");
    }

    /**
     * called in {@link org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)} in the front end
     *
     * @param context the initialisation context
     * @return the readContext that defines how to read the file
     */
    public ReadContext init(InitContext context) {
        return init(context.getConfiguration(), context.getMergedKeyValueMetaData(), context.getFileSchema());
    }

    /**
     * called in {@link org.apache.hadoop.mapreduce.RecordReader#initialize(org.apache.hadoop.mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext)} in the back end
     * the returned RecordMaterializer will materialize the records and add them to the destination
     *
     * @param configuration    the job configuration
     * @param keyValueMetaData the app specific metadata from the file
     * @param fileSchema       the schema of the file
     * @param readContext      returned by the init method
     * @return the recordMaterializer that will materialize the records
     */
    abstract public RecordMaterializer<T> prepareForRead(
            Configuration configuration,
            Map<String, String> keyValueMetaData,
            MessageType fileSchema,
            ReadContext readContext);

    /**
     * information to read the file
     *
     * @author Julien Le Dem
     */
    public static final class ReadContext {
        private final MessageType requestedSchema;
        private final Map<String, String> readSupportMetadata;

        /**
         * @param requestedSchema the schema requested by the user. Can not be null.
         */
        public ReadContext(MessageType requestedSchema) {
            this(requestedSchema, null);
        }

        /**
         * @param requestedSchema     the schema requested by the user. Can not be null.
         * @param readSupportMetadata metadata specific to the ReadSupport implementation. Will be available in the prepareForRead phase.
         */
        public ReadContext(MessageType requestedSchema, Map<String, String> readSupportMetadata) {
            super();
            if (requestedSchema == null) {
                throw new NullPointerException("requestedSchema");
            }
            this.requestedSchema = requestedSchema;
            this.readSupportMetadata = readSupportMetadata;
        }

        /**
         * @return the schema of the file
         */
        public MessageType getRequestedSchema() {
            return requestedSchema;
        }

        /**
         * @return metadata specific to the ReadSupport implementation
         */
        public Map<String, String> getReadSupportMetadata() {
            return readSupportMetadata;
        }
    }
}