/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package parquet.hadoop;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

/**
 * An input split for the Parquet format.
 * It contains the information to read one block of the file.
 * <p/>
 * This class is private to the ParquetInputFormat.
 * Backward compatibility is not maintained.
 *
 * @author Julien Le Dem
 */
@Private
public class ParquetInputSplit extends FileSplit implements Writable {

  private long end;
  private long[] rowGroupOffsets;

  /**
   * Writables must have a parameterless constructor
   */
  public ParquetInputSplit() {
    super(null, 0, 0, new String[0]);
  }

  /**
   * For compatibility only;
   * use {@link ParquetInputSplit#ParquetInputSplit(Path, long, long, long, String[], long[])} instead.
   *
   * @param path the path to the file for this split
   * @param start the start offset of this split in the file
   * @param length the length of this split
   * @param hosts the hosts with the replicas of this data
   * @param blocks the metadata of the row groups covered by this split
   * @param requestedSchema the requested schema
   * @param fileSchema the schema of the file
   * @param extraMetadata app specific metadata from the file
   * @param readSupportMetadata app specific metadata provided by the read support in the init phase
   */
  @Deprecated
  public ParquetInputSplit(
      Path path,
      long start,
      long length,
      String[] hosts,
      List<BlockMetaData> blocks,
      String requestedSchema,
      String fileSchema,
      Map<String, String> extraMetadata,
      Map<String, String> readSupportMetadata) {
    this(path, start, length, end(blocks, requestedSchema), hosts, offsets(blocks));
  }

  private static long end(List<BlockMetaData> blocks, String requestedSchema) {
    MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
    long length = 0;
    for (BlockMetaData block : blocks) {
      List<ColumnChunkMetaData> columns = block.getColumns();
      for (ColumnChunkMetaData column : columns) {
        if (requested.containsPath(column.getPath().toArray())) {
          length += column.getTotalSize();
        }
      }
    }
    return length;
  }

  private static long[] offsets(List<BlockMetaData> blocks) {
    long[] offsets = new long[blocks.size()];
    for (int i = 0; i < offsets.length; i++) {
      offsets[i] = blocks.get(i).getStartingPos();
    }
    return offsets;
  }

  /**
   * @return the block meta data
   * @deprecated the file footer is no longer read before creating input splits
   */
  @Deprecated
  public List<BlockMetaData> getBlocks() {
    throw new UnsupportedOperationException(
        "Splits no longer have row group metadata, see PARQUET-234");
  }

  /**
   * Builds a {@code ParquetInputSplit} from a mapreduce {@link FileSplit}.
   *
   * @param split a mapreduce FileSplit
   * @return a ParquetInputSplit
   * @throws IOException if the split's block locations cannot be determined
   */
  static ParquetInputSplit from(FileSplit split) throws IOException {
    return new ParquetInputSplit(split.getPath(), split.getStart(),
        split.getStart() + split.getLength(), split.getLength(),
        split.getLocations(), null);
  }

  /**
   * Builds a {@code ParquetInputSplit} from a mapred
   * {@link org.apache.hadoop.mapred.FileSplit}.
   *
   * @param split a mapred FileSplit
   * @return a ParquetInputSplit
   * @throws IOException if the split's block locations cannot be determined
   */
  static ParquetInputSplit from(org.apache.hadoop.mapred.FileSplit split) throws IOException {
    return new ParquetInputSplit(split.getPath(), split.getStart(),
        split.getStart() + split.getLength(), split.getLength(),
        split.getLocations(), null);
  }

  /**
   * @param file the path of the file for that split
   * @param start the start offset in the file
   * @param end the end offset in the file
   * @param length the actual size in bytes that we expect to read
   * @param hosts the hosts with the replicas of this data
   * @param rowGroupOffsets the offsets of the row groups selected if loaded on the client
   */
  public ParquetInputSplit(
      Path file, long start, long end, long length, String[] hosts,
      long[] rowGroupOffsets) {
    super(file, start, length, hosts);
    this.end = end;
    this.rowGroupOffsets = rowGroupOffsets;
  }

  /**
   * @return the requested schema
   * @deprecated the file footer is no longer read before creating input splits
   */
  @Deprecated
  String getRequestedSchema() {
    throw new UnsupportedOperationException(
        "Splits no longer have the requested schema, see PARQUET-234");
  }

  /**
   * @return the file schema
   * @deprecated the file footer is no longer read before creating input splits
   */
  @Deprecated
  public String getFileSchema() {
    throw new UnsupportedOperationException(
        "Splits no longer have the file schema, see PARQUET-234");
  }

  /**
   * @return the end offset of that split
   */
  public long getEnd() {
    return end;
  }

  /**
   * @return app specific metadata from the file
   * @deprecated the file footer is no longer read before creating input splits
   */
  @Deprecated
  public Map<String, String> getExtraMetadata() {
    throw new UnsupportedOperationException(
        "Splits no longer have file metadata, see PARQUET-234");
  }

  /**
   * @return app specific metadata provided by the read support in the init phase
   * @deprecated the file footer is no longer read before creating input splits
   */
  @Deprecated
  Map<String, String> getReadSupportMetadata() {
    throw new UnsupportedOperationException(
        "Splits no longer have read-support metadata, see PARQUET-234");
  }

  /**
   * @return the offsets of the row groups selected if this has been determined on the client side
   */
  public long[] getRowGroupOffsets() {
    return rowGroupOffsets;
  }

  @Override
  public String toString() {
    String hosts;
    try {
      hosts = Arrays.toString(getLocations());
    } catch (Exception e) {
      // IOException/InterruptedException could be thrown
      hosts = "(" + e + ")";
    }
    return this.getClass().getSimpleName() + "{" +
        "part: " + getPath()
        + " start: " + getStart()
        + " end: " + getEnd()
        + " length: " + getLength()
        + " hosts: " + hosts
        + (rowGroupOffsets == null ?
"" : (" row groups: " + Arrays.toString(rowGroupOffsets))) + "}"; } /** * {@inheritDoc} */ @Override public void readFields(DataInput hin) throws IOException { byte[] bytes = readArray(hin); DataInputStream in = new DataInputStream(new GZIPInputStream(new ByteArrayInputStream(bytes))); super.readFields(in); this.end = in.readLong(); if (in.readBoolean()) { this.rowGroupOffsets = new long[in.readInt()]; for (int i = 0; i < rowGroupOffsets.length; i++) { rowGroupOffsets[i] = in.readLong(); } } in.close(); } /** * {@inheritDoc} */ @Override public void write(DataOutput hout) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream out = new DataOutputStream(new GZIPOutputStream(baos)); super.write(out); out.writeLong(end); out.writeBoolean(rowGroupOffsets != null); if (rowGroupOffsets != null) { out.writeInt(rowGroupOffsets.length); for (long o : rowGroupOffsets) { out.writeLong(o); } } out.close(); writeArray(hout, baos.toByteArray()); } private static void writeArray(DataOutput out, byte[] bytes) throws IOException { out.writeInt(bytes.length); out.write(bytes, 0, bytes.length); } private static byte[] readArray(DataInput in) throws IOException { int len = in.readInt(); byte[] bytes = new byte[len]; in.readFully(bytes); return bytes; } }