/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.orc;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.ColumnarSplit;
import org.apache.hadoop.hive.ql.io.LlapAwareSplit;
import org.apache.hadoop.hive.ql.io.SyntheticFileId;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.orc.OrcProto;
import org.apache.orc.impl.OrcTail;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * OrcSplit. Holds ORC file meta info.
 */
public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit {
  private static final Logger LOG = LoggerFactory.getLogger(OrcSplit.class);

  private OrcTail orcTail;
  private boolean hasFooter;
  private boolean isOriginal;
  private boolean hasBase;
  private final List<AcidInputFormat.DeltaMetaData> deltas = new ArrayList<>();
  private long projColsUncompressedSize;
  private transient Object fileKey;
  private long fileLen;

  static final int HAS_SYNTHETIC_FILEID_FLAG = 16;
  static final int HAS_LONG_FILEID_FLAG = 8;
  static final int BASE_FLAG = 4;
  static final int ORIGINAL_FLAG = 2;
  static final int FOOTER_FLAG = 1;

  protected OrcSplit() {
    // The FileSplit() constructor in Hadoop 0.20 and 1.x is package private, so it can't be
    // used here. This constructor is used to create the object and then call readFields(),
    // so just pass nulls to the super constructor.
    super(null, 0, 0, (String[]) null);
  }

  public OrcSplit(Path path, Object fileId, long offset, long length, String[] hosts,
      OrcTail orcTail, boolean isOriginal, boolean hasBase,
      List<AcidInputFormat.DeltaMetaData> deltas, long projectedDataSize, long fileLen) {
    super(path, offset, length, hosts);
    // For HDFS, we could avoid serializing the file ID and just replace the path with an
    // inode-based path. However, that breaks a bunch of things because Hive later looks up
    // things by split path.
    this.fileKey = fileId;
    this.orcTail = orcTail;
    hasFooter = this.orcTail != null;
    this.isOriginal = isOriginal;
    this.hasBase = hasBase;
    this.deltas.addAll(deltas);
    this.projColsUncompressedSize = projectedDataSize <= 0 ? length : projectedDataSize;
    // Setting the file length to Long.MAX_VALUE makes the ORC reader fetch the actual
    // length from the file system.
    this.fileLen = fileLen <= 0 ? Long.MAX_VALUE : fileLen;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    DataOutputStream dos = new DataOutputStream(bos);
    // Serialize path, offset and length using FileSplit.
    super.write(dos);
    int required = bos.size();

    // Write the additional payload required for ORC.
    writeAdditionalPayload(dos);
    int additional = bos.size() - required;

    out.write(bos.toByteArray());
    if (LOG.isTraceEnabled()) {
      LOG.trace("Writing additional {} bytes to OrcSplit as payload. Required {} bytes.",
          additional, required);
    }
  }
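
  /*
   * On-the-wire layout produced by write()/writeAdditionalPayload() and consumed by
   * readFields(), summarized from the code below (a reading aid, not a normative spec):
   *
   *   FileSplit fields                      - path, offset, length (FileSplit.write())
   *   flags (1 byte)                        - bitwise OR of the *_FLAG constants above
   *   delta count (int)                     - followed by that many DeltaMetaData entries
   *   [tail length (vint) + tail bytes]     - only present if FOOTER_FLAG is set
   *   [file key (long or SyntheticFileId)]  - only present if a file ID flag is set
   *   file length (long)
   */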
  private void writeAdditionalPayload(final DataOutputStream out) throws IOException {
    boolean isFileIdLong = fileKey instanceof Long, isFileIdWritable = fileKey instanceof Writable;
    int flags = (hasBase ? BASE_FLAG : 0) |
        (isOriginal ? ORIGINAL_FLAG : 0) |
        (hasFooter ? FOOTER_FLAG : 0) |
        (isFileIdLong ? HAS_LONG_FILEID_FLAG : 0) |
        (isFileIdWritable ? HAS_SYNTHETIC_FILEID_FLAG : 0);
    out.writeByte(flags);
    out.writeInt(deltas.size());
    for (AcidInputFormat.DeltaMetaData delta : deltas) {
      delta.write(out);
    }
    if (hasFooter) {
      OrcProto.FileTail fileTail = orcTail.getMinimalFileTail();
      byte[] tailBuffer = fileTail.toByteArray();
      int tailLen = tailBuffer.length;
      WritableUtils.writeVInt(out, tailLen);
      out.write(tailBuffer);
    }
    if (isFileIdLong) {
      out.writeLong(((Long) fileKey).longValue());
    } else if (isFileIdWritable) {
      ((Writable) fileKey).write(out);
    }
    out.writeLong(fileLen);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    // Deserialize path, offset and length using FileSplit.
    super.readFields(in);
    byte flags = in.readByte();
    hasFooter = (FOOTER_FLAG & flags) != 0;
    isOriginal = (ORIGINAL_FLAG & flags) != 0;
    hasBase = (BASE_FLAG & flags) != 0;
    boolean hasLongFileId = (HAS_LONG_FILEID_FLAG & flags) != 0,
        hasWritableFileId = (HAS_SYNTHETIC_FILEID_FLAG & flags) != 0;
    if (hasLongFileId && hasWritableFileId) {
      throw new IOException("Invalid split - both file ID types present");
    }

    deltas.clear();
    int numDeltas = in.readInt();
    for (int i = 0; i < numDeltas; i++) {
      AcidInputFormat.DeltaMetaData dmd = new AcidInputFormat.DeltaMetaData();
      dmd.readFields(in);
      deltas.add(dmd);
    }
    if (hasFooter) {
      int tailLen = WritableUtils.readVInt(in);
      byte[] tailBuffer = new byte[tailLen];
      in.readFully(tailBuffer);
      OrcProto.FileTail fileTail = OrcProto.FileTail.parseFrom(tailBuffer);
      orcTail = new OrcTail(fileTail, null);
    }
    if (hasLongFileId) {
      fileKey = in.readLong();
    } else if (hasWritableFileId) {
      SyntheticFileId fileId = new SyntheticFileId();
      fileId.readFields(in);
      this.fileKey = fileId;
    }
    fileLen = in.readLong();
  }

  public OrcTail getOrcTail() {
    return orcTail;
  }

  public boolean hasFooter() {
    return hasFooter;
  }

  public boolean isOriginal() {
    return isOriginal;
  }

  public boolean hasBase() {
    return hasBase;
  }

  public List<AcidInputFormat.DeltaMetaData> getDeltas() {
    return deltas;
  }

  public long getFileLength() {
    return fileLen;
  }
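
  /*
   * Background for the ACID-related accessors above and isAcid() below: a Hive ACID
   * table directory holds an optional base directory plus delta directories, e.g.
   * (illustrative layout only):
   *
   *   warehouse/t/base_0000005/bucket_00000
   *   warehouse/t/delta_0000006_0000010/bucket_00000
   *
   * hasBase marks whether this split reads a base (or original) file, and deltas
   * carries the metadata of the delta directories to merge on read.
   */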
  /**
   * If this method returns true, the split is definitely ACID; if it returns false,
   * the split could be either ACID or non-ACID.
   * @return whether the split is known to be ACID
   */
  public boolean isAcid() {
    return hasBase || deltas.size() > 0;
  }

  public long getProjectedColumnsUncompressedSize() {
    return projColsUncompressedSize;
  }

  public Object getFileKey() {
    return fileKey;
  }

  @Override
  public long getColumnarProjectionSize() {
    return projColsUncompressedSize;
  }

  @Override
  public boolean canUseLlapIo() {
    return isOriginal && (deltas == null || deltas.isEmpty());
  }

  @Override
  public String toString() {
    return "OrcSplit [" + getPath() + ", start=" + getStart() + ", length=" + getLength()
        + ", isOriginal=" + isOriginal + ", fileLength=" + fileLen + ", hasFooter=" + hasFooter
        + ", hasBase=" + hasBase + ", deltas=" + (deltas == null ? 0 : deltas.size()) + "]";
  }
}
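
/*
 * A minimal serialization round-trip sketch (illustrative only; not part of this class).
 * It would have to live in this package since the no-arg constructor is protected, and it
 * assumes java.io.ByteArrayInputStream and java.io.DataInputStream are imported:
 *
 *   OrcSplit split = new OrcSplit(new Path("/tmp/example.orc"), null, 0, 1000,
 *       new String[0], null, true, false,
 *       new ArrayList<AcidInputFormat.DeltaMetaData>(), -1, 1000);
 *   ByteArrayOutputStream bos = new ByteArrayOutputStream();
 *   split.write(new DataOutputStream(bos));
 *
 *   OrcSplit copy = new OrcSplit();
 *   copy.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
 *   assert copy.isOriginal() && !copy.hasFooter();
 */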