/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.hadoop.zebra.io;

import java.io.Closeable;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.*;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.zebra.tfile.TFile;
import org.apache.hadoop.zebra.tfile.Utils;
import org.apache.hadoop.zebra.tfile.ByteArray;
import org.apache.hadoop.zebra.tfile.RawComparable;
import org.apache.hadoop.zebra.types.CGSchema;
import org.apache.hadoop.zebra.parser.ParseException;
import org.apache.hadoop.zebra.types.Partition;
import org.apache.hadoop.zebra.types.Projection;
import org.apache.hadoop.zebra.schema.Schema;
import org.apache.hadoop.zebra.types.TypesUtils;
import org.apache.hadoop.zebra.types.TypesUtils.TupleReader;
import org.apache.hadoop.zebra.types.TypesUtils.TupleWriter;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;

/**
 * ColumnGroup is the basic unit of a persistent table. The following
 * Configuration parameters can customize the behavior of ColumnGroup.
 * <ul>
 * <li><b>table.tfile.minblock.size</b> (int) Minimum compression block size
 * for the underlying TFile (defaults to 1024*1024).
 * <li><b>table.output.tfile.compression</b> (String) Compression method, one
 * of "none", "lzo", "gz" (defaults to "gz"); see
 * {@link TFile#getSupportedCompressionAlgorithms()}.
 * <li><b>table.input.split.minSize</b> (int) Minimum split size (defaults to
 * 64*1024).
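 * </ul>
 * <p>
 * For example, a minimal configuration sketch using the key names defined by
 * the constants in this class (the values shown are the defaults):
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * conf.set("table.output.tfile.compression", "gz");
 * conf.setInt("table.tfile.minblock.size", 1024 * 1024);
 * conf.setInt("table.input.split.minSize", 64 * 1024);
 * }</pre>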
 */
class ColumnGroup {
  static Log LOG = LogFactory.getLog(ColumnGroup.class);

  private final static String CONF_COMPRESS = "table.output.tfile.compression";
  private final static String DEFAULT_COMPRESS = "gz";
  private final static String CONF_MIN_BLOCK_SIZE = "table.tfile.minblock.size";
  private final static int DEFAULT_MIN_BLOCK_SIZE = 1024 * 1024;
  private final static String CONF_MIN_SPLIT_SIZE = "table.input.split.minSize";
  private final static int DEFAULT_MIN_SPLIT_SIZE = 64 * 1024;
  static final double SPLIT_SLOP = 1.1; // 10% slop

  // exclude files starting with the following prefix; may change to a regex
  private final static String CONF_NON_DATAFILE_PREFIX = "table.cg.nondatafile.prefix";
  private final static String SPECIAL_FILE_PREFIX = ".";

  // tmp schema file name, used as a flag of an unfinished CG
  private final static String SCHEMA_FILE = ".schema";
  // meta data TFile for the entire CG, used as a flag of a closed CG
  final static String META_FILE = ".meta";
  // sorted table key ranges for default sorted table split generation
  private final static String KEY_RANGE_FOR_DEFAULT_SORTED_SPLIT = ".keyrange";

  static final String BLOCK_NAME_INDEX = "ColumnGroup.index";

  static Path makeMetaFilePath(Path parent) {
    return new Path(parent, META_FILE);
  }

  static String getCompression(Configuration conf) {
    return conf.get(CONF_COMPRESS, DEFAULT_COMPRESS);
  }

  static int getMinBlockSize(Configuration conf) {
    return conf.getInt(CONF_MIN_BLOCK_SIZE, DEFAULT_MIN_BLOCK_SIZE);
  }

  static String getNonDataFilePrefix(Configuration conf) {
    return conf.get(CONF_NON_DATAFILE_PREFIX, SPECIAL_FILE_PREFIX);
  }

  static int getMinSplitSize(Configuration conf) {
    return conf.getInt(CONF_MIN_SPLIT_SIZE, DEFAULT_MIN_SPLIT_SIZE);
  }

  /**
   * Drop a Column Group; this deletes all files related to the Column Group
   * from the FileSystem.
   *
   * @param path
   *          the path to the ColumnGroup.
   * @param conf
   *          The configuration object.
   */
  public static void drop(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    fs.delete(path, true);
    // TODO:
    // fs.close();
  }

  /**
   * Scan the file system, looking for TFiles, and build an in-memory index of
   * the column group.
   *
   * @param fs
   *          The file system
   * @param path
   *          The base path of the column group.
   * @param dirty
   *          Whether to build a dirty index or not. A dirty index is built by
   *          only looking at file-level status and not opening up individual
   *          TFiles. The flag may only be set for unsorted ColumnGroups.
   * @param conf
   *          The configuration object.
   * @return The in-memory index object.
   * @throws IOException
   */
  static CGIndex buildIndex(FileSystem fs, Path path, boolean dirty,
      Configuration conf) throws IOException {
    CGIndex ret = new CGIndex();
    CGPathFilter cgPathFilter = new CGPathFilter();
    CGPathFilter.setConf(conf);
    FileStatus[] files = fs.listStatus(path, cgPathFilter);
    Comparator<RawComparable> comparator = null;
    for (FileStatus f : files) {
      if (dirty) {
        ret.add(f.getLen(), f.getPath().getName());
      }
      else {
        FSDataInputStream dis = null;
        TFile.Reader tr = null;
        try {
          dis = fs.open(f.getPath());
          tr = new TFile.Reader(dis, f.getLen(), conf);
          if (comparator == null) {
            comparator = tr.getComparator();
          }
          if (tr.getEntryCount() > 0) {
            CGIndexEntry range =
                new CGIndexEntry(f.getPath().getName(), tr.getEntryCount(),
                    tr.getFirstKey(), tr.getLastKey());
            ret.add(f.getLen(), tr.getEntryCount(), range);
          }
        }
        catch (IOException e) {
          // TODO: log the error, ignore incorrect TFiles.
          e.printStackTrace(System.err);
        }
        finally {
          if (tr != null) {
            tr.close();
          }
          if (dis != null) {
            dis.close();
          }
        }
      }
    }
    ret.sort(comparator);
    int idx = 0;
    for (CGIndexEntry e : ret.getIndex()) {
      e.setIndex(idx++);
    }
    return ret;
  }

  /**
   * ColumnGroup reader.
   */
  public static class Reader implements Closeable {
    Path path;
    Configuration conf;
    FileSystem fs;
    CGSchema cgschema;
    Comparator<RawComparable> comparator;
    Projection projection;
    CGIndex cgindex;
    ArrayList<SplitColumn> exec;
    SplitColumn top; // directly associated with logical schema
    SplitColumn leaf; // corresponding to projection
    boolean closed;
    boolean dirty;

    /**
     * Get the Column Group physical schema without loading the full CG index.
     *
     * @param path
     *          The path to the ColumnGroup.
     * @param conf
     *          The configuration object.
     * @return The ColumnGroup schema.
     * @throws IOException
     */
    public static Schema getSchema(Path path, Configuration conf)
        throws IOException, ParseException {
      FileSystem fs = path.getFileSystem(conf);
      CGSchema cgschema = CGSchema.load(fs, path);
      return cgschema.getSchema();
    }

    /**
     * Create a ColumnGroup reader.
     *
     * @param path
     *          The directory path to the column group.
     * @param conf
     *          Optional configuration parameters.
     * @throws IOException
     */
    public Reader(Path path, Configuration conf) throws IOException,
        ParseException {
      this(path, conf, false);
    }

    public Reader(Path path, Configuration conf, boolean mapper)
        throws IOException, ParseException {
      this(path, true, conf, mapper);
    }

    Reader(Path path, boolean dirty, Configuration conf) throws IOException,
        ParseException {
      this(path, dirty, conf, false);
    }

    Reader(Path path, boolean dirty, Configuration conf, boolean mapper)
        throws IOException, ParseException {
      this.path = path;
      this.conf = conf;
      this.dirty = dirty;
      fs = path.getFileSystem(conf);
      // check existence of path
      if (!fs.exists(path)) {
        throw new IOException("Path doesn't exist: " + path);
      }
      if (!mapper && !fs.getFileStatus(path).isDir()) {
        throw new IOException("Path exists but not a directory: " + path);
      }
      cgschema = CGSchema.load(fs, path);
      if (cgschema.isSorted()) {
        comparator = TFile.makeComparator(cgschema.getComparator());
      }
      projection = new Projection(cgschema.getSchema()); // default projection to CG schema.
      Path metaFilePath = makeMetaFilePath(path);
      // if the index meta file does not exist, the CG is incomplete
      if (!fs.exists(metaFilePath)) {
        throw new FileNotFoundException("Missing meta file: " + metaFilePath);
      }
      else if (cgschema.isSorted()) {
        MetaFile.Reader metaFile = MetaFile.createReader(metaFilePath, conf);
        try {
          cgindex = new CGIndex();
          DataInputStream dis = metaFile.getMetaBlock(BLOCK_NAME_INDEX);
          try {
            cgindex.readFields(dis);
          } catch (IOException e) {
            throw new IOException("Index file read failure: " + e.getMessage());
          } finally {
            dis.close();
          }
        } finally {
          metaFile.close();
        }
      }
    }

    /**
     * Set the projection for the reader. This will affect calls to
     * getScanner(), getStatus(), and getColumnNames().
     *
     * @param projection
     *          The projection on the column group for subsequent read
     *          operations. To select all columns, pass projection == null.
     */
    public synchronized void setProjection(String projection)
        throws ParseException {
      if (projection == null) {
        this.projection = new Projection(cgschema.getSchema());
      }
      else {
        this.projection = new Projection(cgschema.getSchema(), projection);
      }
    }

    /**
     * Get the schema of columns of the table (possibly through projection).
     *
     * @return Schema of the columns of the table (possibly through projection).
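     *         <p>
     *         For instance (a sketch; the column names here are
     *         hypothetical): after calling
     *         {@code setProjection("FirstName, LastName")}, the returned
     *         schema contains only those two columns rather than the full
     *         ColumnGroup schema.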
*/ public Schema getSchema() throws ParseException { return projection.getSchema(); } /** * Get the projection * @return Projection of this Reader */ public Projection getProjection() { return projection; } public String getName() { return cgschema.getName(); } public String getSerializer() { return cgschema.getSerializer(); } public String getCompressor() { return cgschema.getCompressor(); } public CGSchema getCGSchema() { return cgschema; } public String getGroup() { return cgschema.getGroup(); } public short getPerm() { return cgschema.getPerm(); } /** * Get a scanner that reads all rows whose row keys fall in a specific * range. * * @param beginKey * The begin key of the scan range. * @param endKey * The end key of the scan range. * @param closeReader * close the underlying Reader object when we close the scanner. * Should be set to true if we have only one scanner on top of the * reader, so that we should release resources after the scanner is * closed. * @return A scanner object. * @throws IOException */ public synchronized CGScanner getScanner(BytesWritable beginKey, BytesWritable endKey, boolean closeReader) throws IOException, ParseException { if (closed) { throw new EOFException("Reader already closed"); } if (!isSorted()) { throw new IOException( "Cannot get key-bounded scanner for unsorted table"); } RawComparable begin = (beginKey != null) ? new ByteArray(beginKey.getBytes(), 0, beginKey .getLength()) : null; RawComparable end = (endKey != null) ? new ByteArray(endKey.getBytes(), 0, endKey.getLength()) : null; if (begin != null && end != null) { if (comparator.compare(begin, end) >= 0) { throw new IOException("Zero-key-range split"); } } return new CGScanner(begin, end, closeReader); } /** * Get a scanner that reads a consecutive number of rows as defined in the * CGRangeSplit object, which should be obtained from previous calls of * rangeSplit(). * * @param split * The split range. If null, get a scanner to read the complete * column group. * @param closeReader * close the underlying Reader object when we close the scanner. * Should be set to true if we have only one scanner on top of the * reader, so that we should release resources after the scanner is * closed. * @return A scanner object. * @throws IOException */ public synchronized CGScanner getScanner(CGRangeSplit split, boolean closeReader) throws IOException, ParseException { if (closed) { throw new EOFException("Reader already closed"); } if (split == null) { if (cgindex == null) cgindex = buildIndex(fs, path, dirty, conf); return getScanner(new CGRangeSplit(0, cgindex.size()), closeReader); } if (split.len < 0) { throw new IllegalArgumentException("Illegal range split"); } return new CGScanner(split, closeReader); } /** * Get a scanner that reads the rows defined by rowRange. * * @param closeReader * close the underlying Reader object when we close the scanner. * Should be set to true if we have only one scanner on top of the * reader, so that we should release resources after the scanner is * closed. * @param rowSplit specifies part index, start row, and end row. * @return A scanner object. */ public synchronized CGScanner getScanner(boolean closeReader, CGRowSplit rowSplit) throws IOException, ParseException { if (closed) { throw new EOFException("Reader already closed"); } return new CGScanner(rowSplit, closeReader); } /** * Given a split range, calculate how the file data that fall into the range * are distributed among hosts. * * @param split * The range-based split. If null, return all blocks. 
     * @return a map from host name to the amount of data (in bytes) the host
     *         owns that falls roughly into the key range.
     */
    public BlockDistribution getBlockDistribution(CGRangeSplit split)
        throws IOException {
      // Build the index before the null-split branch: that branch needs
      // cgindex.size() and would otherwise risk an NPE on a lazily-built
      // index.
      if (cgindex == null)
        cgindex = buildIndex(fs, path, dirty, conf);
      if (split == null) {
        return getBlockDistribution(new CGRangeSplit(0, cgindex.size()));
      }

      if ((split.start | split.len | (cgindex.size() - split.start - split.len)) < 0) {
        throw new IndexOutOfBoundsException("Bad split");
      }

      BlockDistribution ret = new BlockDistribution();
      for (int i = split.start; i < split.start + split.len; ++i) {
        CGIndexEntry dfkr = cgindex.get(i);
        Path tfilePath = new Path(path, dfkr.getName());
        FileStatus tfileStatus = fs.getFileStatus(tfilePath);
        BlockLocation[] locations =
            fs.getFileBlockLocations(tfileStatus, 0, tfileStatus.getLen());
        for (BlockLocation l : locations) {
          ret.add(l);
        }
      }
      return ret;
    }

    /**
     * Given a row range, calculate how the file data that fall into the range
     * are distributed among hosts.
     *
     * @param split
     *          The row-based split. May not be null.
     * @return a map from host name to the amount of data (in bytes) the host
     *         owns that falls roughly into the key range.
     */
    public BlockDistribution getBlockDistribution(CGRowSplit split)
        throws IOException {
      if (split == null) {
        throw new IOException(
            "Row-based split cannot be null for getBlockDistribution()");
      }

      BlockDistribution ret = new BlockDistribution();
      for (int i = 0; i < split.length; i++) {
        FileStatus tfileStatus = fs.getFileStatus(new Path(path, split.names[i]));
        BlockLocation[] locations = null;
        long len = 0;
        if (i == 0) {
          if (split.startByteFirst != -1) {
            len = split.numBytesFirst;
            locations = fs.getFileBlockLocations(tfileStatus, split.startByteFirst, len);
          }
        } else if (i == split.length - 1) {
          if (split.numBytesLast != -1) {
            len = split.numBytesLast;
            locations = fs.getFileBlockLocations(tfileStatus, 0, len);
          }
        }
        if (locations == null) {
          len = tfileStatus.getLen();
          locations = fs.getFileBlockLocations(tfileStatus, 0, len);
        }
        for (BlockLocation l : locations) {
          ret.add(l);
        }
      }
      return ret;
    }

    private int getStartBlockIndex(long[] startOffsets, long offset) {
      int index = Arrays.binarySearch(startOffsets, offset);
      if (index < 0)
        index = -index - 2;
      return index;
    }

    private int getEndBlockIndex(long[] startOffsets, long offset) {
      int index = Arrays.binarySearch(startOffsets, offset);
      if (index < 0)
        index = -index - 1;
      return index;
    }

    /**
     * Sets the start row and the number of rows in rowSplit, based on the
     * start offset and length of the byte range.
     *
     * It is assumed that 'startByte' and 'numBytes' in rowSplit itself are
     * not valid on entry.
     */
    void fillRowSplit(CGRowSplit rowSplit, CGRowSplit src) throws IOException {
      if (src.names == null || src.length == 0)
        return;

      boolean noSizeInIndex = false;
      long[] sizes = rowSplit.sizes;
      if (sizes == null) {
        /* Sizes are missing because the on-disk table is sorted. Later this
         * will be made unnecessary when CGIndexEntry serializes its bytes
         * field and meta file versioning is supported.
         */
        noSizeInIndex = true;
      }
      rowSplit.names = src.names;
      rowSplit.length = src.length;
      rowSplit.startByteFirst = src.startByteFirst;
      rowSplit.numBytesFirst = src.numBytesFirst;
      rowSplit.numBytesLast = src.numBytesLast;

      Path firstPath = null, lastPath;
      TFile.Reader reader = null;
      if (src.startByteFirst != -1) {
        firstPath = new Path(path, rowSplit.names[0]);
        long size;
        if (noSizeInIndex) {
          FileStatus tfile = fs.getFileStatus(firstPath);
          size = tfile.getLen();
        } else
          size = sizes[0];
        reader = new TFile.Reader(fs.open(firstPath), size, conf);
        try {
          long startRow = reader.getRecordNumNear(src.startByteFirst);
          long endRow = reader.getRecordNumNear(src.startByteFirst + src.numBytesFirst);
          if (endRow < startRow)
            endRow = startRow;
          rowSplit.startRowFirst = startRow;
          rowSplit.numRowsFirst = endRow - startRow;
        } catch (IOException e) {
          reader.close();
          throw e;
        }
      }

      if (src.numBytesLast != -1 && rowSplit.length > 1) {
        lastPath = new Path(path, rowSplit.names[rowSplit.length - 1]);
        if (reader == null || !firstPath.equals(lastPath)) {
          if (reader != null)
            reader.close();
          long size;
          if (noSizeInIndex) {
            FileStatus tfile = fs.getFileStatus(lastPath);
            size = tfile.getLen();
          } else
            size = sizes[rowSplit.length - 1];
          reader = new TFile.Reader(fs.open(lastPath), size, conf);
        }
        try {
          long endRow = reader.getRecordNumNear(src.numBytesLast);
          rowSplit.numRowsLast = endRow;
        } catch (IOException e) {
          reader.close();
          throw e;
        }
      }

      if (reader != null)
        reader.close();
    }

    /**
     * Get a sampling of keys and calculate how data are distributed among
     * key-partitioned buckets. The implementation attempts to calculate all
     * information in one shot to avoid reading the TFile index multiple
     * times. Special care is also taken so that the memory requirement is
     * not linear in the size of the total data set of the column group.
     *
     * @param n
     *          Targeted size of the sampling.
     * @param nTables
     *          Number of tables in a union.
     * @return KeyDistribution object.
     * @throws IOException
     */
    public KeyDistribution getKeyDistribution(int n, int nTables,
        BlockDistribution lastBd) throws IOException {
      // TODO: is a similar capability needed for unsorted CGs as for sorted
      // CGs?
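      // Overview: the sampling below walks each TFile in order, carves it
      // into roughly step-sized byte ranges aligned to HDFS block
      // boundaries, samples a key near the end of each range via
      // getKeyNear(), and accumulates per-host byte counts for the covered
      // blocks into a BlockDistribution bucket per sampled key.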
if (!isSorted()) { throw new IOException("Cannot get key distribution for unsorted table"); } KeyDistribution ret = new KeyDistribution(comparator); if (n < 0) { /* Path keyRangeFile = new Path(path, KEY_RANGE_FOR_DEFAULT_SORTED_SPLIT); if (fs.exists(keyRangeFile)) { try { FSDataInputStream ins = fs.open(keyRangeFile); long minStepSize = ins.readLong(); int size = ins.readInt(); for (int i = 0; i < size; i++) { BytesWritable keyIn = new BytesWritable(); keyIn.readFields(ins); ByteArray key = new ByteArray(keyIn.getBytes()); ret.add(key); } ret.setMinStepSize(minStepSize); return ret; } catch (Exception e) { // no-op } } */ n = 1; } Path[] paths = new Path[cgindex.size()]; FileStatus[] tfileStatus = new FileStatus[paths.length]; long totalBytes = 0; for (int i = 0; i < paths.length; ++i) { paths[i] = cgindex.getPath(i, path); tfileStatus[i] = fs.getFileStatus(paths[i]); totalBytes += tfileStatus[i].getLen(); } final long minSize = getMinSplitSize(conf); final long EPSILON = (long) (minSize * (SPLIT_SLOP - 1)); long goalSize = totalBytes / n; long batchSize = 0; BlockDistribution bd = new BlockDistribution();; RawComparable prevKey = null; long minStepSize = -1; FSDataInputStream nextFsdis = null; TFile.Reader nextReader = null; for (int i = 0; i < paths.length; ++i) { FileStatus fstatus = tfileStatus[i]; long blkSize = fstatus.getBlockSize(); long fileLen = fstatus.getLen(); long stepSize = Math.max(minSize, (goalSize < blkSize) ? goalSize : blkSize); if (minStepSize== -1 || minStepSize > stepSize) minStepSize = stepSize; // adjust the block size by the scaling factor blkSize /= nTables; stepSize = Math.max(minSize, (goalSize < blkSize) ? goalSize : blkSize); FSDataInputStream fsdis = null; TFile.Reader reader = null; long remainLen = fileLen; try { if (nextReader == null) { fsdis = fs.open(paths[i]); reader = new TFile.Reader(fsdis, fileLen, conf); } else { fsdis = nextFsdis; reader = nextReader; } BlockLocation[] locations = fs.getFileBlockLocations(fstatus, 0, fileLen); if (locations.length == 0) { throw new AssertionError( "getFileBlockLocations returns 0 location"); } Arrays.sort(locations, new Comparator<BlockLocation>() { @Override public int compare(BlockLocation o1, BlockLocation o2) { long diff = o1.getOffset() - o2.getOffset(); if (diff < 0) return -1; if (diff > 0) return 1; return 0; } }); long[] startOffsets = new long[locations.length]; for (int ii = 0; ii < locations.length; ii++) startOffsets[ii] = locations[ii].getOffset(); boolean done = false; while ((remainLen > 0) && !done) { long splitBytes = remainLen > stepSize ? stepSize : remainLen; long offsetBegin = fileLen - remainLen; long offsetEnd = offsetBegin + splitBytes; int indexBegin = getStartBlockIndex(startOffsets, offsetBegin); int indexEnd = getEndBlockIndex(startOffsets, offsetEnd); BlockLocation firstBlock = locations[indexBegin]; BlockLocation lastBlock = locations[indexEnd-1]; long lastBlockOffsetBegin = lastBlock.getOffset(); long lastBlockOffsetEnd = lastBlockOffsetBegin + lastBlock.getLength(); if ((firstBlock.getOffset() > offsetBegin) || (lastBlockOffsetEnd < offsetEnd)) { throw new AssertionError( "Block locations returned by getFileBlockLocations do not cover requested range"); } // Adjust offsets if ((offsetEnd > lastBlockOffsetBegin) && (offsetEnd - lastBlockOffsetBegin < EPSILON)) { // the split includes a bit of the next block, remove it. 
if (offsetEnd != fileLen) { // only if this is not the last chunk offsetEnd = lastBlockOffsetBegin; splitBytes = offsetEnd - offsetBegin; indexEnd--; } } else if ((lastBlockOffsetEnd > offsetEnd) && (lastBlockOffsetEnd - offsetEnd < EPSILON)) { // the split includes almost the whole block, fill it. offsetEnd = lastBlockOffsetEnd; splitBytes = offsetEnd - offsetBegin; } RawComparable key = reader.getKeyNear(offsetEnd); if (key == null) { offsetEnd = fileLen; splitBytes = offsetEnd - offsetBegin; if (i < paths.length-1) { nextFsdis = fs.open(paths[i+1]); nextReader = new TFile.Reader(nextFsdis, tfileStatus[i+1].getLen(), conf); key = nextReader.getFirstKey(); } done = true; // TFile index too large? Is it necessary now? } remainLen -= splitBytes; batchSize += splitBytes; if (key != null && batchSize >= stepSize) { if (batchSize - splitBytes < EPSILON || splitBytes < EPSILON) { // the last chunk or this chunk is small enough to create a new range for this key setBlockDistribution(bd, reader, locations, fstatus, startOffsets, prevKey, key); ret.add(key, bd); batchSize = 0; bd = new BlockDistribution(); } else { ret.add(prevKey, bd); batchSize = splitBytes; bd = new BlockDistribution(); if (batchSize >= stepSize) { setBlockDistribution(bd, reader, locations, fstatus, startOffsets, prevKey, key); ret.add(key, bd); batchSize = 0; bd = new BlockDistribution(); } else { setBlockDistribution(bd, reader, locations, fstatus, startOffsets, prevKey, key); } } } else { setBlockDistribution(bd, reader, locations, fstatus, startOffsets, prevKey, key); } prevKey = key; } } finally { if (reader != null) { try { reader.close(); } catch (Exception e) { // no-op; } } if (fsdis != null) { try { fsdis.close(); } catch (Exception e) { // no-op } } } } if (lastBd != null) lastBd.add(bd); ret.setMinStepSize(minStepSize); return ret; } private void setBlockDistribution(BlockDistribution bd, TFile.Reader reader, BlockLocation[] locations, FileStatus fileStatus, long[] startOffsets, RawComparable begin, RawComparable end) throws IOException { long beginOffset, endOffset = -1; if (begin == null) beginOffset = 0; else beginOffset = reader.getOffsetForKey(begin); if (end != null) { if (begin == null) begin = reader.getFirstKey(); /* Only if the key range is empty. This is needed because TFile has a 16-byte * Magic that causes getOffsetForKey to return 16 (not 0) even on the first key. */ if (comparator.compare(begin, end) != 0) endOffset = reader.getOffsetForKey(end); } int startBlockIndex = (beginOffset == 0 ? 0 : getStartBlockIndex(startOffsets, beginOffset)); BlockLocation l; int endBlockIndex = (end == null ? locations.length : endOffset == -1 ? startBlockIndex : getEndBlockIndex(startOffsets, endOffset)); for (int ii = startBlockIndex; ii < endBlockIndex; ii++) { l = locations[ii]; long blkBeginOffset = l.getOffset(); long blkEndOffset = blkBeginOffset + l.getLength(); if (blkEndOffset > blkBeginOffset) { bd.add(l, blkEndOffset - blkBeginOffset); } } return; } /** * Get the status of the ColumnGroup. */ public BasicTableStatus getStatus() throws IOException { if (cgindex == null) cgindex = buildIndex(fs, path, dirty, conf); return cgindex.status; } /** * Split the ColumnGroup by file orders. * * @param n * Targeted number of partitions. * @return A list of range-based splits, whose size may be less than or * equal to n. 
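     * <p>
     * For example (a sketch): with 10 files and {@code n == 3}, the returned
     * splits cover file indices [0, 3), [3, 6), and [6, 10), since each split
     * boundary is computed as {@code endIndex = (i + 1) * numFiles / n}.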
*/ public List<CGRangeSplit> rangeSplit(int n) throws IOException { // The output of this method must be only dependent on the cgindex and // input parameter n - so that horizontally stitched column groups will // get aligned splits. if (cgindex == null) cgindex = buildIndex(fs, path, dirty, conf); int numFiles = cgindex.size(); if ((numFiles < n) || (n < 0)) { return rangeSplit(numFiles); } List<CGRangeSplit> lst = new ArrayList<CGRangeSplit>(); int beginIndex = 0; for (int i = 0; i < n; ++i) { int endIndex = (int) ((long) (i + 1) * numFiles / n); lst.add(new CGRangeSplit(beginIndex, endIndex - beginIndex)); beginIndex = endIndex; } return lst; } /** * We already use FileInputFormat to create byte offset-based input splits. * Their information is encoded in starts, lengths and paths. This method is * to wrap this information to form CGRowSplit objects at column group level. * * @param starts array of starting byte of fileSplits. * @param lengths array of length of fileSplits. * @param paths array of path of fileSplits. * @return A list of CGRowSplit objects. * */ public List<CGRowSplit> rowSplit(long[] starts, long[] lengths, Path[] paths, int[] batches, int numBatches) throws IOException { List<CGRowSplit> lst = new ArrayList<CGRowSplit>(); CGRowSplit cgRowSplit; long startFirst, bytesFirst, bytesLast; int length; if (numBatches == 0) { cgRowSplit = new CGRowSplit(null, null, 0, -1, 0, 0); lst.add(cgRowSplit); return lst; } if (cgindex == null) cgindex = buildIndex(fs, this.path, dirty, conf); if (cgindex.size() == 0) { cgRowSplit = new CGRowSplit(null, null, 0, -1, 0, 0); lst.add(cgRowSplit); return lst; } for (int i=0; i< numBatches; i++) { int indexFirst = batches[i]; int indexLast = batches[i+1] - 1; startFirst = starts[indexFirst]; bytesFirst = lengths[indexFirst]; bytesLast = lengths[indexLast]; length = batches[i+1] - batches[i]; String[] namesInSplit = new String[length]; long[] sizesInSplit = new long[length]; for (int j = 0; j < length; j++) { namesInSplit[j] = paths[indexFirst+j].getName(); sizesInSplit[j] = cgindex.get(cgindex.getFileIndex(paths[indexFirst+j])).bytes; } cgRowSplit = new CGRowSplit(namesInSplit, sizesInSplit, length, startFirst, bytesFirst, bytesLast); lst.add(cgRowSplit); } return lst; } void rearrangeFileIndices(FileStatus[] fileStatus) throws IOException { int size = fileStatus.length; FileStatus[] result = new FileStatus[size]; if (cgindex == null) cgindex = buildIndex(fs, path, dirty, conf); if (size < cgindex.size()) throw new AssertionError("Incorrect file list size"); for (int j, i = 0; i < size; i++) { j = cgindex.getFileIndex(fileStatus[i].getPath()); if (j != -1) result[j] = fileStatus[i]; } for (int i = 0; i < size; i++) fileStatus[i] = result[i]; } /** * Is the ColumnGroup sorted? * * @return Whether the ColumnGroup is sorted. */ public boolean isSorted() { return cgschema.isSorted(); } @Override public void close() throws IOException { if (!closed) { closed = true; } } /** * A simple wrapper class over TFile.Reader.Scanner to simplify the creation * and resource management. 
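     * <p>
     * The wrapper owns the input stream, the TFile reader, and the scanner
     * it creates, and releases all three in {@link #close()}.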
*/ static class TFileScanner implements Closeable { boolean closed = true; FSDataInputStream ins; TFile.Reader reader; TFile.Reader.Scanner scanner; TupleReader tupleReader; TFileScanner(FileSystem fs, Path path, CGRowSplit rowRange, RawComparable begin, RawComparable end, boolean first, boolean last, CGSchema cgschema, Projection projection, Configuration conf) throws IOException, ParseException { try { ins = fs.open(path); /* * compressor is inside cgschema */ reader = new TFile.Reader(ins, fs.getFileStatus(path).getLen(), conf); if (rowRange != null && rowRange.startByteFirst != -1) { if (first && rowRange.startByteFirst != -1) scanner = reader.createScannerByRecordNum(rowRange.startRowFirst, rowRange.startRowFirst + rowRange.numRowsFirst); else if (last && rowRange.numBytesLast != -1) scanner = reader.createScannerByRecordNum(0, rowRange.numRowsLast); else scanner = reader.createScanner(); } else { /* TODO: more investigation is needed for the following. * using deprecated API just so that zebra can work with * hadoop jar that does not contain HADOOP-6218 (Record ids for * TFile). This is expected to be temporary. Later we should * use the undeprecated API. */ scanner = reader.createScanner(begin, end); } /* * serializer is inside cgschema: different serializer will require * different Reader: for pig, it's TupleReader */ tupleReader = new TupleReader(cgschema.getSchema(), projection); closed = false; } finally { if (closed == true) { // failed to instantiate the object. if (scanner != null) { try { scanner.close(); } catch (Exception e) { // no-op } } if (reader != null) { try { reader.close(); } catch (Exception e) { // no op } } if (ins != null) { try { ins.close(); } catch (Exception e) { // no op } } } } } void rewind() throws IOException { scanner.rewind(); } void getKey(BytesWritable key) throws IOException { scanner.entry().getKey(key); } void getValue(Tuple val) throws IOException, ParseException { DataInputStream dis = scanner.entry().getValueStream(); try { tupleReader.get(dis, val); } finally { dis.close(); } } boolean seekTo(BytesWritable key) throws IOException { return scanner.seekTo(key.getBytes(), 0, key.getLength()); } boolean advance() throws IOException { return scanner.advance(); } boolean atEnd() { return scanner.atEnd(); } void seekToEnd() throws IOException { scanner.seekToEnd(); } @Override public void close() throws IOException { if (!closed) { closed = true; try { scanner.close(); } catch (Exception e) { // no op } try { reader.close(); } catch (Exception e) { // no op } try { ins.close(); } catch (Exception e) { // no op } } } } /** * ColumnGroup scanner */ class CGScanner implements TableScanner { private Projection logicalSchema = null; private TFileScannerInfo[] scanners; private boolean closeReader; private int beginIndex, endIndex; private int current; // current scanner private boolean scannerClosed = true; private CGRowSplit rowRange; private TFileScanner scanner; private class TFileScannerInfo { boolean first, last; Path path; RawComparable begin, end; TFileScannerInfo(boolean first, boolean last, Path path, RawComparable begin, RawComparable end) { this.first = first; this.last = last; this.begin = begin; this.end = end; this.path = path; } TFileScanner getTFileScanner() throws IOException { try { return new TFileScanner(fs, path, rowRange, begin, end, first, last, cgschema, logicalSchema, conf); } catch (ParseException e) { throw new IOException(e.getMessage()); } } } CGScanner(CGRangeSplit split, boolean closeReader) throws IOException, 
ParseException { if (cgindex== null) cgindex = buildIndex(fs, path, dirty, conf); if (split == null) { beginIndex = 0; endIndex = cgindex.size(); } else { beginIndex = split.start; endIndex = split.start + split.len; } init(null, null, null, closeReader); } /** * Scanner for a range specified by the given row range. * * @param rowRange see {@link CGRowSplit} * @param closeReader */ CGScanner(CGRowSplit rowRange, boolean closeReader) throws IOException, ParseException { beginIndex = 0; endIndex = rowRange.length; init(rowRange, null, null, closeReader); } CGScanner(RawComparable beginKey, RawComparable endKey, boolean closeReader) throws IOException, ParseException { beginIndex = 0; endIndex = cgindex.size(); if (beginKey != null) { beginIndex = cgindex.lowerBound(beginKey, comparator); } if (endKey != null) { endIndex = cgindex.lowerBound(endKey, comparator); if (endIndex < cgindex.size()) { ++endIndex; } } init(null, beginKey, endKey, closeReader); } private void init(CGRowSplit rowRange, RawComparable beginKey, RawComparable endKey, boolean doClose) throws IOException, ParseException { this.rowRange = rowRange; if (beginIndex > endIndex) { throw new IllegalArgumentException("beginIndex > endIndex"); } logicalSchema = ColumnGroup.Reader.this.getProjection(); List<TFileScannerInfo> tmpScanners = new ArrayList<TFileScannerInfo>(endIndex - beginIndex); try { boolean first, last, realFirst = true; Path myPath; for (int i = beginIndex; i < endIndex; ++i) { first = (i == beginIndex); last = (i == endIndex -1); RawComparable begin = first ? beginKey : null; RawComparable end = last ? endKey : null; TFileScannerInfo scanner; if (rowRange == null) myPath = cgindex.getPath(i, path); else myPath = new Path(path, rowRange.names[i]); scanner = new TFileScannerInfo(first, last, myPath, begin, end); if (realFirst) { this.scanner = scanner.getTFileScanner(); if (this.scanner.atEnd()) { this.scanner.close(); this.scanner = null; } else { realFirst = false; tmpScanners.add(scanner); } } else { TFileScanner myScanner = scanner.getTFileScanner(); if (!myScanner.atEnd()) tmpScanners.add(scanner); myScanner.close(); } } scanners = tmpScanners.toArray(new TFileScannerInfo[tmpScanners.size()]); this.closeReader = doClose; scannerClosed = false; } finally { if (scannerClosed) { // failed to initialize the object. 
            if (scanner != null) scanner.close();
          }
        }
      }

      @Override
      public void getKey(BytesWritable key) throws IOException {
        if (atEnd()) {
          throw new EOFException("No more key-value to read");
        }
        scanner.getKey(key);
      }

      @Override
      public void getValue(Tuple row) throws IOException {
        if (atEnd()) {
          throw new EOFException("No more key-value to read");
        }
        try {
          scanner.getValue(row);
        } catch (ParseException e) {
          throw new IOException("Invalid Projection: " + e.getMessage());
        }
      }

      public void getCGKey(BytesWritable key) throws IOException {
        scanner.getKey(key);
      }

      public void getCGValue(Tuple row) throws IOException {
        try {
          scanner.getValue(row);
        } catch (ParseException e) {
          throw new IOException("Invalid Projection: " + e.getMessage());
        }
      }

      @Override
      public String getProjection() {
        return logicalSchema.toString();
      }

      public Schema getSchema() {
        return logicalSchema.getSchema();
      }

      @Override
      public boolean advance() throws IOException {
        if (atEnd()) {
          return false;
        }
        scanner.advance();
        while (true) {
          if (scanner.atEnd()) {
            scanner.close();
            scanner = null;
            ++current;
            if (!atEnd()) {
              scanner = scanners[current].getTFileScanner();
            } else {
              return false;
            }
          } else {
            return true;
          }
        }
      }

      public boolean advanceCG() throws IOException {
        scanner.advance();
        while (true) {
          if (scanner.atEnd()) {
            scanner.close();
            scanner = null;
            ++current;
            if (!atEnd()) {
              scanner = scanners[current].getTFileScanner();
            } else {
              return false;
            }
          } else {
            return true;
          }
        }
      }

      @Override
      public boolean atEnd() throws IOException {
        return (current >= scanners.length);
      }

      @Override
      public boolean seekTo(BytesWritable key) throws IOException {
        if (!isSorted()) {
          throw new IOException("Cannot seek in unsorted Column Group");
        }
        if (atEnd()) {
          return false;
        }
        int index =
            cgindex.lowerBound(new ByteArray(key.getBytes(), 0, key.getLength()),
                comparator);
        if (index >= endIndex) {
          seekToEnd();
          return false;
        }
        if (index < beginIndex) {
          // move to the beginning
          index = beginIndex;
        }
        int prevCurrent = current;
        current = index - beginIndex;
        if (current != prevCurrent) {
          if (scanner != null) {
            try {
              scanner.close();
            } catch (Exception e) {
              // no-op
            }
          }
          scanner = scanners[current].getTFileScanner();
        }
        return scanner.seekTo(key);
      }

      @Override
      public void seekToEnd() throws IOException {
        if (scanner != null) {
          try {
            scanner.close();
          } catch (Exception e) {
            // no-op
          }
        }
        scanner = null;
        current = scanners.length;
      }

      @Override
      public void close() throws IOException {
        if (!scannerClosed) {
          scannerClosed = true;
          if (scanner != null) {
            try {
              scanner.close();
              scanner = null;
            } catch (Exception e) {
              // no-op
            }
          }
          if (closeReader) {
            Reader.this.close();
          }
        }
      }
    }

    public static class CGRangeSplit implements Writable {
      int start = 0; // starting index in the list
      int len = 0;

      CGRangeSplit(int start, int len) {
        this.start = start;
        this.len = len;
      }

      public CGRangeSplit() {
        // no-op;
      }

      @Override
      public void readFields(DataInput in) throws IOException {
        start = Utils.readVInt(in);
        len = Utils.readVInt(in);
      }

      @Override
      public void write(DataOutput out) throws IOException {
        Utils.writeVInt(out, start);
        Utils.writeVInt(out, len);
      }
    }

    public static class CGRowSplit implements Writable {
      int length; // number of files in the batch
      long startByteFirst = -1;
      long numBytesFirst;
      long startRowFirst = -1;
      long numRowsFirst = -1;
      long numBytesLast = -1;
      long numRowsLast = -1;
      String[] names;
      long[] sizes = null;

      CGRowSplit(String[] names, long[] sizes, int length, long startFirst,
          long bytesFirst, long bytesLast) throws IOException {
        this.names = names;
        this.sizes = sizes;
        this.length = length;
        if (startFirst != -1) {
startByteFirst = startFirst; numBytesFirst = bytesFirst; } if (bytesLast != -1 && this.length > 1) { numBytesLast = bytesLast; } } public CGRowSplit() { // no-op; } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append("{length = " + length + "}\n"); for (int i = 0; i < length; i++) { sb.append("{name = " + names[i] + "}\n"); sb.append("{size = " + sizes[i] + "}\n"); } sb.append("{startByteFirst = " + startByteFirst + "}\n"); sb.append("{numBytesFirst = " + numBytesFirst + "}\n"); sb.append("{startRowFirst = " + startRowFirst + "}\n"); sb.append("{numRowsFirst = " + numRowsFirst + "}\n"); sb.append("{numBytesLast = " + numBytesLast + "}\n"); sb.append("{numRowsLast = " + numRowsLast + "}\n"); return sb.toString(); } @Override public void readFields(DataInput in) throws IOException { length = Utils.readVInt(in); if (length > 0) { names = new String[length]; sizes = new long[length]; } for (int i = 0; i < length; i++) { names[i] = Utils.readString(in); sizes[i] = Utils.readVLong(in); } startByteFirst = Utils.readVLong(in); numBytesFirst = Utils.readVLong(in); startRowFirst = Utils.readVLong(in); numRowsFirst = Utils.readVLong(in); numBytesLast = Utils.readVLong(in); numRowsLast = Utils.readVLong(in); } @Override public void write(DataOutput out) throws IOException { Utils.writeVInt(out, length); for (int i = 0; i < length; i++) { Utils.writeString(out, names[i]); Utils.writeVLong(out, sizes[i]); } Utils.writeVLong(out, startByteFirst); Utils.writeVLong(out, numBytesFirst); Utils.writeVLong(out, startRowFirst); Utils.writeVLong(out, numRowsFirst); Utils.writeVLong(out, numBytesLast); Utils.writeVLong(out, numRowsLast); } } private static class SplitColumn { SplitColumn(Partition.SplitType st) { this.st = st; } SplitColumn(int fieldIndex, Partition.SplitType st) { this.fieldIndex = fieldIndex; this.st = st; } SplitColumn(int fieldIndex, String key, Partition.SplitType st) { this.fieldIndex = fieldIndex; this.key = key; this.st = st; } SplitColumn(int fieldIndex, int projIndex, SplitColumn leaf, String key, Partition.SplitType st) { this(fieldIndex, key, st); this.projIndex = projIndex; } int fieldIndex = -1; // field index to parent int projIndex = -1; // index in projection: only used by leaves SplitColumn leaf = null; String key = null; // MAP key to parent ArrayList<SplitColumn> children = null; int index = -1; // index in the logical schema Object field = null; Partition.SplitType st = Partition.SplitType.NONE; void dispatch(Object field) { this.field = field; } @SuppressWarnings("unchecked") void split() throws ExecException { int size = children.size(); if (st == Partition.SplitType.RECORD) { for (int i = 0; i < size; i++) { if (children.get(i).projIndex != -1) // a leaf: set projection // directly ((Tuple) (leaf.field)).set(projIndex, ((Tuple) field).get(children .get(i).fieldIndex)); else children.get(i).field = ((Tuple) field).get(children.get(i).fieldIndex); } } else if (st == Partition.SplitType.MAP) { for (int i = 0; i < size; i++) { if (children.get(i).projIndex != -1) // a leaf: set projection // directly ((Tuple) (leaf.field)).set(projIndex, ((Map<String, Object>) field) .get(children.get(i).key)); else children.get(i).field = ((Map<String, Object>) field).get(children.get(i).key); } } } void addChild(SplitColumn child) { if (children == null) children = new ArrayList<SplitColumn>(); children.add(child); } } } /** * Column Group writer. 
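   * <p>
   * A typical write sequence, as a hedged sketch (the path, schema, and
   * names below are hypothetical; exception handling is omitted):
   *
   * <pre>{@code
   * Configuration conf = new Configuration();
   * ColumnGroup.Writer writer = new ColumnGroup.Writer(new Path("/table/cg0"),
   *     "FirstName, LastName", false, "cg0", "pig", "gz", null, null,
   *     (short) -1, false, conf);
   * TableInserter inserter = writer.getInserter("part-00000", true);
   * // ... inserter.insert(key, row) for each row ...
   * inserter.close(); // renames the temp file; also finishes the writer
   * writer.close();   // writes the .meta index file
   * }</pre>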
   */
  public static class Writer implements Closeable {
    Path path;
    Path finalOutputPath;
    Configuration conf;
    FileSystem fs;
    CGSchema cgschema;
    private boolean finished, closed;
    CGIndex index;

    /**
     * Create a ColumnGroup writer. The semantics are as follows:
     * <ol>
     * <li>If path does not exist:
     * <ul>
     * <li>create the path directory
     * <li>write out the meta data file.
     * </ul>
     * <li>If path exists and the directory is empty: write out the meta data
     * file.
     * <li>If path exists and contains what looks like a complete Column
     * Group, a ColumnGroupExists exception will be thrown.
     * <li>If path exists and overwrite is true, remove all files under the
     * directory and resume as in Step 2.
     * <li>If path exists, the directory is not empty, and overwrite == false,
     * ColumnGroupExists will be thrown.
     * </ol>
     * This constructor never removes a valid/complete ColumnGroup.
     *
     * @param path
     *          The path to the Column Group, either non-existent or a
     *          directory.
     * @param schema
     *          The schema of the ColumnGroup. For this version of the
     *          implementation, the schema of a table is a comma-separated
     *          list of column names, such as
     *          "FirstName, LastName, Sex, Department".
     * @param sorted
     *          Whether the column group to be created is sorted or not. If
     *          set to true, the rows inserted by every inserter created from
     *          this Writer must be sorted. Additionally, there must exist an
     *          ordering of the inserters Ins-1, Ins-2, ... such that the rows
     *          created by Ins-1, followed by rows created by Ins-2, ... form
     *          a total order.
     * @param overwrite
     *          Should we overwrite the path if it already exists?
     * @param conf
     *          The optional configuration object.
     * @throws IOException
     */
    public Writer(Path path, String schema, boolean sorted, String name,
        String serializer, String compressor, String owner, String group,
        short perm, boolean overwrite, Configuration conf) throws IOException,
        ParseException {
      this(path, new Schema(schema), sorted, null, name, serializer,
          compressor, owner, group, perm, overwrite, conf);
    }

    public Writer(Path path, Schema schema, boolean sorted, String name,
        String serializer, String compressor, String owner, String group,
        short perm, boolean overwrite, Configuration conf) throws IOException,
        ParseException {
      this(path, schema, sorted, null, name, serializer, compressor, owner,
          group, perm, overwrite, conf);
    }

    public Writer(Path path, String schema, boolean sorted, String comparator,
        String name, String serializer, String compressor, String owner,
        String group, short perm, boolean overwrite, Configuration conf)
        throws IOException, ParseException {
      this(path, new Schema(schema), sorted, comparator, name, serializer,
          compressor, owner, group, perm, overwrite, conf);
    }

    public Writer(Path path, Schema schema, boolean sorted, String comparator,
        String name, String serializer, String compressor, String owner,
        String group, short perm, boolean overwrite, Configuration conf)
        throws IOException, ParseException {
      this.path = path;
      this.conf = conf;
      this.finalOutputPath = path;
      fs = path.getFileSystem(conf);
      // If the meta file already exists, the ColumnGroup is complete and
      // valid; we will not proceed.
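      // Note: inserters write ".tmp.<name>.XXXXXXXX" files and rename them
      // to their final names on close (see CGInserter.close()); the
      // "_temporary/<cg-name>" directory created below serves as the work
      // path for the reopen-style Writer(finalPath, workPath, ...) usage.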
      checkMetaFile(path);

      // if overwriting, remove everything
      if (overwrite) {
        fs.delete(path, true);
      }

      // create the final output path and a temporary output path
      checkPath(path, true);
      Path parent = path.getParent();
      Path tmpPath1 = new Path(parent, "_temporary");
      Path tmpPath2 = new Path(tmpPath1, name);
      checkPath(tmpPath2, true);

      cgschema = new CGSchema(schema, sorted, comparator, name, serializer,
          compressor, owner, group, perm);
      CGSchema sfNew = CGSchema.load(fs, path);
      if (sfNew != null) {
        // sanity check - compare the input with the on-disk schema.
        if (!sfNew.equals(cgschema)) {
          throw new IOException("Schema passed in is different from the one on disk");
        }
      } else {
        // create the schema file in the FS
        cgschema.create(fs, path);
      }
    }

    /**
     * Reopen an already created ColumnGroup for writing. It accepts a
     * temporary path for the column group where a CGInserter can write.
     * RuntimeException will be thrown if the table is already closed, or if
     * createMetaBlock() is called by some other process.
     */
    public Writer(Path finalPath, Path workPath, Configuration conf)
        throws IOException, ParseException {
      this.path = workPath;
      finalOutputPath = finalPath;
      this.conf = conf;
      fs = path.getFileSystem(conf);
      checkPath(finalOutputPath, false);
      checkPath(path, true);
      checkMetaFile(finalOutputPath);
      cgschema = CGSchema.load(fs, finalOutputPath);
    }

    /*
     * Reopen an already created ColumnGroup for writing. It takes a CGSchema
     * to set its own cgschema instead of going to disk to fetch this
     * information.
     */
    public Writer(Path finalPath, Path workPath, CGSchema cgschema,
        Configuration conf) throws IOException, ParseException {
      this.path = workPath;
      finalOutputPath = finalPath;
      this.conf = conf;
      fs = path.getFileSystem(conf);
      this.cgschema = cgschema;
    }

    /**
     * Reopen an already created ColumnGroup for writing. RuntimeException
     * will be thrown if the table is already closed, or if createMetaBlock()
     * is called by some other process.
     */
    public Writer(Path path, Configuration conf) throws IOException,
        ParseException {
      this.path = path;
      finalOutputPath = path;
      this.conf = conf;
      fs = path.getFileSystem(conf);
      checkPath(path, false);
      checkMetaFile(path);
      // read the schema file
      cgschema = CGSchema.load(fs, path);
    }

    /**
     * Release resources used by the object. Unlike close(), finish() does
     * not make the table immutable. However, if a user has already added
     * some meta data into the CG, then finish() would close the column
     * group.
     */
    public void finish() {
      if (!finished) {
        finished = true;
      }
    }

    @Override
    public void close() throws IOException {
      if (!finished) {
        finish();
      }
      if (!closed) {
        closed = true;
        createIndex();
      }
    }

    public Schema getSchema() {
      return cgschema.getSchema();
    }

    /**
     * Get an inserter with a given name.
     *
     * @param name
     *          the name of the inserter.
     * @param finishWriter
     *          finish the underlying Writer object upon the close of the
     *          Inserter. Should be set to true if there is only one inserter
     *          operating on the table, so that finish() is called after the
     *          Inserter is closed.
     *
     * @return A table inserter object.
     * @throws IOException
     */
    public TableInserter getInserter(String name, boolean finishWriter)
        throws IOException {
      return getInserter(name, finishWriter, true);
    }

    /**
     * Get an inserter with a given name.
     *
     * @param name
     *          the name of the inserter.
     * @param finishWriter
     *          finish the underlying Writer object upon the close of the
     *          Inserter. Should be set to true if there is only one inserter
     *          operating on the table, so that finish() is called after the
     *          Inserter is closed.
     * @param checkType
     *          whether or not to do a type check.
     *
     * @return A table inserter object.
     * @throws IOException
     */
    public TableInserter getInserter(String name, boolean finishWriter,
        boolean checkType) throws IOException {
      if (finished) {
        throw new IOException("ColumnGroup has been closed for insertion.");
      }
      return new CGInserter(name, finishWriter, checkType);
    }

    private void createIndex() throws IOException {
      MetaFile.Writer metaFile =
          MetaFile.createWriter(makeMetaFilePath(finalOutputPath), conf);
      index = buildIndex(fs, finalOutputPath, false, conf);
      DataOutputStream dos = metaFile.createMetaBlock(BLOCK_NAME_INDEX);
      try {
        index.write(dos);
      } finally {
        dos.close();
      }
      metaFile.close();
    }

    private void checkPath(Path p, boolean createNew) throws IOException {
      // check existence of the path
      if (!fs.exists(p)) {
        if (createNew) {
          fs.mkdirs(p);
        } else {
          throw new IOException("Path doesn't exist for appending: " + p);
        }
      }
      if (!fs.getFileStatus(p).isDir()) {
        throw new IOException("Path exists but not a directory: " + p);
      }
    }

    private void checkMetaFile(Path p) throws IOException {
      Path pathMeta = new Path(p, META_FILE);
      if (fs.exists(pathMeta)) {
        throw new IOException("Index meta file already exists: " + pathMeta);
      }
    }

    /**
     * Inserter for ColumnGroup
     */
    class CGInserter implements TableInserter {
      String name;
      String tmpName;
      boolean finishWriter;
      FSDataOutputStream out;
      TFile.Writer tfileWriter;
      TupleWriter tupleWriter;
      boolean closed = true;
      boolean checkType = true;

      private void createTempFile() throws IOException {
        int maxTrial = 10;
        String prefix = ".tmp." + name + ".";
        Random random = new Random();
        while (true) {
          /*
           * Try to set a real random seed by throwing all the runtime
           * ingredients into it.
           */
          random.setSeed(System.nanoTime() * Thread.currentThread().getId()
              * Runtime.getRuntime().freeMemory());
          try {
            tmpName = prefix + String.format("%08X", random.nextInt());
            Path tmpPath = new Path(path, tmpName);
            fs.mkdirs(path);
            if (cgschema.getOwner() != null || cgschema.getGroup() != null) {
              fs.setOwner(path, cgschema.getOwner(), cgschema.getGroup());
            }
            FsPermission permission = null;
            if (cgschema.getPerm() != -1) {
              permission = new FsPermission((short) cgschema.getPerm());
              fs.setPermission(path, permission);
            }
            out = fs.create(tmpPath, false);
            if (cgschema.getOwner() != null || cgschema.getGroup() != null) {
              fs.setOwner(tmpPath, cgschema.getOwner(), cgschema.getGroup());
            }
            if (cgschema.getPerm() != -1) {
              fs.setPermission(tmpPath, permission);
            }
            return;
          } catch (IOException e) {
            --maxTrial;
            if (maxTrial == 0) {
              throw e;
            }
            Thread.yield();
          }
        }
      }

      CGInserter(String name, boolean finishWriter, boolean checkType)
          throws IOException {
        this.name = name;
        this.finishWriter = finishWriter;
        this.tupleWriter = new TupleWriter(getSchema());
        this.checkType = checkType;
        try {
          createTempFile();
          tfileWriter = new TFile.Writer(out, getMinBlockSize(conf),
              cgschema.getCompressor(), cgschema.getComparator(), conf);
          closed = false;
        } finally {
          if (closed) {
            // failed to instantiate the object.
            if (tfileWriter != null) {
              try {
                tfileWriter.close();
              } catch (Exception e) {
                // no-op
              }
            }
            if (out != null) {
              try {
                out.close();
              } catch (Exception e) {
                // no-op
              }
            }
            if (tmpName != null) {
              try {
                fs.delete(new Path(path, tmpName), false);
              } catch (Exception e) {
                // no-op
              }
            }
          }
        }
      }

      @Override
      public Schema getSchema() {
        return ColumnGroup.Writer.this.getSchema();
      }

      @Override
      public void insert(BytesWritable key, Tuple row) throws IOException {
        /*
         * If checkType is set to true, we check the first row only - this is
         * a sanity check preventing users from messing up the output schema;
         * if checkType is set to false, we do not
do any type check. */ if (checkType == true) { TypesUtils.checkCompatible(row, getSchema()); checkType = false; } DataOutputStream outKey = tfileWriter.prepareAppendKey(key.getLength()); try { outKey.write(key.getBytes(), 0, key.getLength()); } finally { outKey.close(); } DataOutputStream outValue = tfileWriter.prepareAppendValue(-1); try { tupleWriter.put(outValue, row); } finally { outValue.close(); } } @Override public void close() throws IOException { if (closed) { return; } closed = true; try { // TODO: add schema to each TFile as a meta block? tfileWriter.close(); tfileWriter = null; out.close(); out = null; // do renaming only if all the above is successful. fs.rename(new Path(path, tmpName), new Path(finalOutputPath, name)); /* if(cgschema.getOwner() != null || cgschema.getGroup() != null) { fs.setOwner(new Path(path, name), cgschema.getOwner(), cgschema.getGroup()); } FsPermission permission = null; if(cgschema.getPerm() != -1) { permission = new FsPermission((short) cgschema.getPerm()); fs.setPermission(path, permission); } */ tmpName = null; if (finishWriter) { finish(); } } finally { if (tfileWriter != null) { try { tfileWriter.close(); } catch (Exception e) { // no-op } } if (out != null) { try { out.close(); } catch (Exception e) { // no-op } } if (tmpName != null) { try { fs.delete(new Path(path, tmpName), false); } catch (Exception e) { // no-op } } if (finishWriter) { try { finish(); } catch (Exception e) { // no-op } } } } } } /** * name, first and last key (inclusive) of a data file */ static class CGIndexEntry implements RawComparable, Writable { int index; String name; long rows, bytes; RawComparable firstKey; RawComparable lastKey; // for reading public CGIndexEntry() { // no-op } // for writing public CGIndexEntry(String name, long rows, RawComparable firstKey, RawComparable lastKey) { this.name = name; this.rows = rows; this.firstKey = firstKey; this.lastKey = lastKey; } public int getIndex() { return index; } public String getName() { return name; } public long getRows() { return rows; } public RawComparable getFirstKey() { return firstKey; } public RawComparable getLastKey() { return lastKey; } void setIndex (int idx) { this.index = idx; } @Override public byte[] buffer() { return (lastKey != null) ? lastKey.buffer() : null; } @Override public int offset() { return (lastKey != null) ? lastKey.offset() : 0; } @Override public int size() { return (lastKey != null) ? 
lastKey.size() : 0; } @Override public void readFields(DataInput in) throws IOException { name = Utils.readString(in); rows = Utils.readVLong(in); if (rows == 0) { firstKey = null; lastKey = null; } else { int firstKeyLen = Utils.readVInt(in); byte[] firstKeyBuffer = new byte[firstKeyLen]; in.readFully(firstKeyBuffer); int lastKeyLen = Utils.readVInt(in); byte[] lastKeyBuffer = new byte[lastKeyLen]; in.readFully(lastKeyBuffer); firstKey = new ByteArray(firstKeyBuffer); lastKey = new ByteArray(lastKeyBuffer); } } @Override public void write(DataOutput out) throws IOException { Utils.writeString(out, name); Utils.writeVLong(out, rows); if (rows > 0) { if ((firstKey == null) && (lastKey == null)) { throw new IOException("In-memory only entry"); } Utils.writeVInt(out, firstKey.size()); out.write(firstKey.buffer(), firstKey.offset(), firstKey.size()); Utils.writeVInt(out, lastKey.size()); out.write(lastKey.buffer(), lastKey.offset(), lastKey.size()); } } } static class CGIndex implements Writable { boolean dirty = false; boolean sorted = true; BasicTableStatus status; ArrayList<CGIndexEntry> index; CGIndex() { status = new BasicTableStatus(); index = new ArrayList<CGIndexEntry>(); } int getFileIndex(Path path) throws IOException { String filename = path.getName(); if (index.isEmpty()) return -1; for (CGIndexEntry cgie : index) { if (cgie.getName().equals(filename)) { return cgie.getIndex(); } } return -1; } int size() { return index.size(); } CGIndexEntry get(int i) { return index.get(i); } List<CGIndexEntry> getIndex() { return index; } Path getPath(int i, Path parent) { return new Path(parent, index.get(i).getName()); } void sort(final Comparator<RawComparable> comparator) throws IOException { if (dirty && comparator != null) { throw new IOException("Cannot sort dirty index"); } if (comparator != null) { // sort by keys. For empty TFiles, they are always sorted before // non-empty TFiles, and they themselves are sorted by their names. Collections.sort(index, new Comparator<CGIndexEntry>() { @Override public int compare(CGIndexEntry o1, CGIndexEntry o2) { if ((o1.getRows() == 0) && (o2.getRows() == 0)) { return o1.getName().compareTo(o2.getName()); } if (o1.getRows() == 0) return -1; if (o2.getRows() == 0) return 1; int cmprv = comparator.compare(o1.lastKey, o2.lastKey); if (cmprv == 0) { cmprv = comparator.compare(o1.firstKey, o2.firstKey); if (cmprv == 0) { cmprv = o1.getName().compareTo(o2.getName()); } } return cmprv; } }); for (int i = 0; i < index.size() - 1; ++i) { RawComparable prevLastKey = index.get(i).lastKey; RawComparable nextFirstKey = index.get(i + 1).firstKey; if (nextFirstKey == null) { continue; } if (comparator.compare(prevLastKey, nextFirstKey) > 0) { throw new IOException("Overlapping key ranges"); } } } else { // sort by name Collections.sort(index, new Comparator<CGIndexEntry>() { @Override public int compare(CGIndexEntry o1, CGIndexEntry o2) { return o1.name.compareTo(o2.name); } }); } // update status if ((!dirty) && (index.size() > 0)) { RawComparable keyFirst = index.get(0).getFirstKey(); status.beginKey = new BytesWritable(); status.beginKey.set(keyFirst.buffer(), keyFirst.offset(), keyFirst .size()); RawComparable keyLast = index.get(index.size() - 1).getLastKey(); status.endKey = new BytesWritable(); status.endKey.set(keyLast.buffer(), keyLast.offset(), keyLast.size()); } sorted = true; } // building full index. 
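    // The two add() variants below differ in how much is known per file:
    // add(bytes, rows, range) records an entry built from an opened TFile
    // (key range and row count known), while the dirty variant
    // add(bytes, name) records file-level information only.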
void add(long bytes, long rows, CGIndexEntry range) { status.size += bytes; status.rows += rows; index.add(range); sorted = false; range.bytes = bytes; } // building dirty index void add(long bytes, String name) { dirty = true; status.rows = -1; // reset rows to -1. status.size += bytes; CGIndexEntry next = new CGIndexEntry(); next.name = name; index.add(next); sorted = false; next.bytes = bytes; } int lowerBound(RawComparable key, final Comparator<RawComparable> comparator) throws IOException { if ((key == null) || (comparator == null)) { throw new IllegalArgumentException("CGIndex.lowerBound"); } if (!sorted) { sort(comparator); } // Treat null keys as the least key. return Utils.lowerBound(index, key, new Comparator<RawComparable>() { @Override public int compare(RawComparable o1, RawComparable o2) { if ((o1.buffer() == null) && (o2.buffer() == null)) { return 0; } if (o1.buffer() == null) return -1; if (o2.buffer() == null) return 1; return comparator.compare(o1, o2); } }); } @Override public void readFields(DataInput in) throws IOException { int n = Utils.readVInt(in); index.clear(); index.ensureCapacity(n); for (int i = 0; i < n; ++i) { CGIndexEntry range = new CGIndexEntry(); range.readFields(in); range.setIndex(i); index.add(range); } status.readFields(in); dirty = false; sorted = true; } @Override public void write(DataOutput out) throws IOException { if (dirty) { throw new IOException("Cannot write dirty index"); } if (!sorted) { throw new IOException("Please sort index before calling write"); } Utils.writeVInt(out, index.size()); for (int i = 0; i < index.size(); ++i) { index.get(i).write(out); } status.write(out); } } public static class CGPathFilter implements PathFilter { private static Configuration conf; public static void setConf(Configuration c) { conf = c; } public boolean accept(Path p) { return p.getName().equals(META_FILE) || p.getName().equals(SCHEMA_FILE) || p.getName().startsWith(".tmp.") || p.getName().startsWith("_") || p.getName().startsWith("ttt") || p.getName().startsWith(getNonDataFilePrefix(conf)) ? false : true; } } /** * Dump information about CG. * * @param file * Path string of the CG * @param out * PrintStream to output the information. * @param conf * The configuration object. * @throws IOException */ static public void dumpInfo(String file, PrintStream out, Configuration conf) throws IOException, Exception { // final int maxKeySampleLen = 16; dumpInfo(new Path(file), out, conf); } static public void dumpInfo(Path path, PrintStream out, Configuration conf) throws IOException, Exception { dumpInfo(path, out, conf, 0); } static public void dumpInfo(Path path, PrintStream out, Configuration conf, int indent) throws IOException, Exception { // final int maxKeySampleLen = 16; IOutils.indent(out, indent); out.println(); IOutils.indent(out, indent); out.println("Column Group : " + path); ColumnGroup.Reader reader = new ColumnGroup.Reader(path, false, conf); try { LinkedHashMap<String, String> properties = new LinkedHashMap<String, String>(); IOutils.indent(out, indent); out.println("Name: " + reader.getName()); IOutils.indent(out, indent); out.println("Serializer: " + reader.getSerializer()); IOutils.indent(out, indent); out.println("Compressor: " + reader.getCompressor()); IOutils.indent(out, indent); out.println("Group: " + reader.getGroup()); IOutils.indent(out, indent); out.println("Perm: " + reader.getPerm()); properties.put("Schema", reader.getSchema().toString()); // Now output the properties table. 
      int maxKeyLength = 0;
      Set<Map.Entry<String, String>> entrySet = properties.entrySet();
      for (Iterator<Map.Entry<String, String>> it = entrySet.iterator(); it
          .hasNext();) {
        Map.Entry<String, String> e = it.next();
        if (e.getKey().length() > maxKeyLength) {
          maxKeyLength = e.getKey().length();
        }
      }
      for (Iterator<Map.Entry<String, String>> it = entrySet.iterator(); it
          .hasNext();) {
        Map.Entry<String, String> e = it.next();
        IOutils.indent(out, indent);
        out.printf("%s : %s\n", e.getKey(), e.getValue());
      }
      out.println("TFiles within the Column Group:");
      if (reader.cgindex == null)
        reader.cgindex = buildIndex(reader.fs, reader.path, reader.dirty, conf);
      for (CGIndexEntry entry : reader.cgindex.index) {
        IOutils.indent(out, indent);
        out.printf(" *Name : %s\n", entry.name);
        IOutils.indent(out, indent);
        out.printf("  Rows : %d\n", entry.rows);
        if (entry.firstKey != null) {
          IOutils.indent(out, indent);
          out.printf("  First Key : %s\n", headToString(entry.firstKey));
        }
        if (entry.lastKey != null) {
          IOutils.indent(out, indent);
          out.printf("  Last Key : %s\n", headToString(entry.lastKey));
        }
        // dump TFile info
        // Path pathTFile = new Path(path, entry.name);
        // TFile.dumpInfo(pathTFile.toString(), out, conf);
      }
    } finally {
      try {
        reader.close();
      } catch (Exception e) {
        // no-op
      }
    }
  }

  private static String headToString(RawComparable raw) {
    return new String(raw.buffer(), raw.offset(), raw.size() > 70 ? 70
        : raw.size());
  }

  /**
   * Dump the CG information.
   *
   * @param args
   *          A list of CG paths.
   */
  public static void main(String[] args) throws Exception {
    System.out.printf("ColumnGroup Dumper\n");
    if (args.length == 0) {
      System.out
          .println("Usage: java ... org.apache.hadoop.zebra.io.ColumnGroup cg-path [cg-path ...]");
      System.exit(0);
    }
    Configuration conf = new Configuration();
    for (String file : args) {
      try {
        dumpInfo(file, System.out, conf);
      } catch (IOException e) {
        e.printStackTrace(System.err);
      }
    }
  }
}
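/*
 * A minimal read-side usage sketch (illustrative only; the path and column
 * names are hypothetical, exception handling is omitted, and
 * TypesUtils.createTuple is assumed for row allocation):
 *
 *   Configuration conf = new Configuration();
 *   ColumnGroup.Reader reader =
 *       new ColumnGroup.Reader(new Path("/table/cg0"), conf);
 *   reader.setProjection("FirstName, LastName");
 *   TableScanner scanner = reader.getScanner(null, true);
 *   BytesWritable key = new BytesWritable();
 *   Tuple row = TypesUtils.createTuple(reader.getSchema());
 *   while (!scanner.atEnd()) {
 *     scanner.getKey(key);
 *     scanner.getValue(row);
 *     scanner.advance();
 *   }
 *   scanner.close(); // also closes the reader since closeReader == true
 */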