/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package com.bigdata.bfs;

import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.nio.ByteBuffer;
import java.util.Arrays;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import com.bigdata.btree.AbstractBTree;
import com.bigdata.btree.BTree;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.ILinearList;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.proc.IResultHandler;
import com.bigdata.btree.proc.ISimpleIndexProcedure;
import com.bigdata.btree.view.FusedView;
import com.bigdata.io.DataOutputBuffer;
import com.bigdata.journal.AbstractJournal;
import com.bigdata.journal.Journal;
import com.bigdata.util.Bytes;
import com.bigdata.util.BytesUtil;

/**
 * Atomic append of a single block to a file version.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 */
public class AtomicBlockAppendProc implements ISimpleIndexProcedure<Object>,
        Externalizable {

    private static final long serialVersionUID = 1441331704737671258L;

    protected static transient Logger log = Logger
            .getLogger(AtomicBlockAppendProc.class);

    /**
     * True iff the {@link #log} level is INFO or less.
     */
    final protected static transient boolean INFO = log.getEffectiveLevel()
            .toInt() <= Level.INFO.toInt();

    /**
     * True iff the {@link #log} level is DEBUG or less.
     */
    final protected static transient boolean DEBUG = log.getEffectiveLevel()
            .toInt() <= Level.DEBUG.toInt();

    private String id;
    private int version;
    private int off;
    private int len;
    private byte[] b;

    @Override
    public final boolean isReadOnly() {

        return false;

    }

    /**
     * @param repo
     *            The file system (used to validate the block size).
     * @param id
     *            The file identifier.
     * @param version
     *            The file version.
     * @param b
     *            The buffer containing the data to be written.
     * @param off
     *            The offset in the buffer of the first byte to be written.
     * @param len
     *            The #of bytes to be written.
     */
    public AtomicBlockAppendProc(BigdataFileSystem repo, String id,
            int version, byte[] b, int off, int len) {

        assert id != null && id.length() > 0;
        assert version >= 0;
        assert b != null;
        assert off >= 0 : "off=" + off;
        assert len >= 0 && off + len <= b.length;
        assert len <= repo.getBlockSize() : "len=" + len
                + " exceeds blockSize=" + repo.getBlockSize();

        this.id = id;
        this.version = version;
        this.off = off;
        this.len = len;
        this.b = b;

    }
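    /*
     * Construction sketch (illustration only, not part of the procedure's
     * API): the procedure is normally created by the file system's atomic
     * append logic, but a direct construction would look like the following,
     * assuming a BigdataFileSystem reference named "repo" and a caller-owned
     * buffer "data". The offset/length arguments select the bytes of "data"
     * that become the block (here: the whole buffer, version zero).
     *
     *   final AtomicBlockAppendProc proc = new AtomicBlockAppendProc(
     *           repo, "/my/file", 0, data, 0, data.length);
     *
     * The procedure is then submitted against the file data index; apply()
     * returns the block identifier assigned to the appended block.
     */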
    /**
     * This procedure runs on the unisolated index. The block identifier is
     * computed as a one-up long integer for that file version using locally
     * available state. The raw data for the block is written directly onto the
     * {@link Journal} and an index entry is added for the file, version, and
     * block whose value is the address of the block's data on the
     * {@link Journal}.
     * <p>
     * Note: The caller MUST have correctly identified the data service on
     * which the tail of the file exists (or on which the head of the file will
     * be written).
     * <p>
     * The block identifier is computed by reading and decoding the key for the
     * last block written for this file version (if any). Special cases exist
     * when the file version spans more than one index partition, when the
     * block would be the first block (in key order) for the index partition,
     * and when the block would be the last block (in key order) for the index
     * partition.
     * 
     * @return The block identifier assigned to the appended block (a
     *         {@link Long}).
     */
    @Override
    public Object apply(final IIndex ndx) {

        // tunnel through to the backing journal.
        final AbstractJournal journal = (AbstractJournal) ((AbstractBTree) ndx)
                .getStore();

        // obtain the thread-local key builder for that journal.
        final IKeyBuilder keyBuilder = ndx.getIndexMetadata().getKeyBuilder();

        /*
         * The next block identifier to be assigned.
         */
        final long block = getNextBlockIdentifierInFileVersion(ndx, keyBuilder);

        if (log.isInfoEnabled())
            log.info("Will write " + len + " bytes on id=" + id + ", version="
                    + version + ", block#=" + block);

        {

            /*
             * write the block on the journal obtaining the address at which it
             * was written - use 0L for the address of an empty block.
             */
            final long addr = len == 0 ? 0L : journal.write(ByteBuffer.wrap(b,
                    off, len));

            // form the key for the index entry for this block.
            final byte[] key = keyBuilder.reset().appendText(id,
                    true/* unicode */, false/* successor */).append(version)
                    .append(block).getKey();

            // record the address of the block in the index.
            {

                final DataOutputBuffer out = new DataOutputBuffer(
                        Bytes.SIZEOF_LONG);

                // encode the value for the entry.
                out.reset().putLong(addr);

                final byte[] val = out.toByteArray();

                // insert the entry into the index.
                ndx.insert(key, val);

            }

            if (log.isInfoEnabled())
                log.info("Wrote " + len + " bytes : id=" + id + ", version="
                        + version + ", block#=" + block + " @ addr"
                        + journal.toString(addr));

        }

        // the block identifier.
        return block;

    }
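    /*
     * Value layout sketch (illustration only): apply() stores an 8-byte value
     * under the block's key - the address of the block's record on the backing
     * journal, or 0L for an empty block. Assuming the big-endian encoding
     * produced by DataOutputBuffer#putLong() above, a reader holding the
     * byte[] value "val" and a reference "store" to the backing store could
     * recover the block's data along these lines:
     *
     *   final long addr = ByteBuffer.wrap(val).getLong();
     *   final ByteBuffer data = (addr == 0L)
     *           ? ByteBuffer.allocate(0)   // empty block convention
     *           : store.read(addr);        // IRawStore#read(long)
     */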
    /**
     * Find the key for the last block written for this file version. We do
     * this by forming a probe key from the file, version, and the maximum
     * allowed block identifier. This is guaranteed to be after any existing
     * block for that file and version.
     * <p>
     * Note: This implementation uses an {@link IRangeQuery#REVERSE} iterator
     * to locate the last block in the file and is capable of scale-out.
     * 
     * @todo This implies that the leftSeparator for the index partition MUST
     *       NOT split the blocks for a file unless there is at least one block
     *       in the index partition. In practice this guarantee is easy to
     *       maintain. By default we choose to split an index partition on a
     *       file boundary. If that would result in an uneven split (or an
     *       empty split in the case of very large files) then we choose a
     *       split point that lies within the file's data - leaving at least
     *       one block for the file (probably many) in both partitions created
     *       by the split.
     * 
     * @param ndx
     *            The index (or index partition view) for the file data.
     * @param keyBuilder
     *            The key builder.
     * @return The block identifier to be assigned to the next block appended
     *         to this file version.
     */
    protected long getNextBlockIdentifierInFileVersion2(IIndex ndx,
            IKeyBuilder keyBuilder) {

        final byte[] fromKey = keyBuilder.reset().appendText(id,
                true/* unicode */, false/* successor */).append(version)
                .append(0/* first valid block */).getKey();

        final byte[] toKey = keyBuilder.reset().appendText(id,
                true/* unicode */, false/* successor */).append(version)
                .append(Long.MAX_VALUE/* max block */).getKey();

        ITupleIterator itr = ndx.rangeIterator(fromKey, toKey,
                1/* capacity */, IRangeQuery.KEYS | IRangeQuery.REVERSE,
                null/* filter */);

        if (!itr.hasNext()) {

            // There are no blocks for this file version.
            return 0L;

        }

        final byte[] key = itr.next().getKey();

        return getNextBlockFromPriorKey(keyBuilder, key);

    }
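    /*
     * Probe-key sketch (illustration only): the fromKey/toKey pair built above
     * brackets exactly the blocks of one file version, so the same two keys
     * can be reused for other range operations on the index, e.g. counting the
     * blocks of the version via IRangeQuery#rangeCount:
     *
     *   final long nblocks = ndx.rangeCount(fromKey, toKey);
     *
     * The REVERSE iterator with a capacity of one is simply the cheapest way
     * to visit only the last such block in key order.
     */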
    /**
     * Find the key for the last block written for this file version. We do
     * this by forming a probe key from the file, version, and the maximum
     * allowed block identifier. This is guaranteed to be after any existing
     * block for that file and version.
     * <p>
     * Note: This implementation uses the {@link ILinearList} API to locate the
     * last block in the file and is NOT capable of scale-out since that API is
     * NOT available for an index partition view (a {@link FusedView}).
     * 
     * @todo This implies that the leftSeparator for the index partition MUST
     *       NOT split the blocks for a file unless there is at least one block
     *       in the index partition. In practice this guarantee is easy to
     *       maintain. By default we choose to split an index partition on a
     *       file boundary. If that would result in an uneven split (or an
     *       empty split in the case of very large files) then we choose a
     *       split point that lies within the file's data - leaving at least
     *       one block for the file (probably many) in both partitions created
     *       by the split.
     */
    protected long getNextBlockIdentifierInFileVersion(IIndex ndx,
            IKeyBuilder keyBuilder) {

        final byte[] toKey = keyBuilder.reset().appendText(id,
                true/* unicode */, false/* successor */).append(version)
                .append(Long.MAX_VALUE).getKey();

        // Note: uses the ILinearList API.
        final ILinearList tmp = (ILinearList) ndx;

        /*
         * Index of the first key after this file version.
         * 
         * Note: This will always be an insertion point (a negative value)
         * since the toKey only encodes the successor of the file identifier.
         * 
         * We convert the insertion point to an index.
         * 
         * If the index is zero (0) then there are no blocks for this file and
         * the file will be the first file in the index order on this index
         * partition (there may or may not be other files already on the index
         * partition).
         * 
         * Else fetch the key at that index. If that key encodes the same id as
         * this file then we are appending to a file with existing block(s) and
         * we decode the block identifier from the key. Otherwise this will be
         * the first block written for that file.
         */
        long toIndex = tmp.indexOf(toKey);

        assert toIndex < 0 : "Expecting insertion point: id=" + id
                + ", version=" + version + ", toIndex=" + toIndex;

        if (log.isDebugEnabled())
            log.debug("insertionPoint=" + toIndex);

        toIndex = -(toIndex + 1); // convert to an index.

        // #of entries in the index.
        final long entryCount = ((AbstractBTree) ndx).getEntryCount();

        if (log.isDebugEnabled())
            log.debug("toIndex=" + toIndex + ", entryCount=" + entryCount);

        final long block;

        if (toIndex == 0) {

            /*
             * Insertion point is before all other entries in the index.
             * 
             * Note: In this case we need to examine the leftSeparator key for
             * the index partition. If that key is for the same file version
             * then we use the successor of the block identifier found in that
             * key.
             * 
             * Note: when it is not for the same file version it MAY be that
             * the leftSeparator does not include the block identifier - the
             * block identifier is only required in the leftSeparator when a
             * file version spans both the prior index partition and this index
             * partition.
             */

            if (log.isDebugEnabled())
                log.debug("Insertion point is before all entries in the index partition: id="
                        + id + ", version=" + version);

            final byte[] leftSeparator = ((BTree) ndx).getIndexMetadata()
                    .getPartitionMetadata().getLeftSeparatorKey();

            block = getNextBlockFromPriorKey(keyBuilder, leftSeparator);

        } else {

            if (toIndex == entryCount) {

                /*
                 * Insertion point is after all entries in the index.
                 * 
                 * Note: In this case we consider the prior key in the index
                 * partition. If that key is for the same file version then we
                 * use the successor of the block identifier found in that key.
                 */

                if (log.isDebugEnabled())
                    log.debug("Insertion point is after all entries in the index partition: id="
                            + id + ", version=" + version);

            } else {

                /*
                 * Insertion point is at the toKey.
                 * 
                 * Note: Since the probe key is beyond the last block for the
                 * file version we adjust the toIndex so that we consider the
                 * prior key.
                 */

                if (log.isDebugEnabled())
                    log.debug("Insertion point is at the toKey: id=" + id
                            + ", version=" + version);

            }

            /*
             * Adjust to consider the key before the insertion point.
             */
            toIndex--;

            /*
             * Look at the key at the computed index. If it is a key for this
             * file version then we use the successor of the given block
             * identifier. Otherwise we are writing a new file version and the
             * block identifier will be zero (0).
             */

            if (log.isDebugEnabled())
                log.debug("adjusted toIndex=" + toIndex + ", entryCount="
                        + entryCount);

            // the key at that index.
            final byte[] key = tmp.keyAt(toIndex);

            assert key != null : "Expecting entry: id=" + id + ", version="
                    + version + ", toIndex=" + toIndex;

            block = getNextBlockFromPriorKey(keyBuilder, key);

        }

        return block;

    }
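    /*
     * Worked example of the insertion point arithmetic above (illustration
     * only): ILinearList#indexOf(key) reports a miss as the one's complement
     * of the insertion point. If the probe key would be inserted before the
     * entry at index 2, indexOf() returns -3 and toIndex = -(-3 + 1) = 2.
     * Decrementing in the else branch then points at index 1, the entry
     * immediately before the probe - i.e., the last block (if any) of this
     * file version.
     */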
    /**
     * Decode the block identifier in the key and return the block identifier
     * plus one, which is the block identifier to be used for the atomic append
     * operation. If the key does NOT encode the same file + version then no
     * blocks exist for that file version and the method returns zero (0L) as
     * the block identifier to be used.
     * 
     * @param keyBuilder
     *            The key builder.
     * @param key
     *            The key - either from the index partition or in some cases
     *            from the leftSeparator of the index partition metadata.
     *            <p>
     *            Note that the leftSeparator MAY be an empty byte[] (e.g., for
     *            the 1st index partition in the key order) and MIGHT NOT
     *            include the block identifier (the block identifier is only
     *            included when it is necessary to split a file across index
     *            partitions). When the block identifier is omitted from the
     *            key and the key encodes the same file and version we
     *            therefore use zero (0L) as the next block identifier since we
     *            will be appending the first block to the file version.
     * 
     * @return The block identifier that will be used by the atomic append
     *         operation.
     */
    protected long getNextBlockFromPriorKey(IKeyBuilder keyBuilder, byte[] key) {

        // encode just the file id and the version.
        final byte[] prefix = keyBuilder.reset().appendText(id,
                true/* unicode */, false/* successor */).append(version)
                .getKey();

        if (DEBUG)
            log.debug("Comparing\nkey :" + Arrays.toString(key) + "\nprefix:"
                    + Arrays.toString(prefix));

        /*
         * Test the encoded file id and version against the encoded file id and
         * version in the recovered key. If they compare as equal (for the
         * length of the key that we just built) then they encode the same file
         * id and version.
         * 
         * (I.e., if true, then the key is from a block entry for this version
         * of this file).
         */
        if (key.length >= prefix.length) {

            final int cmp = BytesUtil.compareBytesWithLenAndOffset(0,
                    prefix.length, prefix, 0, prefix.length, key);

            if (DEBUG)
                log.debug("Comparing " + prefix.length + " byte prefix with "
                        + key.length + " byte key: cmp=" + cmp);

            if (cmp == 0) {

                /*
                 * The key at the computed toIndex is the same file version.
                 */

                if (prefix.length + Bytes.SIZEOF_LONG == key.length) {

                    /*
                     * The given key includes a block identifier so we extract
                     * it.
                     * 
                     * Note: When the given key is a leftSeparator for an index
                     * partition AND the file version is not split across the
                     * index partition then the block identifier MAY be omitted
                     * from the leftSeparator. In this case the block
                     * identifier will be zero since there are no blocks yet
                     * for that file version.
                     */

                    // last block identifier assigned for this file + 1.
                    final long block = KeyBuilder.decodeLong(key, key.length
                            - Bytes.SIZEOF_LONG) + 1;

                    if (block > BigdataFileSystem.MAX_BLOCK) {

                        throw new RuntimeException(
                                "File version has maximum #of blocks: id=" + id
                                        + ", version=" + version);

                    }

                    if (INFO)
                        log.info("Appending to existing file version: id="
                                + id + ", version=" + version + ", block="
                                + block);

                    return block;

                } else {

                    /*
                     * This case arises when the leftSeparator encodes the file
                     * version but does not include a block identifier.
                     */

                    if (INFO)
                        log.info("Key is for same file version but does not contain block identifier.");

                }

            } else {

                /*
                 * Since the key does not compare as equal for the full length
                 * of the prefix it can not encode the same file version.
                 */

                if (DEBUG)
                    log.debug("Key does not compare as equal for length of prefix.");

            }

        } else {

            /*
             * Since the key is shorter than the prefix it can not be for the
             * same file version.
             */

            if (DEBUG)
                log.debug("Key is shorter than prefix.");

        }

        /*
         * The key at the computed toIndex is a different file version so we
         * are starting a new file version at block := 0.
         */

        if (INFO)
            log.info("Appending to new file version: id=" + id + ", version="
                    + version + ", block=" + 0L);

        return 0L;

    }

    @Override
    public void readExternal(final ObjectInput in) throws IOException,
            ClassNotFoundException {

        id = in.readUTF();

        version = in.readInt();

        off = 0; // Note: offset always zero when de-serialized.

        len = in.readInt();

        b = new byte[len];

        in.readFully(b);

    }

    @Override
    public void writeExternal(final ObjectOutput out) throws IOException {

        out.writeUTF(id);

        out.writeInt(version);

        /*
         * Note: offset not written when serialized and always zero when
         * de-serialized.
         */

        out.writeInt(len); /* length */

        out.write(b, off, len); /* data */

    }

}
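/*
 * Serialization sketch (illustration only): writeExternal() copies only the
 * "len" bytes starting at "off", so the offset is deliberately not
 * round-tripped and readExternal() always reconstructs the procedure with
 * off == 0 over a compacted buffer. Together with the public no-arg
 * constructor that Externalizable requires (not shown above), a hypothetical
 * round trip through standard Java serialization would look like:
 *
 *   final ByteArrayOutputStream baos = new ByteArrayOutputStream();
 *   new ObjectOutputStream(baos).writeObject(proc);
 *   final AtomicBlockAppendProc copy = (AtomicBlockAppendProc)
 *           new ObjectInputStream(
 *                   new ByteArrayInputStream(baos.toByteArray())).readObject();
 */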