/*
* Copyright 2010 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import java.io.EOFException;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.text.ParseException;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.NavigableSet;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.DroppedSnapshotException;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.UnknownScannerException;
import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Increment;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Row;
import org.apache.hadoop.hbase.client.RowLock;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.io.TimeRange;
import org.apache.hadoop.hbase.io.hfile.BlockCache;
import org.apache.hadoop.hbase.regionserver.wal.HLog;
import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CancelableProgressable;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.CompressionTest;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Writables;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.StringUtils;
import com.google.common.collect.Lists;
/**
* HRegion stores data for a certain region of a table. It stores all columns
* for each row. A given table consists of one or more HRegions.
*
* <p>We maintain multiple HStores for a single HRegion.
*
 * <p>A Store is a set of rows with some column data; together, the
 * Stores make up all the data for the rows.
*
* <p>Each HRegion has a 'startKey' and 'endKey'.
 * <p>The first is inclusive, the second is exclusive (except for
 * the final region). The endKey of region 0 is the same as the
 * startKey of region 1 (if it exists). The startKey of the
 * first region is null. The endKey of the final region is null.
*
* <p>Locking at the HRegion level serves only one purpose: preventing the
* region from being closed (and consequently split) while other operations
* are ongoing. Each row level operation obtains both a row lock and a region
* read lock for the duration of the operation. While a scanner is being
* constructed, getScanner holds a read lock. If the scanner is successfully
* constructed, it holds a read lock until it is closed. A close takes out a
* write lock and consequently will block for ongoing operations and will block
* new operations from starting while the close is in progress.
*
* <p>An HRegion is defined by its table and its key extent.
*
* <p>It consists of at least one Store. The number of Stores should be
* configurable, so that data which is accessed together is stored in the same
* Store. Right now, we approximate that by building a single Store for
* each column family. (This config info will be communicated via the
* tabledesc.)
*
* <p>The HTableDescriptor contains metainfo about the HRegion's table.
 * regionName is a unique identifier for this HRegion. [startKey, endKey)
* defines the keyspace for this HRegion.
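 *
 * <p>A hedged sketch of direct use (paths and names are hypothetical; most
 * callers go through an HRegionServer rather than driving HRegion directly):
 * <pre>
 *   HRegion region = HRegion.newHRegion(tableDir, log, fs, conf, info, null);
 *   region.initialize();
 *   Put put = new Put(Bytes.toBytes("row1"));
 *   put.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
 *   region.put(put);
 *   region.close();
 * </pre>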
*/
public class HRegion implements HeapSize { // , Writable{
public static final Log LOG = LogFactory.getLog(HRegion.class);
static final String MERGEDIR = "merges";
final AtomicBoolean closed = new AtomicBoolean(false);
/* Closing can take some time; use the closing flag if there is stuff we don't
 * want to do while in closing state; e.g. offer this region up to the
* master as a region to close if the carrying regionserver is overloaded.
* Once set, it is never cleared.
*/
final AtomicBoolean closing = new AtomicBoolean(false);
//////////////////////////////////////////////////////////////////////////////
// Members
//////////////////////////////////////////////////////////////////////////////
private final Set<byte[]> lockedRows =
new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
private final Map<Integer, byte []> lockIds =
new HashMap<Integer, byte []>();
private int lockIdGenerator = 1;
private static final Random rand = new Random();
protected final Map<byte [], Store> stores =
new ConcurrentSkipListMap<byte [], Store>(Bytes.BYTES_RAWCOMPARATOR);
final AtomicLong memstoreSize = new AtomicLong(0);
/**
* The directory for the table this region is part of.
* This directory contains the directory for this region.
*/
final Path tableDir;
final HLog log;
final FileSystem fs;
final Configuration conf;
final HRegionInfo regionInfo;
final Path regiondir;
KeyValue.KVComparator comparator;
/*
* Set this when scheduling compaction if want the next compaction to be a
* major compaction. Cleared each time through compaction code.
*/
private volatile boolean forceMajorCompaction = false;
private Pair<Long,Long> lastCompactInfo = null;
// Used to ensure only one thread closes region at a time.
private final Object closeLock = new Object();
/*
* Data structure of write state flags used coordinating flushes,
* compactions and closes.
*/
static class WriteState {
// Set while a memstore flush is happening.
volatile boolean flushing = false;
// Set when a flush has been requested.
volatile boolean flushRequested = false;
// Set while a compaction is running.
volatile boolean compacting = false;
// Gets set in close. If set, cannot compact or flush again.
volatile boolean writesEnabled = true;
// Set if region is read-only
volatile boolean readOnly = false;
/**
* Set flags that make this region read-only.
*
* @param onOff flip value for region r/o setting
*/
synchronized void setReadOnly(final boolean onOff) {
this.writesEnabled = !onOff;
this.readOnly = onOff;
}
boolean isReadOnly() {
return this.readOnly;
}
boolean isFlushRequested() {
return this.flushRequested;
}
}
final WriteState writestate = new WriteState();
final long memstoreFlushSize;
private volatile long lastFlushTime;
private List<Pair<Long,Long>> recentFlushes = new ArrayList<Pair<Long,Long>>();
final FlushRequester flushRequester;
private final long blockingMemStoreSize;
final long threadWakeFrequency;
// Used to guard closes
final ReentrantReadWriteLock lock =
new ReentrantReadWriteLock();
// Stop updates lock
private final ReentrantReadWriteLock updatesLock =
new ReentrantReadWriteLock();
private boolean splitRequest;
private final ReadWriteConsistencyControl rwcc =
new ReadWriteConsistencyControl();
/**
* Name of the region info file that resides just under the region directory.
*/
public final static String REGIONINFO_FILE = ".regioninfo";
/**
* Should only be used for testing purposes
*/
public HRegion() {
this.tableDir = null;
this.blockingMemStoreSize = 0L;
this.conf = null;
this.flushRequester = null;
this.fs = null;
this.memstoreFlushSize = 0L;
this.log = null;
this.regiondir = null;
this.regionInfo = null;
this.threadWakeFrequency = 0L;
}
/**
 * HRegion constructor. This constructor should only be used for testing and
* extensions. Instances of HRegion should be instantiated with the
* {@link HRegion#newHRegion(Path, HLog, FileSystem, Configuration, org.apache.hadoop.hbase.HRegionInfo, FlushRequester)} method.
*
*
* @param tableDir qualified path of directory where region should be located,
* usually the table directory.
* @param log The HLog is the outbound log for any updates to the HRegion
* (There's a single HLog for all the HRegions on a single HRegionServer.)
* The log file is a logfile from the previous execution that's
* custom-computed for this HRegion. The HRegionServer computes and sorts the
* appropriate log info for this HRegion. If there is a previous log file
* (implying that the HRegion has been written-to before), then read it from
* the supplied path.
* @param fs is the filesystem.
* @param conf is global configuration settings.
 * @param regionInfo - HRegionInfo that describes the region
* @param flushRequester an object that implements {@link FlushRequester} or null
*
* @see HRegion#newHRegion(Path, HLog, FileSystem, Configuration, org.apache.hadoop.hbase.HRegionInfo, FlushRequester)
*/
public HRegion(Path tableDir, HLog log, FileSystem fs, Configuration conf,
HRegionInfo regionInfo, FlushRequester flushRequester) {
this.tableDir = tableDir;
this.comparator = regionInfo.getComparator();
this.log = log;
this.fs = fs;
this.conf = conf;
this.regionInfo = regionInfo;
this.flushRequester = flushRequester;
this.threadWakeFrequency = conf.getLong(HConstants.THREAD_WAKE_FREQUENCY,
10 * 1000);
String encodedNameStr = this.regionInfo.getEncodedName();
this.regiondir = getRegionDir(this.tableDir, encodedNameStr);
long flushSize = regionInfo.getTableDesc().getMemStoreFlushSize();
if (flushSize == HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE) {
flushSize = conf.getLong("hbase.hregion.memstore.flush.size",
HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE);
}
this.memstoreFlushSize = flushSize;
this.blockingMemStoreSize = this.memstoreFlushSize *
conf.getLong("hbase.hregion.memstore.block.multiplier", 2);
if (LOG.isDebugEnabled()) {
// Write out region name as string and its encoded name.
LOG.debug("Instantiated " + this);
}
}
/**
* Initialize this region.
* @return What the next sequence (edit) id should be.
* @throws IOException e
*/
public long initialize() throws IOException {
return initialize(null);
}
/**
* Initialize this region.
*
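 * <p>A hedged sketch of supplying a reporter so a slow open is not timed
 * out by the caller (callback behavior per {@link CancelableProgressable}):
 * <pre>
 *   long nextSeqId = region.initialize(new CancelableProgressable() {
 *     public boolean progress() {
 *       return true; // return false to cancel, e.g. during edit replay
 *     }
 *   });
 * </pre>
 *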
* @param reporter Tickle every so often if initialize is taking a while.
* @return What the next sequence (edit) id should be.
* @throws IOException e
*/
public long initialize(final CancelableProgressable reporter)
throws IOException {
// A region can be reopened if failed a split; reset flags
this.closing.set(false);
this.closed.set(false);
// Write HRI to a file in case we need to recover .META.
checkRegioninfoOnFilesystem();
// Remove temporary data left over from old regions
cleanupTmpDir();
// Load in all the HStores. Get maximum seqid.
long maxSeqId = -1;
for (HColumnDescriptor c : this.regionInfo.getTableDesc().getFamilies()) {
Store store = instantiateHStore(this.tableDir, c);
this.stores.put(c.getName(), store);
long storeSeqId = store.getMaxSequenceId();
if (storeSeqId > maxSeqId) {
maxSeqId = storeSeqId;
}
}
// Recover any edits if available.
maxSeqId = replayRecoveredEditsIfAny(this.regiondir, maxSeqId, reporter);
// Get rid of any splits or merges that were lost in-progress. Clean out
// these directories here on open. We may be opening a region that was
// being split but we crashed in the middle of it all.
SplitTransaction.cleanupAnySplitDetritus(this);
FSUtils.deleteDirectory(this.fs, new Path(regiondir, MERGEDIR));
this.writestate.setReadOnly(this.regionInfo.getTableDesc().isReadOnly());
this.writestate.compacting = false;
this.lastFlushTime = EnvironmentEdgeManager.currentTimeMillis();
// Use maximum of log sequenceid or that which was found in stores
// (particularly if no recovered edits, seqid will be -1).
long nextSeqid = maxSeqId + 1;
LOG.info("Onlined " + this.toString() + "; next sequenceid=" + nextSeqid);
return nextSeqid;
}
/*
* Move any passed HStore files into place (if any). Used to pick up split
* files and any merges from splits and merges dirs.
* @param initialFiles
* @throws IOException
*/
static void moveInitialFilesIntoPlace(final FileSystem fs,
final Path initialFiles, final Path regiondir)
throws IOException {
if (initialFiles != null && fs.exists(initialFiles)) {
if (!fs.rename(initialFiles, regiondir)) {
LOG.warn("Unable to rename " + initialFiles + " to " + regiondir);
}
}
}
/**
* @return True if this region has references.
*/
public boolean hasReferences() {
for (Store store : this.stores.values()) {
for (StoreFile sf : store.getStorefiles()) {
// Found a reference, return.
if (sf.isReference()) return true;
}
}
return false;
}
/*
 * Write out an info file under the region directory. Useful for recovering
* mangled regions.
* @throws IOException
*/
private void checkRegioninfoOnFilesystem() throws IOException {
Path regioninfoPath = new Path(this.regiondir, REGIONINFO_FILE);
if (this.fs.exists(regioninfoPath) &&
this.fs.getFileStatus(regioninfoPath).getLen() > 0) {
return;
}
// Create in tmpdir and then move into place in case we crash after
// create but before close. If we don't successfully close the file,
// subsequent region reopens will fail below because the create is
// already registered in the NameNode.
Path tmpPath = new Path(getTmpDir(), REGIONINFO_FILE);
FSDataOutputStream out = this.fs.create(tmpPath, true);
try {
this.regionInfo.write(out);
out.write('\n');
out.write('\n');
out.write(Bytes.toBytes(this.regionInfo.toString()));
} finally {
out.close();
}
if (!fs.rename(tmpPath, regioninfoPath)) {
throw new IOException("Unable to rename " + tmpPath + " to " +
regioninfoPath);
}
}
/** @return a HRegionInfo object for this region */
public HRegionInfo getRegionInfo() {
return this.regionInfo;
}
/** @return true if region is closed */
public boolean isClosed() {
return this.closed.get();
}
/**
* @return True if closing process has started.
*/
public boolean isClosing() {
return this.closing.get();
}
boolean areWritesEnabled() {
synchronized(this.writestate) {
return this.writestate.writesEnabled;
}
}
public ReadWriteConsistencyControl getRWCC() {
return rwcc;
}
/**
* Close down this HRegion. Flush the cache, shut down each HStore, don't
* service any more calls.
*
* <p>This method could take some time to execute, so don't call it from a
* time-sensitive thread.
*
 * @return Vector of all the storage files that the HRegion's component
 * HStores make use of. It's a list of all HStoreFile objects. Returns null
 * if already closed or if judged that it should not close.
*
* @throws IOException e
*/
public List<StoreFile> close() throws IOException {
return close(false);
}
/**
* Close down this HRegion. Flush the cache unless abort parameter is true,
* Shut down each HStore, don't service any more calls.
*
* This method could take some time to execute, so don't call it from a
* time-sensitive thread.
*
* @param abort true if server is aborting (only during testing)
* @return Vector of all the storage files that the HRegion's component
* HStores make use of. It's a list of HStoreFile objects. Can be null if
* we are not to close at this time or we are already closed.
*
* @throws IOException e
*/
public List<StoreFile> close(final boolean abort) throws IOException {
// Only allow one thread to close at a time. Serialize them so dual
// threads attempting to close will run up against each other.
synchronized (closeLock) {
return doClose(abort);
}
}
private List<StoreFile> doClose(final boolean abort)
throws IOException {
if (isClosed()) {
LOG.warn("Region " + this + " already closed");
return null;
}
boolean wasFlushing = false;
synchronized (writestate) {
// Disable compacting and flushing by background threads for this
// region.
writestate.writesEnabled = false;
wasFlushing = writestate.flushing;
LOG.debug("Closing " + this + ": disabling compactions & flushes");
while (writestate.compacting || writestate.flushing) {
LOG.debug("waiting for" +
(writestate.compacting ? " compaction" : "") +
(writestate.flushing ?
(writestate.compacting ? "," : "") + " cache flush" :
"") + " to complete for region " + this);
try {
writestate.wait();
} catch (InterruptedException iex) {
// continue
}
}
}
// If we were not just flushing, is it worth doing a preflush...one
// that will clear out of the bulk of the memstore before we put up
// the close flag?
if (!abort && !wasFlushing && worthPreFlushing()) {
LOG.info("Running close preflush of " + this.getRegionNameAsString());
internalFlushcache();
}
this.closing.set(true);
lock.writeLock().lock();
try {
if (this.isClosed()) {
// SplitTransaction handles the null
return null;
}
LOG.debug("Updates disabled for region " + this);
// Don't flush the cache if we are aborting
if (!abort) {
internalFlushcache();
}
List<StoreFile> result = new ArrayList<StoreFile>();
for (Store store : stores.values()) {
result.addAll(store.close());
}
this.closed.set(true);
LOG.info("Closed " + this);
return result;
} finally {
lock.writeLock().unlock();
}
}
/**
 * @return True if it's worth doing a flush before we put up the close flag.
*/
private boolean worthPreFlushing() {
return this.memstoreSize.get() >
this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
}
//////////////////////////////////////////////////////////////////////////////
// HRegion accessors
//////////////////////////////////////////////////////////////////////////////
/** @return start key for region */
public byte [] getStartKey() {
return this.regionInfo.getStartKey();
}
/** @return end key for region */
public byte [] getEndKey() {
return this.regionInfo.getEndKey();
}
/** @return region id */
public long getRegionId() {
return this.regionInfo.getRegionId();
}
/** @return region name */
public byte [] getRegionName() {
return this.regionInfo.getRegionName();
}
/** @return region name as string for logging */
public String getRegionNameAsString() {
return this.regionInfo.getRegionNameAsString();
}
/** @return HTableDescriptor for this region */
public HTableDescriptor getTableDesc() {
return this.regionInfo.getTableDesc();
}
/** @return HLog in use for this region */
public HLog getLog() {
return this.log;
}
/** @return Configuration object */
public Configuration getConf() {
return this.conf;
}
/** @return region directory Path */
public Path getRegionDir() {
return this.regiondir;
}
/**
* Computes the Path of the HRegion
*
* @param tabledir qualified path for table
* @param name ENCODED region name
* @return Path of HRegion directory
*/
public static Path getRegionDir(final Path tabledir, final String name) {
return new Path(tabledir, name);
}
/** @return FileSystem being used by this region */
public FileSystem getFilesystem() {
return this.fs;
}
/** @return info about the last compaction <time, size> */
public Pair<Long,Long> getLastCompactInfo() {
return this.lastCompactInfo;
}
/** @return the last time the region was flushed */
public long getLastFlushTime() {
return this.lastFlushTime;
}
/** @return info about the last flushes <time, size> */
public List<Pair<Long,Long>> getRecentFlushInfo() {
this.lock.readLock().lock();
List<Pair<Long,Long>> ret = this.recentFlushes;
this.recentFlushes = new ArrayList<Pair<Long,Long>>();
this.lock.readLock().unlock();
return ret;
}
//////////////////////////////////////////////////////////////////////////////
// HRegion maintenance.
//
// These methods are meant to be called periodically by the HRegionServer for
// upkeep.
//////////////////////////////////////////////////////////////////////////////
/** @return returns size of largest HStore. */
public long getLargestHStoreSize() {
long size = 0;
for (Store h: stores.values()) {
long storeSize = h.getSize();
if (storeSize > size) {
size = storeSize;
}
}
return size;
}
/*
* Do preparation for pending compaction.
* @throws IOException
*/
void doRegionCompactionPrep() throws IOException {
}
/*
* Removes the temporary directory for this Store.
*/
private void cleanupTmpDir() throws IOException {
FSUtils.deleteDirectory(this.fs, getTmpDir());
}
/**
 * Get the temporary directory for this region. This directory
* will have its contents removed when the region is reopened.
*/
Path getTmpDir() {
return new Path(getRegionDir(), ".tmp");
}
void setForceMajorCompaction(final boolean b) {
this.forceMajorCompaction = b;
}
boolean getForceMajorCompaction() {
return this.forceMajorCompaction;
}
/**
* Called by compaction thread and after region is opened to compact the
* HStores if necessary.
*
* <p>This operation could block for a long time, so don't call it from a
* time-sensitive thread.
*
* Note that no locking is necessary at this level because compaction only
* conflicts with a region split, and that cannot happen because the region
* server does them sequentially and not in parallel.
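 *
 * <p>Illustrative only -- how a caller might act on the returned split row:
 * <pre>
 *   byte [] splitRow = region.compactStores();
 *   if (splitRow != null) {
 *     // the region wants a split; splitRow is the suggested split point
 *   }
 * </pre>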
*
* @return mid key if split is needed
* @throws IOException e
*/
public byte [] compactStores() throws IOException {
boolean majorCompaction = this.forceMajorCompaction;
this.forceMajorCompaction = false;
return compactStores(majorCompaction);
}
/*
* Called by compaction thread and after region is opened to compact the
* HStores if necessary.
*
* <p>This operation could block for a long time, so don't call it from a
* time-sensitive thread.
*
* Note that no locking is necessary at this level because compaction only
* conflicts with a region split, and that cannot happen because the region
* server does them sequentially and not in parallel.
*
* @param majorCompaction True to force a major compaction regardless of thresholds
* @return split row if split is needed
* @throws IOException e
*/
byte [] compactStores(final boolean majorCompaction)
throws IOException {
if (this.closing.get()) {
LOG.debug("Skipping compaction on " + this + " because closing");
return null;
}
lock.readLock().lock();
this.lastCompactInfo = null;
try {
if (this.closed.get()) {
LOG.debug("Skipping compaction on " + this + " because closed");
return null;
}
byte [] splitRow = null;
try {
synchronized (writestate) {
if (!writestate.compacting && writestate.writesEnabled) {
writestate.compacting = true;
} else {
LOG.info("NOT compacting region " + this +
": compacting=" + writestate.compacting + ", writesEnabled=" +
writestate.writesEnabled);
return splitRow;
}
}
LOG.info("Starting" + (majorCompaction? " major " : " ") +
"compaction on region " + this);
long startTime = EnvironmentEdgeManager.currentTimeMillis();
doRegionCompactionPrep();
long lastCompactSize = 0;
long maxSize = -1;
boolean completed = false;
try {
for (Store store: stores.values()) {
final Store.StoreSize ss = store.compact(majorCompaction);
lastCompactSize += store.getLastCompactSize();
if (ss != null && ss.getSize() > maxSize) {
maxSize = ss.getSize();
splitRow = ss.getSplitRow();
}
}
completed = true;
} catch (InterruptedIOException iioe) {
LOG.info("compaction interrupted by user: ", iioe);
} finally {
long now = EnvironmentEdgeManager.currentTimeMillis();
LOG.info(((completed) ? "completed" : "aborted")
+ " compaction on region " + this
+ " after " + StringUtils.formatTimeDiff(now, startTime));
if (completed) {
this.lastCompactInfo =
new Pair<Long,Long>((now - startTime) / 1000, lastCompactSize);
}
}
} finally {
synchronized (writestate) {
writestate.compacting = false;
writestate.notifyAll();
}
}
return splitRow;
} finally {
lock.readLock().unlock();
}
}
/**
* Flush the cache.
*
* When this method is called the cache will be flushed unless:
* <ol>
* <li>the cache is empty</li>
* <li>the region is closed.</li>
* <li>a flush is already in progress</li>
* <li>writes are disabled</li>
* </ol>
*
* <p>This method may block for some time, so it should not be called from a
* time-sensitive thread.
*
* @return true if cache was flushed
*
* @throws IOException general io exceptions
* @throws DroppedSnapshotException Thrown when replay of hlog is required
* because a Snapshot was not properly persisted.
*/
public boolean flushcache() throws IOException {
// fail-fast instead of waiting on the lock
if (this.closing.get()) {
LOG.debug("Skipping flush on " + this + " because closing");
return false;
}
lock.readLock().lock();
try {
if (this.closed.get()) {
LOG.debug("Skipping flush on " + this + " because closed");
return false;
}
try {
synchronized (writestate) {
if (!writestate.flushing && writestate.writesEnabled) {
this.writestate.flushing = true;
} else {
if (LOG.isDebugEnabled()) {
LOG.debug("NOT flushing memstore for region " + this +
", flushing=" +
writestate.flushing + ", writesEnabled=" +
writestate.writesEnabled);
}
return false;
}
}
return internalFlushcache();
} finally {
synchronized (writestate) {
writestate.flushing = false;
this.writestate.flushRequested = false;
writestate.notifyAll();
}
}
} finally {
lock.readLock().unlock();
}
}
/**
* Flush the memstore.
*
* Flushing the memstore is a little tricky. We have a lot of updates in the
* memstore, all of which have also been written to the log. We need to
* write those updates in the memstore out to disk, while being able to
* process reads/writes as much as possible during the flush operation. Also,
* the log has to state clearly the point in time at which the memstore was
* flushed. (That way, during recovery, we know when we can rely on the
* on-disk flushed structures and when we have to recover the memstore from
* the log.)
*
* <p>So, we have a three-step process:
*
 * <ul><li>A. Flush the memstore to the on-disk stores, noting the current
 * sequence ID for the log.</li>
*
* <li>B. Write a FLUSHCACHE-COMPLETE message to the log, using the sequence
* ID that was current at the time of memstore-flush.</li>
*
* <li>C. Get rid of the memstore structures that are now redundant, as
* they've been flushed to the on-disk HStores.</li>
* </ul>
* <p>This method is protected, but can be accessed via several public
* routes.
*
* <p> This method may block for some time.
*
* @return true if the region needs compacting
*
* @throws IOException general io exceptions
* @throws DroppedSnapshotException Thrown when replay of hlog is required
* because a Snapshot was not properly persisted.
*/
protected boolean internalFlushcache() throws IOException {
return internalFlushcache(this.log, -1);
}
/**
* @param wal Null if we're NOT to go via hlog/wal.
* @param myseqid The seqid to use if <code>wal</code> is null writing out
* flush file.
* @return true if the region needs compacting
* @throws IOException
* @see #internalFlushcache()
*/
protected boolean internalFlushcache(final HLog wal, final long myseqid)
throws IOException {
final long startTime = EnvironmentEdgeManager.currentTimeMillis();
// Clear flush flag.
// Record latest flush time
this.lastFlushTime = startTime;
// If nothing to flush, return and avoid logging start/stop flush.
if (this.memstoreSize.get() <= 0) {
return false;
}
if (LOG.isDebugEnabled()) {
LOG.debug("Started memstore flush for " + this +
", current region memstore size " +
StringUtils.humanReadableInt(this.memstoreSize.get()) +
((wal != null)? "": "; wal is null, using passed sequenceid=" + myseqid));
}
// Stop updates while we snapshot the memstore of all stores. We only have
// to do this for a moment. It's quick. The subsequent sequence id that
// goes into the HLog after we've flushed all these snapshots also goes
// into the info file that sits beside the flushed files.
// We also set the memstore size to zero here before we allow updates
// again so its value will represent the size of the updates received
// during the flush
long sequenceId = -1L;
long completeSequenceId = -1L;
// We have to take a write lock during snapshot, or else a write could
// end up in both snapshot and memstore (makes it difficult to do atomic
// rows then)
this.updatesLock.writeLock().lock();
final long currentMemStoreSize = this.memstoreSize.get();
List<StoreFlusher> storeFlushers = new ArrayList<StoreFlusher>(stores.size());
try {
sequenceId = (wal == null)? myseqid: wal.startCacheFlush();
completeSequenceId = this.getCompleteCacheFlushSequenceId(sequenceId);
for (Store s : stores.values()) {
storeFlushers.add(s.getStoreFlusher(completeSequenceId));
}
// prepare flush (take a snapshot)
for (StoreFlusher flusher : storeFlushers) {
flusher.prepare();
}
} finally {
this.updatesLock.writeLock().unlock();
}
LOG.debug("Finished snapshotting, commencing flushing stores");
// Any failure from here on out will be catastrophic, requiring a server
// restart so the hlog content can be replayed and put back into the
// memstore. Otherwise, the snapshot content, though backed up in the
// hlog, will not be part of the running server's state.
boolean compactionRequested = false;
try {
// A. Flush memstore to all the HStores.
// Keep running vector of all store files that includes both old and the
// just-made new flush store file.
for (StoreFlusher flusher : storeFlushers) {
flusher.flushCache();
}
// Switch snapshot (in memstore) -> new hfile (thus causing
// all the store scanners to reset/reseek).
for (StoreFlusher flusher : storeFlushers) {
boolean needsCompaction = flusher.commit();
if (needsCompaction) {
compactionRequested = true;
}
}
storeFlushers.clear();
// Set down the memstore size by amount of flush.
this.memstoreSize.addAndGet(-currentMemStoreSize);
} catch (Throwable t) {
// An exception here means that the snapshot was not persisted.
// The hlog needs to be replayed so its content is restored to memstore.
// Currently, only a server restart will do this.
// We used to only catch IOEs but its possible that we'd get other
// exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
// all and sundry.
if (wal != null) wal.abortCacheFlush();
DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
Bytes.toStringBinary(getRegionName()));
dse.initCause(t);
throw dse;
}
// If we get to here, the HStores have been written. If we get an
// error in completeCacheFlush it will release the lock it is holding
// B. Write a FLUSHCACHE-COMPLETE message to the log.
// This tells future readers that the HStores were emitted correctly,
// and that all updates to the log for this regionName that have lower
// log-sequence-ids can be safely ignored.
if (wal != null) {
wal.completeCacheFlush(this.regionInfo.getEncodedNameAsBytes(),
regionInfo.getTableDesc().getName(), completeSequenceId,
this.getRegionInfo().isMetaRegion());
}
// C. Finally notify anyone waiting on memstore to clear:
// e.g. checkResources().
synchronized (this) {
notifyAll(); // FindBugs NN_NAKED_NOTIFY
}
long time = EnvironmentEdgeManager.currentTimeMillis() - startTime;
if (LOG.isDebugEnabled()) {
LOG.info("Finished memstore flush of ~" +
StringUtils.humanReadableInt(currentMemStoreSize) + " for region " +
this + " in " + time + "ms, sequenceid=" + sequenceId +
", compaction requested=" + compactionRequested +
((wal == null)? "; wal=null": ""));
}
this.recentFlushes.add(new Pair<Long,Long>(time/1000,currentMemStoreSize));
return compactionRequested;
}
/**
* Get the sequence number to be associated with this cache flush. Used by
* TransactionalRegion to not complete pending transactions.
*
*
* @param currentSequenceId
* @return sequence id to complete the cache flush with
*/
protected long getCompleteCacheFlushSequenceId(long currentSequenceId) {
return currentSequenceId;
}
//////////////////////////////////////////////////////////////////////////////
// get() methods for client use.
//////////////////////////////////////////////////////////////////////////////
/**
 * Return all the data for the row that matches <i>row</i> exactly,
 * or the one that immediately precedes it.
*
* @param row row key
* @return map of values
* @throws IOException
*/
Result getClosestRowBefore(final byte [] row)
throws IOException{
return getClosestRowBefore(row, HConstants.CATALOG_FAMILY);
}
/**
 * Return all the data for the row that matches <i>row</i> exactly,
 * or the one that immediately precedes it.
*
* @param row row key
* @param family column family to find on
* @return map of values
* @throws IOException read exceptions
*/
public Result getClosestRowBefore(final byte [] row, final byte [] family)
throws IOException {
// look across all the HStores for this region and determine what the
// closest key is across all column families, since the data may be sparse
KeyValue key = null;
checkRow(row);
startRegionOperation();
try {
Store store = getStore(family);
KeyValue kv = new KeyValue(row, HConstants.LATEST_TIMESTAMP);
// get the closest key. (HStore.getRowKeyAtOrBefore can return null)
key = store.getRowKeyAtOrBefore(kv);
if (key == null) {
return null;
}
Get get = new Get(key.getRow());
get.addFamily(family);
return get(get, null);
} finally {
closeRegionOperation();
}
}
/**
* Return an iterator that scans over the HRegion, returning the indicated
* columns and rows specified by the {@link Scan}.
* <p>
* This Iterator must be closed by the caller.
*
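 * <p>A hedged sketch of the scan loop (startRow is hypothetical):
 * <pre>
 *   InternalScanner s = region.getScanner(new Scan(startRow));
 *   try {
 *     List&lt;KeyValue&gt; results = new ArrayList&lt;KeyValue&gt;();
 *     boolean more;
 *     do {
 *       more = s.next(results);
 *       // process this row's KeyValues, then reuse the list
 *       results.clear();
 *     } while (more);
 *   } finally {
 *     s.close();
 *   }
 * </pre>
 *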
* @param scan configured {@link Scan}
* @return InternalScanner
* @throws IOException read exceptions
*/
public InternalScanner getScanner(Scan scan)
throws IOException {
return getScanner(scan, null);
}
protected InternalScanner getScanner(Scan scan, List<KeyValueScanner> additionalScanners) throws IOException {
startRegionOperation();
try {
// Verify families are all valid
if(scan.hasFamilies()) {
for(byte [] family : scan.getFamilyMap().keySet()) {
checkFamily(family);
}
} else { // Adding all families to scanner
for(byte[] family: regionInfo.getTableDesc().getFamiliesKeys()){
scan.addFamily(family);
}
}
return instantiateInternalScanner(scan, additionalScanners);
} finally {
closeRegionOperation();
}
}
protected InternalScanner instantiateInternalScanner(Scan scan, List<KeyValueScanner> additionalScanners) throws IOException {
return new RegionScanner(scan, additionalScanners);
}
/*
* @param delete The passed delete is modified by this method. WARNING!
*/
private void prepareDelete(Delete delete) throws IOException {
// Check to see if this is a deleteRow insert
if(delete.getFamilyMap().isEmpty()){
for(byte [] family : regionInfo.getTableDesc().getFamiliesKeys()){
// Don't eat the timestamp
delete.deleteFamily(family, delete.getTimeStamp());
}
} else {
for(byte [] family : delete.getFamilyMap().keySet()) {
if(family == null) {
throw new NoSuchColumnFamilyException("Null family is invalid");
}
checkFamily(family);
}
}
}
//////////////////////////////////////////////////////////////////////////////
// set() methods for client use.
//////////////////////////////////////////////////////////////////////////////
/**
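 * Apply the given delete to this region. A hedged usage sketch (row,
 * family and qualifier are hypothetical):
 * <pre>
 *   Delete d = new Delete(row);
 *   d.deleteColumn(family, qualifier); // leave the family map empty to delete the whole row
 *   region.delete(d, null, true);
 * </pre>
 *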
* @param delete delete object
 * @param lockid existing lock id, or null to grab a new lock
 * @param writeToWAL append to the write ahead log or not
* @throws IOException read exceptions
*/
public void delete(Delete delete, Integer lockid, boolean writeToWAL)
throws IOException {
checkReadOnly();
checkResources();
Integer lid = null;
startRegionOperation();
try {
byte [] row = delete.getRow();
// If we did not pass an existing row lock, obtain a new one
lid = getLock(lockid, row, true);
// All edits for the given row (across all column families) must happen atomically.
prepareDelete(delete);
delete(delete.getFamilyMap(), writeToWAL);
} finally {
if(lockid == null) releaseRowLock(lid);
closeRegionOperation();
}
}
/**
* @param familyMap map of family to edits for the given family.
* @param writeToWAL
* @throws IOException
*/
public void delete(Map<byte[], List<KeyValue>> familyMap, boolean writeToWAL)
throws IOException {
long now = EnvironmentEdgeManager.currentTimeMillis();
byte [] byteNow = Bytes.toBytes(now);
boolean flush = false;
updatesLock.readLock().lock();
try {
for (Map.Entry<byte[], List<KeyValue>> e : familyMap.entrySet()) {
byte[] family = e.getKey();
List<KeyValue> kvs = e.getValue();
Map<byte[], Integer> kvCount = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
for (KeyValue kv: kvs) {
// Check if time is LATEST, change to time of most recent addition if so
// This is expensive.
if (kv.isLatestTimestamp() && kv.isDeleteType()) {
byte[] qual = kv.getQualifier();
if (qual == null) qual = HConstants.EMPTY_BYTE_ARRAY;
Integer count = kvCount.get(qual);
if (count == null) {
kvCount.put(qual, 1);
} else {
kvCount.put(qual, count + 1);
}
count = kvCount.get(qual);
Get get = new Get(kv.getRow());
get.setMaxVersions(count);
get.addColumn(family, qual);
List<KeyValue> result = get(get);
if (result.size() < count) {
// Nothing to delete
kv.updateLatestStamp(byteNow);
continue;
}
if (result.size() > count) {
throw new RuntimeException("Unexpected size: " + result.size());
}
KeyValue getkv = result.get(count - 1);
Bytes.putBytes(kv.getBuffer(), kv.getTimestampOffset(),
getkv.getBuffer(), getkv.getTimestampOffset(), Bytes.SIZEOF_LONG);
} else {
kv.updateLatestStamp(byteNow);
}
}
}
if (writeToWAL) {
// write/sync to WAL should happen before we touch memstore.
//
// If order is reversed, i.e. we write to memstore first, and
// for some reason fail to write/sync to commit log, the memstore
// will contain uncommitted transactions.
//
// bunch up all edits across all column families into a
// single WALEdit.
WALEdit walEdit = new WALEdit();
addFamilyMapToWALEdit(familyMap, walEdit);
this.log.append(regionInfo, regionInfo.getTableDesc().getName(),
walEdit, now);
}
// Now make changes to the memstore.
long addedSize = applyFamilyMapToMemstore(familyMap);
flush = isFlushSize(memstoreSize.addAndGet(addedSize));
} finally {
this.updatesLock.readLock().unlock();
}
if (flush) {
// Request a cache flush. Do it outside update lock.
requestFlush();
}
}
/**
* @param put
* @throws IOException
*/
public void put(Put put) throws IOException {
this.put(put, null, put.getWriteToWAL());
}
/**
* @param put
* @param writeToWAL
* @throws IOException
*/
public void put(Put put, boolean writeToWAL) throws IOException {
this.put(put, null, writeToWAL);
}
/**
* @param put
* @param lockid
* @throws IOException
*/
public void put(Put put, Integer lockid) throws IOException {
this.put(put, lockid, put.getWriteToWAL());
}
/**
* @param put
* @param lockid
* @param writeToWAL
* @throws IOException
*/
public void put(Put put, Integer lockid, boolean writeToWAL)
throws IOException {
checkReadOnly();
// Do a rough check that we have resources to accept a write. The check is
// 'rough' in that between the resource check and the call to obtain a
// read lock, resources may run out. For now, the thought is that this
// will be extremely rare; we'll deal with it when it happens.
checkResources();
startRegionOperation();
try {
// We obtain a per-row lock, so other clients will block while one client
// performs an update. The read lock is released by the client calling
// #commit or #abort or if the HRegionServer lease on the lock expires.
// See HRegionServer#RegionListener for how the expire on HRegionServer
// invokes a HRegion#abort.
byte [] row = put.getRow();
// If we did not pass an existing row lock, obtain a new one
Integer lid = getLock(lockid, row, true);
try {
// All edits for the given row (across all column families) must happen atomically.
put(put.getFamilyMap(), writeToWAL);
} finally {
if(lockid == null) releaseRowLock(lid);
}
} finally {
closeRegionOperation();
}
}
/**
* Struct-like class that tracks the progress of a batch operation,
* accumulating status codes and tracking the index at which processing
* is proceeding.
*/
private static class BatchOperationInProgress<T> {
T[] operations;
OperationStatusCode[] retCodes;
int nextIndexToProcess = 0;
public BatchOperationInProgress(T[] operations) {
this.operations = operations;
retCodes = new OperationStatusCode[operations.length];
Arrays.fill(retCodes, OperationStatusCode.NOT_RUN);
}
public boolean isDone() {
return nextIndexToProcess == operations.length;
}
}
/**
* Perform a batch put with no pre-specified locks
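 * <p>For illustration (the Puts are hypothetical). Statuses come back
 * positionally, so one bad column family fails only that entry:
 * <pre>
 *   OperationStatusCode[] codes = region.put(new Put[] { put1, put2 });
 *   for (OperationStatusCode code : codes) {
 *     if (code != OperationStatusCode.SUCCESS) {
 *       // handle the failed put, e.g. BAD_FAMILY
 *     }
 *   }
 * </pre>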
* @see HRegion#put(Pair[])
*/
public OperationStatusCode[] put(Put[] puts) throws IOException {
@SuppressWarnings("unchecked")
Pair<Put, Integer> putsAndLocks[] = new Pair[puts.length];
for (int i = 0; i < puts.length; i++) {
putsAndLocks[i] = new Pair<Put, Integer>(puts[i], null);
}
return put(putsAndLocks);
}
/**
* Perform a batch of puts.
* @param putsAndLocks the list of puts paired with their requested lock IDs.
* @throws IOException
*/
public OperationStatusCode[] put(Pair<Put, Integer>[] putsAndLocks) throws IOException {
BatchOperationInProgress<Pair<Put, Integer>> batchOp =
new BatchOperationInProgress<Pair<Put,Integer>>(putsAndLocks);
while (!batchOp.isDone()) {
checkReadOnly();
checkResources();
long newSize;
startRegionOperation();
try {
long addedSize = doMiniBatchPut(batchOp);
newSize = memstoreSize.addAndGet(addedSize);
} finally {
closeRegionOperation();
}
if (isFlushSize(newSize)) {
requestFlush();
}
}
return batchOp.retCodes;
}
private long doMiniBatchPut(BatchOperationInProgress<Pair<Put, Integer>> batchOp) throws IOException {
long now = EnvironmentEdgeManager.currentTimeMillis();
byte[] byteNow = Bytes.toBytes(now);
boolean locked = false;
/** Keep track of the locks we hold so we can release them in finally clause */
List<Integer> acquiredLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
// We try to set up a batch in the range [firstIndex,lastIndexExclusive)
int firstIndex = batchOp.nextIndexToProcess;
int lastIndexExclusive = firstIndex;
boolean success = false;
try {
// ------------------------------------
// STEP 1. Try to acquire as many locks as we can, and ensure
// we acquire at least one.
// ----------------------------------
int numReadyToWrite = 0;
while (lastIndexExclusive < batchOp.operations.length) {
Pair<Put, Integer> nextPair = batchOp.operations[lastIndexExclusive];
Put put = nextPair.getFirst();
Integer providedLockId = nextPair.getSecond();
// Check the families in the put. If bad, skip this one.
try {
checkFamilies(put.getFamilyMap().keySet());
} catch (NoSuchColumnFamilyException nscf) {
LOG.warn("No such column family in batch put", nscf);
batchOp.retCodes[lastIndexExclusive] = OperationStatusCode.BAD_FAMILY;
lastIndexExclusive++;
continue;
}
// If we haven't got any rows in our batch, we should block to
// get the next one.
boolean shouldBlock = numReadyToWrite == 0;
Integer acquiredLockId = getLock(providedLockId, put.getRow(), shouldBlock);
if (acquiredLockId == null) {
// We failed to grab another lock
assert !shouldBlock : "Should never fail to get lock when blocking";
break; // stop acquiring more rows for this batch
}
if (providedLockId == null) {
acquiredLocks.add(acquiredLockId);
}
lastIndexExclusive++;
numReadyToWrite++;
}
// Nothing to put -- an exception in the above such as NoSuchColumnFamily?
if (numReadyToWrite <= 0) return 0L;
// We've now grabbed as many puts off the list as we can
// ------------------------------------
// STEP 2. Update any LATEST_TIMESTAMP timestamps
// ----------------------------------
for (int i = firstIndex; i < lastIndexExclusive; i++) {
updateKVTimestamps(
batchOp.operations[i].getFirst().getFamilyMap().values(),
byteNow);
}
this.updatesLock.readLock().lock();
locked = true;
// ------------------------------------
// STEP 3. Write to WAL
// ----------------------------------
WALEdit walEdit = new WALEdit();
for (int i = firstIndex; i < lastIndexExclusive; i++) {
// Skip puts that were determined to be invalid during preprocessing
if (batchOp.retCodes[i] != OperationStatusCode.NOT_RUN) continue;
Put p = batchOp.operations[i].getFirst();
if (!p.getWriteToWAL()) continue;
addFamilyMapToWALEdit(p.getFamilyMap(), walEdit);
}
// Append the edit to WAL
this.log.append(regionInfo, regionInfo.getTableDesc().getName(),
walEdit, now);
// ------------------------------------
// STEP 4. Write back to memstore
// ----------------------------------
long addedSize = 0;
for (int i = firstIndex; i < lastIndexExclusive; i++) {
if (batchOp.retCodes[i] != OperationStatusCode.NOT_RUN) continue;
Put p = batchOp.operations[i].getFirst();
addedSize += applyFamilyMapToMemstore(p.getFamilyMap());
batchOp.retCodes[i] = OperationStatusCode.SUCCESS;
}
success = true;
return addedSize;
} finally {
if (locked)
this.updatesLock.readLock().unlock();
for (Integer toRelease : acquiredLocks) {
releaseRowLock(toRelease);
}
if (!success) {
for (int i = firstIndex; i < lastIndexExclusive; i++) {
if (batchOp.retCodes[i] == OperationStatusCode.NOT_RUN) {
batchOp.retCodes[i] = OperationStatusCode.FAILURE;
}
}
}
batchOp.nextIndexToProcess = lastIndexExclusive;
}
}
//TODO, Think that gets/puts and deletes should be refactored a bit so that
//the getting of the lock happens before, so that you would just pass it into
//the methods. So in the case of checkAndMutate you could just do lockRow,
//get, put, unlockRow or something
/**
*
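 * Atomically check a single column value and, if it matches
 * <code>expectedValue</code>, apply the passed Put or Delete. A hedged
 * sketch (row/family/qualifier are hypothetical):
 * <pre>
 *   Put p = new Put(row);
 *   p.add(family, qualifier, Bytes.toBytes("new-value"));
 *   boolean applied = region.checkAndMutate(row, family, qualifier,
 *       Bytes.toBytes("old-value"), p, null, true);
 * </pre>
 *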
* @param row
* @param family
* @param qualifier
 * @param expectedValue
 * @param w the Put or Delete to execute if the check succeeds
* @param lockId
* @param writeToWAL
* @throws IOException
 * @return true if the mutation was executed, false otherwise
*/
public boolean checkAndMutate(byte [] row, byte [] family, byte [] qualifier,
byte [] expectedValue, Writable w, Integer lockId, boolean writeToWAL)
throws IOException{
checkReadOnly();
//TODO, add check for value length or maybe even better move this to the
//client if this becomes a global setting
checkResources();
boolean isPut = w instanceof Put;
if (!isPut && !(w instanceof Delete))
throw new DoNotRetryIOException("Action must be Put or Delete");
Row r = (Row)w;
if (Bytes.compareTo(row, r.getRow()) != 0) {
throw new DoNotRetryIOException("Action's getRow must match the passed row");
}
startRegionOperation();
try {
RowLock lock = isPut ? ((Put)w).getRowLock() : ((Delete)w).getRowLock();
Get get = new Get(row, lock);
checkFamily(family);
get.addColumn(family, qualifier);
// Lock row
Integer lid = getLock(lockId, get.getRow(), true);
List<KeyValue> result = new ArrayList<KeyValue>();
try {
result = get(get);
boolean matches = false;
if (result.size() == 0 &&
(expectedValue == null || expectedValue.length == 0)) {
matches = true;
} else if (result.size() == 1) {
//Compare the expected value with the actual value
byte [] actualValue = result.get(0).getValue();
matches = Bytes.equals(expectedValue, actualValue);
}
//If matches put the new put or delete the new delete
if (matches) {
// All edits for the given row (across all column families) must happen atomically.
if (isPut) {
put(((Put)w).getFamilyMap(), writeToWAL);
} else {
Delete d = (Delete)w;
prepareDelete(d);
delete(d.getFamilyMap(), writeToWAL);
}
return true;
}
return false;
} finally {
if(lockId == null) releaseRowLock(lid);
}
} finally {
closeRegionOperation();
}
}
/**
* Replaces any KV timestamps set to {@link HConstants#LATEST_TIMESTAMP}
* with the provided current timestamp.
*/
private void updateKVTimestamps(
final Iterable<List<KeyValue>> keyLists, final byte[] now) {
for (List<KeyValue> keys: keyLists) {
if (keys == null) continue;
for (KeyValue key : keys) {
key.updateLatestStamp(now);
}
}
}
/*
 * Check whether we have the resources to support an update.
 *
 * Here we synchronize on HRegion, a broad scoped lock. It's appropriate
 * given that what we're figuring out here is whether this region can take
 * on writes. At the time of writing this is the only method with a
 * synchronize, apart from the synchronize on 'this' inside
 * internalFlushcache used to send the notify.
*/
private void checkResources() {
// If catalog region, do not impose resource constraints or block updates.
if (this.getRegionInfo().isMetaRegion()) return;
boolean blocked = false;
while (this.memstoreSize.get() > this.blockingMemStoreSize) {
requestFlush();
if (!blocked) {
LOG.info("Blocking updates for '" + Thread.currentThread().getName() +
"' on region " + Bytes.toStringBinary(getRegionName()) +
": memstore size " +
StringUtils.humanReadableInt(this.memstoreSize.get()) +
" is >= than blocking " +
StringUtils.humanReadableInt(this.blockingMemStoreSize) + " size");
}
blocked = true;
synchronized(this) {
try {
wait(threadWakeFrequency);
} catch (InterruptedException e) {
// continue;
}
}
}
if (blocked) {
LOG.info("Unblocking updates for region " + this + " '"
+ Thread.currentThread().getName() + "'");
}
}
/**
* @throws IOException Throws exception if region is in read-only mode.
*/
protected void checkReadOnly() throws IOException {
if (this.writestate.isReadOnly()) {
throw new IOException("region is read only");
}
}
/**
* Add updates first to the hlog and then add values to memstore.
* Warning: Assumption is caller has lock on passed in row.
* @param family
* @param edits Cell updates by column
* @throws IOException
*/
private void put(final byte [] family, final List<KeyValue> edits)
throws IOException {
Map<byte[], List<KeyValue>> familyMap = new HashMap<byte[], List<KeyValue>>();
familyMap.put(family, edits);
this.put(familyMap, true);
}
/**
* Add updates first to the hlog (if writeToWal) and then add values to memstore.
* Warning: Assumption is caller has lock on passed in row.
* @param familyMap map of family to edits for the given family.
* @param writeToWAL if true, then we should write to the log
* @throws IOException
*/
private void put(final Map<byte [], List<KeyValue>> familyMap,
boolean writeToWAL) throws IOException {
long now = EnvironmentEdgeManager.currentTimeMillis();
byte[] byteNow = Bytes.toBytes(now);
boolean flush = false;
this.updatesLock.readLock().lock();
try {
checkFamilies(familyMap.keySet());
updateKVTimestamps(familyMap.values(), byteNow);
// write/sync to WAL should happen before we touch memstore.
//
// If order is reversed, i.e. we write to memstore first, and
// for some reason fail to write/sync to commit log, the memstore
// will contain uncommitted transactions.
if (writeToWAL) {
WALEdit walEdit = new WALEdit();
addFamilyMapToWALEdit(familyMap, walEdit);
this.log.append(regionInfo, regionInfo.getTableDesc().getName(),
walEdit, now);
}
long addedSize = applyFamilyMapToMemstore(familyMap);
flush = isFlushSize(memstoreSize.addAndGet(addedSize));
} finally {
this.updatesLock.readLock().unlock();
}
if (flush) {
// Request a cache flush. Do it outside update lock.
requestFlush();
}
}
/**
* Atomically apply the given map of family->edits to the memstore.
* This handles the consistency control on its own, but the caller
* should already have locked updatesLock.readLock(). This also does
* <b>not</b> check the families for validity.
*
* @return the additional memory usage of the memstore caused by the
* new entries.
*/
private long applyFamilyMapToMemstore(Map<byte[], List<KeyValue>> familyMap) {
ReadWriteConsistencyControl.WriteEntry w = null;
long size = 0;
try {
w = rwcc.beginMemstoreInsert();
for (Map.Entry<byte[], List<KeyValue>> e : familyMap.entrySet()) {
byte[] family = e.getKey();
List<KeyValue> edits = e.getValue();
Store store = getStore(family);
for (KeyValue kv: edits) {
kv.setMemstoreTS(w.getWriteNumber());
size += store.add(kv);
}
}
} finally {
rwcc.completeMemstoreInsert(w);
}
return size;
}
/**
* Check the collection of families for validity.
* @throws NoSuchColumnFamilyException if a family does not exist.
*/
private void checkFamilies(Collection<byte[]> families)
throws NoSuchColumnFamilyException {
for (byte[] family : families) {
checkFamily(family);
}
}
/**
* Append the given map of family->edits to a WALEdit data structure.
* This does not write to the HLog itself.
* @param familyMap map of family->edits
* @param walEdit the destination entry to append into
*/
private void addFamilyMapToWALEdit(Map<byte[], List<KeyValue>> familyMap,
WALEdit walEdit) {
for (List<KeyValue> edits : familyMap.values()) {
for (KeyValue kv : edits) {
walEdit.add(kv);
}
}
}
private void requestFlush() {
if (this.flushRequester == null) {
return;
}
synchronized (writestate) {
if (this.writestate.isFlushRequested()) {
return;
}
writestate.flushRequested = true;
}
// Make request outside of synchronize block; HBASE-818.
this.flushRequester.requestFlush(this);
if (LOG.isDebugEnabled()) {
LOG.debug("Flush requested on " + this);
}
}
/*
* @param size
* @return True if size is over the flush threshold
*/
private boolean isFlushSize(final long size) {
return size > this.memstoreFlushSize;
}
/**
 * Read the edit logs put under this region by the WAL log splitting
 * process, and replay the recovered edits into this region.
*
* <p>We can ignore any log message that has a sequence ID that's equal to or
* lower than minSeqId. (Because we know such log messages are already
* reflected in the HFiles.)
*
* <p>While this is running we are putting pressure on memory yet we are
* outside of our usual accounting because we are not yet an onlined region
* (this stuff is being run as part of Region initialization). This means
* that if we're up against global memory limits, we'll not be flagged to flush
* because we are not online. We can't be flushed by usual mechanisms anyways;
* we're not yet online so our relative sequenceids are not yet aligned with
* HLog sequenceids -- not till we come up online, post processing of split
* edits.
*
 * <p>But to help relieve memory pressure, at least manage our own heap size
 * by flushing if we are in excess of per-region limits. When flushing,
 * though, we have to be careful to avoid using the regionserver/hlog
 * sequenceid: it runs on a different track from what is going on here in
 * this region context. If we crashed while replaying these edits, but in
 * the midst had done a flush that used the regionserver log with a
 * sequenceid in excess of this region's, then on the next recovery we could
 * miss edits from the split editlogs. So we have to flush inline, using
 * seqids that make sense only in this single region's context -- until we
 * come online.
*
* @param regiondir
* @param minSeqId Any edit found in split editlogs needs to be in excess of
* this minSeqId to be applied, else its skipped.
* @param reporter
* @return the sequence id of the last edit added to this region out of the
* recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
* @throws UnsupportedEncodingException
* @throws IOException
*/
protected long replayRecoveredEditsIfAny(final Path regiondir,
final long minSeqId, final CancelableProgressable reporter)
throws UnsupportedEncodingException, IOException {
long seqid = minSeqId;
NavigableSet<Path> files = HLog.getSplitEditFilesSorted(this.fs, regiondir);
if (files == null || files.isEmpty()) return seqid;
for (Path edits: files) {
if (edits == null || !this.fs.exists(edits)) {
LOG.warn("Null or non-existent edits file: " + edits);
continue;
}
if (isZeroLengthThenDelete(this.fs, edits)) continue;
try {
seqid = replayRecoveredEdits(edits, seqid, reporter);
} catch (IOException e) {
boolean skipErrors = conf.getBoolean("hbase.skip.errors", false);
if (skipErrors) {
Path p = HLog.moveAsideBadEditsFile(fs, edits);
LOG.error("hbase.skip.errors=true so continuing. Renamed " + edits +
" as " + p, e);
} else {
throw e;
}
}
}
if (seqid > minSeqId) {
// Then we added some edits to memory. Flush and cleanup split edit files.
internalFlushcache(null, seqid);
}
// Now delete the content of recovered edits. We're done w/ them.
for (Path file: files) {
if (!this.fs.delete(file, false)) {
LOG.error("Failed delete of " + file);
} else {
LOG.debug("Deleted recovered.edits file=" + file);
}
}
return seqid;
}
/*
* @param edits File of recovered edits.
* @param minSeqId Minimum sequenceid found in a store file. Edits in log
* must be larger than this to be replayed.
* @param reporter
* @return the sequence id of the last edit added to this region out of the
* recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
* @throws IOException
*/
private long replayRecoveredEdits(final Path edits,
final long minSeqId, final CancelableProgressable reporter)
throws IOException {
LOG.info("Replaying edits from " + edits + "; minSequenceid=" + minSeqId);
HLog.Reader reader = HLog.getReader(this.fs, edits, conf);
try {
long currentEditSeqId = minSeqId;
long firstSeqIdInLog = -1;
long skippedEdits = 0;
long editsCount = 0;
long intervalEdits = 0;
HLog.Entry entry;
Store store = null;
try {
// How many edits seen before we check elapsed time
int interval = this.conf.getInt("hbase.hstore.report.interval.edits",
2000);
// How often to send a progress report (default 1/2 master timeout)
int period = this.conf.getInt("hbase.hstore.report.period",
this.conf.getInt("hbase.master.assignment.timeoutmonitor.timeout",
30000) / 2);
long lastReport = EnvironmentEdgeManager.currentTimeMillis();
while ((entry = reader.next()) != null) {
HLogKey key = entry.getKey();
WALEdit val = entry.getEdit();
if (reporter != null) {
intervalEdits += val.size();
if (intervalEdits >= interval) {
// Number of edits interval reached
intervalEdits = 0;
long cur = EnvironmentEdgeManager.currentTimeMillis();
if (lastReport + period <= cur) {
// Timeout reached
if(!reporter.progress()) {
String msg = "Progressable reporter failed, stopping replay";
LOG.warn(msg);
throw new IOException(msg);
}
lastReport = cur;
}
}
}
if (firstSeqIdInLog == -1) {
firstSeqIdInLog = key.getLogSeqNum();
}
// Now, figure if we should skip this edit.
if (key.getLogSeqNum() <= currentEditSeqId) {
skippedEdits++;
continue;
}
currentEditSeqId = key.getLogSeqNum();
boolean flush = false;
for (KeyValue kv: val.getKeyValues()) {
// Check that this edit is for me. Also, guard against writing the special
// METACOLUMN info such as HBASE::CACHEFLUSH entries.
if (kv.matchingFamily(HLog.METAFAMILY) ||
!Bytes.equals(key.getEncodedRegionName(), this.regionInfo.getEncodedNameAsBytes())) {
skippedEdits++;
continue;
}
// Figure which store the edit is meant for.
if (store == null || !kv.matchingFamily(store.getFamily().getName())) {
store = this.stores.get(kv.getFamily());
}
if (store == null) {
// This should never happen. Perhaps schema was changed between
// crash and redeploy?
LOG.warn("No family for " + kv);
skippedEdits++;
continue;
}
// Once we are over the limit, restoreEdit will keep returning true to
// flush -- but don't flush until we've played all the kvs that make up
// the WALEdit.
flush = restoreEdit(store, kv);
editsCount++;
}
if (flush) internalFlushcache(null, currentEditSeqId);
}
} catch (EOFException eof) {
Path p = HLog.moveAsideBadEditsFile(fs, edits);
LOG.warn("Encountered EOF. Most likely due to Master failure during " +
"log spliting, so we have this data in another edit. " +
"Continuing, but renaming " + edits + " as " + p, eof);
} catch (IOException ioe) {
// If the IOE resulted from bad file format,
// then this problem is idempotent and retrying won't help
if (ioe.getCause() instanceof ParseException) {
Path p = HLog.moveAsideBadEditsFile(fs, edits);
LOG.warn("File corruption encountered! " +
"Continuing, but renaming " + edits + " as " + p, ioe);
} else {
// other IO errors may be transient (bad network connection,
// checksum exception on one datanode, etc). throw & retry
throw ioe;
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("Applied " + editsCount + ", skipped " + skippedEdits +
", firstSequenceidInLog=" + firstSeqIdInLog +
", maxSequenceidInLog=" + currentEditSeqId);
}
return currentEditSeqId;
} finally {
reader.close();
}
}
/**
* Used by tests
* @param s Store to add edit to.
* @param kv KeyValue to add.
* @return True if we should flush.
*/
protected boolean restoreEdit(final Store s, final KeyValue kv) {
return isFlushSize(this.memstoreSize.addAndGet(s.add(kv)));
}
/*
* @param fs
* @param p File to check.
* @return True if file was zero-length (and if so, we'll delete it in here).
* @throws IOException
*/
private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
throws IOException {
FileStatus stat = fs.getFileStatus(p);
if (stat.getLen() > 0) return false;
LOG.warn("File " + p + " is zero-length, deleting.");
fs.delete(p, false);
return true;
}
protected Store instantiateHStore(Path tableDir, HColumnDescriptor c)
throws IOException {
return new Store(tableDir, this, c, this.fs, this.conf);
}
/**
* Return HStore instance.
* Use with caution. Exposed for use of fixup utilities.
* @param column Name of column family hosted by this region.
* @return Store that goes with the family on passed <code>column</code>.
* TODO: Make this lookup faster.
*/
public Store getStore(final byte [] column) {
return this.stores.get(column);
}
//////////////////////////////////////////////////////////////////////////////
// Support code
//////////////////////////////////////////////////////////////////////////////
/** Make sure this is a valid row for the HRegion */
private void checkRow(final byte [] row) throws IOException {
if(!rowIsInRange(regionInfo, row)) {
throw new WrongRegionException("Requested row out of range for " +
"HRegion " + this + ", startKey='" +
Bytes.toStringBinary(regionInfo.getStartKey()) + "', getEndKey()='" +
Bytes.toStringBinary(regionInfo.getEndKey()) + "', row='" +
Bytes.toStringBinary(row) + "'");
}
}
/**
* Obtain a lock on the given row. Blocks until success.
*
* I know it's strange to have two mappings:
* <pre>
* ROWS ==> LOCKS
* </pre>
* as well as
* <pre>
* LOCKS ==> ROWS
* </pre>
*
* But it acts as a guard on the client; a miswritten client just can't
* submit the name of a row and start writing to it; it must know the correct
* lockid, which matches the lock list in memory.
*
* <p>It would be more memory-efficient to assume a correctly-written client,
* which maybe we'll do in the future.
*
* @param row Name of row to lock.
* @throws IOException
* @return The id of the held lock.
*/
public Integer obtainRowLock(final byte [] row) throws IOException {
startRegionOperation();
try {
return internalObtainRowLock(row, true);
} finally {
closeRegionOperation();
}
}
/**
* Tries to obtain a lock on the given row, but does not block if the
* row lock is not available. If the lock is not available, returns null.
* Otherwise behaves the same as the above method.
* @see HRegion#obtainRowLock(byte[])
*/
public Integer tryObtainRowLock(final byte[] row) throws IOException {
startRegionOperation();
try {
return internalObtainRowLock(row, false);
} finally {
closeRegionOperation();
}
}
/**
* Obtains or tries to obtain the given row lock.
* @param waitForLock if true, will block until the lock is available.
* Otherwise, just tries to obtain the lock and returns
* null if unavailable.
*/
private Integer internalObtainRowLock(final byte[] row, boolean waitForLock)
throws IOException {
checkRow(row);
startRegionOperation();
try {
synchronized (lockedRows) {
while (lockedRows.contains(row)) {
if (!waitForLock) {
return null;
}
try {
lockedRows.wait();
} catch (InterruptedException ie) {
// Empty
}
}
// Generate a new lockid. Attempt to insert the new [lockid, row].
// If this lockid already exists in the map, revert and retry.
// We could first do a lockIds.get and only call lockIds.put if the id
// does not exist, but the hope is that the put will usually succeed on
// the first attempt because lockId collisions should be rare.
byte [] prev = null;
Integer lockId = null;
do {
lockId = Integer.valueOf(lockIdGenerator++);
prev = lockIds.put(lockId, row);
if (prev != null) {
lockIds.put(lockId, prev); // revert old value
lockIdGenerator = rand.nextInt(); // generate new start point
}
} while (prev != null);
lockedRows.add(row);
lockedRows.notifyAll();
return lockId;
}
} finally {
closeRegionOperation();
}
}
/**
* Used by unit tests.
* @param lockid
* @return Row that goes with <code>lockid</code>
*/
byte [] getRowFromLock(final Integer lockid) {
synchronized (lockedRows) {
return lockIds.get(lockid);
}
}
/**
* Release the row lock!
* @param lockid The lock ID to release.
*/
void releaseRowLock(final Integer lockid) {
synchronized (lockedRows) {
byte[] row = lockIds.remove(lockid);
lockedRows.remove(row);
lockedRows.notifyAll();
}
}
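/*
* A minimal sketch of the lock/unlock pattern callers are expected to follow,
* assuming an onlined HRegion "region" and a row key "row":
*
*   Integer lockid = region.obtainRowLock(row);
*   try {
*     // ... operate on the row while holding the lock ...
*   } finally {
*     region.releaseRowLock(lockid);
*   }
*/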
/**
* See if the given lock id currently corresponds to a locked row.
* @param lockid
* @return true if the lock id is currently held
*/
boolean isRowLocked(final Integer lockid) {
synchronized (lockedRows) {
return lockIds.get(lockid) != null;
}
}
/**
* Returns existing row lock if found, otherwise
* obtains a new row lock and returns it.
* @param lockid requested by the user, or null if the user didn't already hold lock
* @param row the row to lock
* @param waitForLock if true, will block until the lock is available, otherwise will
* simply return null if it could not acquire the lock.
* @return lockid or null if waitForLock is false and the lock was unavailable.
*/
private Integer getLock(Integer lockid, byte [] row, boolean waitForLock)
throws IOException {
Integer lid = null;
if (lockid == null) {
lid = internalObtainRowLock(row, waitForLock);
} else {
if (!isRowLocked(lockid)) {
throw new IOException("Invalid row lock");
}
lid = lockid;
}
return lid;
}
public void bulkLoadHFile(String hfilePath, byte[] familyName)
throws IOException {
startRegionOperation();
try {
Store store = getStore(familyName);
if (store == null) {
throw new DoNotRetryIOException(
"No such column family " + Bytes.toStringBinary(familyName));
}
store.bulkLoadHFile(hfilePath);
} finally {
closeRegionOperation();
}
}
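/*
* A minimal bulk-load sketch; the HFile path and family name below are
* hypothetical, and the HFile must already be sorted and belong to the
* target family's key range:
*
*   region.bulkLoadHFile("/tmp/f.hfile", Bytes.toBytes("f"));
*/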
@Override
public boolean equals(Object o) {
if (!(o instanceof HRegion)) {
return false;
}
return this.hashCode() == ((HRegion)o).hashCode();
}
@Override
public int hashCode() {
return Bytes.hashCode(this.regionInfo.getRegionName());
}
@Override
public String toString() {
return this.regionInfo.getRegionNameAsString();
}
/** @return Path of region base directory */
public Path getTableDir() {
return this.tableDir;
}
/**
* RegionScanner is an iterator through a bunch of rows in an HRegion.
* <p>
* It is used to combine scanners from multiple Stores (aka column families).
*/
class RegionScanner implements InternalScanner {
// Package local for testability
KeyValueHeap storeHeap = null;
private final byte [] stopRow;
private Filter filter;
private List<KeyValue> results = new ArrayList<KeyValue>();
private int batch;
private int isScan;
private boolean filterClosed = false;
private long readPt;
public HRegionInfo getRegionName() {
return regionInfo;
}
RegionScanner(Scan scan, List<KeyValueScanner> additionalScanners) throws IOException {
//DebugPrint.println("HRegionScanner.<init>");
this.filter = scan.getFilter();
this.batch = scan.getBatch();
if (Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW)) {
this.stopRow = null;
} else {
this.stopRow = scan.getStopRow();
}
// If we are doing a get, we want the scan range to be [startRow, endRow].
// Normally it is [startRow, endRow), so if startRow == endRow we would get
// nothing back.
this.isScan = scan.isGetScan() ? -1 : 0;
this.readPt = ReadWriteConsistencyControl.resetThreadReadPoint(rwcc);
List<KeyValueScanner> scanners = new ArrayList<KeyValueScanner>();
if (additionalScanners != null) {
scanners.addAll(additionalScanners);
}
for (Map.Entry<byte[], NavigableSet<byte[]>> entry :
scan.getFamilyMap().entrySet()) {
Store store = stores.get(entry.getKey());
scanners.add(store.getScanner(scan, entry.getValue()));
}
this.storeHeap = new KeyValueHeap(scanners, comparator);
}
RegionScanner(Scan scan) throws IOException {
this(scan, null);
}
/**
* Reset the filter so it is ready for the next row.
*/
protected void resetFilters() {
if (filter != null) {
filter.reset();
}
}
public synchronized boolean next(List<KeyValue> outResults, int limit)
throws IOException {
if (this.filterClosed) {
throw new UnknownScannerException("Scanner was closed (timed out?) " +
"after we renewed it. Could be caused by a very slow scanner " +
"or a lengthy garbage collection");
}
startRegionOperation();
try {
// This could be a new thread from the last time we called next().
ReadWriteConsistencyControl.setThreadReadPoint(this.readPt);
results.clear();
boolean returnResult = nextInternal(limit);
outResults.addAll(results);
resetFilters();
if (isFilterDone()) {
return false;
}
return returnResult;
} finally {
closeRegionOperation();
}
}
public synchronized boolean next(List<KeyValue> outResults)
throws IOException {
// apply the batching limit by default
return next(outResults, batch);
}
/*
* @return True if the filter rules that the scan is done.
*/
synchronized boolean isFilterDone() {
return this.filter != null && this.filter.filterAllRemaining();
}
private boolean nextInternal(int limit) throws IOException {
while (true) {
byte [] currentRow = peekRow();
if (isStopRow(currentRow)) {
if (filter != null && filter.hasFilterRow()) {
filter.filterRow(results);
}
if (filter != null && filter.filterRow()) {
results.clear();
}
return false;
} else if (filterRowKey(currentRow)) {
nextRow(currentRow);
} else {
byte [] nextRow;
do {
this.storeHeap.next(results, limit - results.size());
if (limit > 0 && results.size() == limit) {
if (this.filter != null && filter.hasFilterRow()) throw new IncompatibleFilterException(
"Filter with filterRow(List<KeyValue>) incompatible with scan with limit!");
return true; // we are expecting more yes, but also limited to how many we can return.
}
} while (Bytes.equals(currentRow, nextRow = peekRow()));
final boolean stopRow = isStopRow(nextRow);
// Now that we have an entire row, let's process it with the filters:
// first filter with filterRow(List).
if (filter != null && filter.hasFilterRow()) {
filter.filterRow(results);
}
if (results.isEmpty() || filterRow()) {
// This seems like a redundant step -- we already consumed the row,
// so there are no leftovers.
// the reasons for calling this method are:
// 1. reset the filters.
// 2. provide a hook to fast forward the row (used by subclasses)
nextRow(currentRow);
// This row was totally filtered out, if this is NOT the last row,
// we should continue on.
if (!stopRow) continue;
}
return !stopRow;
}
}
}
private boolean filterRow() {
return filter != null
&& filter.filterRow();
}
private boolean filterRowKey(byte[] row) {
return filter != null
&& filter.filterRowKey(row, 0, row.length);
}
protected void nextRow(byte [] currentRow) throws IOException {
while (Bytes.equals(currentRow, peekRow())) {
this.storeHeap.next(MOCKED_LIST);
}
results.clear();
resetFilters();
}
private byte[] peekRow() {
KeyValue kv = this.storeHeap.peek();
return kv == null ? null : kv.getRow();
}
private boolean isStopRow(byte [] currentRow) {
return currentRow == null ||
(stopRow != null &&
comparator.compareRows(stopRow, 0, stopRow.length,
currentRow, 0, currentRow.length) <= isScan);
}
public synchronized void close() {
if (storeHeap != null) {
storeHeap.close();
storeHeap = null;
}
this.filterClosed = true;
}
}
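/*
* A minimal scan sketch against a region, mirroring the loop used by
* processTable() further below ("region" is an onlined HRegion):
*
*   InternalScanner scanner = region.getScanner(new Scan());
*   try {
*     List<KeyValue> kvs = new ArrayList<KeyValue>();
*     boolean more;
*     do {
*       kvs.clear();
*       more = scanner.next(kvs);
*       // ... process kvs ...
*     } while (more);
*   } finally {
*     scanner.close();
*   }
*/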
// Utility methods
/**
* A utility method to create new instances of HRegion based on the
* {@link HConstants#REGION_IMPL} configuration property.
* @param tableDir qualified path of directory where region should be located,
* usually the table directory.
* @param log The HLog is the outbound log for any updates to the HRegion
* (There's a single HLog for all the HRegions on a single HRegionServer.)
* The log file is a logfile from the previous execution that's
* custom-computed for this HRegion. The HRegionServer computes and sorts the
* appropriate log info for this HRegion. If there is a previous log file
* (implying that the HRegion has been written-to before), then read it from
* the supplied path.
* @param fs is the filesystem.
* @param conf is global configuration settings.
* @param regionInfo HRegionInfo that describes the region.
* @param flushListener a FlushRequester against which flushes can be
* requested. Can be null.
* @return the new instance
*/
public static HRegion newHRegion(Path tableDir, HLog log, FileSystem fs, Configuration conf,
HRegionInfo regionInfo, FlushRequester flushListener) {
try {
@SuppressWarnings("unchecked")
Class<? extends HRegion> regionClass =
(Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
Constructor<? extends HRegion> c =
regionClass.getConstructor(Path.class, HLog.class, FileSystem.class,
Configuration.class, HRegionInfo.class, FlushRequester.class);
return c.newInstance(tableDir, log, fs, conf, regionInfo, flushListener);
} catch (Throwable e) {
// todo: what should I throw here?
throw new IllegalStateException("Could not instantiate a region instance.", e);
}
}
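/*
* A sketch of plugging in a custom region implementation via the
* HConstants.REGION_IMPL property; "MyRegion" is a hypothetical subclass
* exposing the same six-argument constructor:
*
*   Configuration conf = HBaseConfiguration.create();
*   conf.setClass(HConstants.REGION_IMPL, MyRegion.class, HRegion.class);
*   HRegion r = HRegion.newHRegion(tableDir, log, fs, conf, info, null);
*/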
/**
* Convenience method creating new HRegions. Used by createTable and by the
* bootstrap code in the HMaster constructor.
* Note, this method creates an {@link HLog} for the created region. It
* needs to be closed explicitly. Use {@link HRegion#getLog()} to get
* access.
* @param info Info for region to create.
* @param rootDir Root directory for HBase instance
* @param conf
* @return new HRegion
*
* @throws IOException
*/
public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
final Configuration conf)
throws IOException {
Path tableDir =
HTableDescriptor.getTableDir(rootDir, info.getTableDesc().getName());
Path regionDir = HRegion.getRegionDir(tableDir, info.getEncodedName());
FileSystem fs = FileSystem.get(conf);
fs.mkdirs(regionDir);
HRegion region = HRegion.newHRegion(tableDir,
new HLog(fs, new Path(regionDir, HConstants.HREGION_LOGDIR_NAME),
new Path(regionDir, HConstants.HREGION_OLDLOGDIR_NAME), conf),
fs, conf, info, null);
region.initialize();
return region;
}
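/*
* A minimal creation sketch. Per the javadoc above, the HLog created for the
* region must be closed explicitly; closing it via getLog() as below is an
* assumption about the caller's cleanup, not a requirement of this method:
*
*   HRegion region = HRegion.createHRegion(info, rootDir, conf);
*   try {
*     // ... use the region ...
*   } finally {
*     region.close();
*     region.getLog().close();
*   }
*/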
/**
* Open a Region.
* @param info Info for region to be opened.
* @param wal HLog for region to use. This method will call
* HLog#setSequenceNumber(long) passing the result of the call to
* HRegion#getMinSequenceId() to ensure the log id is properly kept
* up. The HRegionServer does this every time it opens a new region.
* @param conf
* @return new HRegion
*
* @throws IOException
*/
public static HRegion openHRegion(final HRegionInfo info, final HLog wal,
final Configuration conf)
throws IOException {
return openHRegion(info, wal, conf, null, null);
}
/**
* Open a Region.
* @param info Info for region to be opened.
* @param wal HLog for region to use. This method will call
* HLog#setSequenceNumber(long) passing the result of the call to
* HRegion#getMinSequenceId() to ensure the log id is properly kept
* up. The HRegionServer does this every time it opens a new region.
* @param conf
* @param flusher An interface we can request flushes against.
* @param reporter An interface we can report progress against.
* @return new HRegion
*
* @throws IOException
*/
public static HRegion openHRegion(final HRegionInfo info, final HLog wal,
final Configuration conf, final FlushRequester flusher,
final CancelableProgressable reporter)
throws IOException {
if (LOG.isDebugEnabled()) {
LOG.debug("Opening region: " + info);
}
if (info == null) {
throw new NullPointerException("Passed region info is null");
}
Path dir = HTableDescriptor.getTableDir(FSUtils.getRootDir(conf),
info.getTableDesc().getName());
HRegion r = HRegion.newHRegion(dir, wal, FileSystem.get(conf), conf, info,
flusher);
return r.openHRegion(reporter);
}
/**
* Open HRegion.
* Calls initialize and sets sequenceid.
* @param reporter
* @return Returns <code>this</code>
* @throws IOException
*/
protected HRegion openHRegion(final CancelableProgressable reporter)
throws IOException {
checkCompressionCodecs();
long seqid = initialize(reporter);
if (this.log != null) {
this.log.setSequenceNumber(seqid);
}
return this;
}
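/*
* A minimal open sketch, assuming an existing HLog "wal" shared by the
* hosting regionserver:
*
*   HRegion region = HRegion.openHRegion(info, wal, conf);
*   // openHRegion initializes the region and aligns the HLog's sequence
*   // number with the region's last sequence id.
*/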
private void checkCompressionCodecs() throws IOException {
for (HColumnDescriptor fam: regionInfo.getTableDesc().getColumnFamilies()) {
CompressionTest.testCompression(fam.getCompression());
CompressionTest.testCompression(fam.getCompactionCompression());
}
}
/**
* Inserts a new region's meta information into the passed
* <code>meta</code> region. Used by the HMaster bootstrap code adding
* new table to ROOT table.
*
* @param meta META HRegion to be updated
* @param r HRegion to add to <code>meta</code>
*
* @throws IOException
*/
public static void addRegionToMETA(HRegion meta, HRegion r)
throws IOException {
meta.checkResources();
// The row key is the region name
byte[] row = r.getRegionName();
Integer lid = meta.obtainRowLock(row);
try {
final List<KeyValue> edits = new ArrayList<KeyValue>(1);
edits.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
HConstants.REGIONINFO_QUALIFIER,
EnvironmentEdgeManager.currentTimeMillis(),
Writables.getBytes(r.getRegionInfo())));
meta.put(HConstants.CATALOG_FAMILY, edits);
} finally {
meta.releaseRowLock(lid);
}
}
/**
* Deletes all the files for a HRegion
*
* @param fs the file system object
* @param rootdir qualified path of HBase root directory
* @param info HRegionInfo for region to be deleted
* @throws IOException
*/
public static void deleteRegion(FileSystem fs, Path rootdir, HRegionInfo info)
throws IOException {
deleteRegion(fs, HRegion.getRegionDir(rootdir, info));
}
private static void deleteRegion(FileSystem fs, Path regiondir)
throws IOException {
if (LOG.isDebugEnabled()) {
LOG.debug("DELETING region " + regiondir.toString());
}
if (!fs.delete(regiondir, true)) {
LOG.warn("Failed delete of " + regiondir);
}
}
/**
* Computes the Path of the HRegion
*
* @param rootdir qualified path of HBase root directory
* @param info HRegionInfo for the region
* @return qualified path of region directory
*/
public static Path getRegionDir(final Path rootdir, final HRegionInfo info) {
return new Path(
HTableDescriptor.getTableDir(rootdir, info.getTableDesc().getName()),
info.getEncodedName());
}
/**
* Determines if the specified row is within the row range specified by the
* specified HRegionInfo
*
* @param info HRegionInfo that specifies the row range
* @param row row to be checked
* @return true if the row is within the range specified by the HRegionInfo
*/
public static boolean rowIsInRange(HRegionInfo info, final byte [] row) {
return ((info.getStartKey().length == 0) ||
(Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
((info.getEndKey().length == 0) ||
(Bytes.compareTo(info.getEndKey(), row) > 0));
}
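/*
* The range check is half-open: [startKey, endKey). For example, a region
* with startKey "b" and endKey "d" contains rows "b" and "c" but not "d":
*
*   rowIsInRange(info, Bytes.toBytes("b")); // true
*   rowIsInRange(info, Bytes.toBytes("d")); // false
*/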
/**
* Make the directories for a specific column family
*
* @param fs the file system
* @param tabledir base directory where region will live (usually the table dir)
* @param hri
* @param colFamily the column family
* @throws IOException
*/
public static void makeColumnFamilyDirs(FileSystem fs, Path tabledir,
final HRegionInfo hri, byte [] colFamily)
throws IOException {
Path dir = Store.getStoreHomedir(tabledir, hri.getEncodedName(), colFamily);
if (!fs.mkdirs(dir)) {
LOG.warn("Failed to create " + dir);
}
}
/**
* Merge two HRegions. The regions must be adjacent and must not overlap.
*
* @param srcA
* @param srcB
* @return new merged HRegion
* @throws IOException
*/
public static HRegion mergeAdjacent(final HRegion srcA, final HRegion srcB)
throws IOException {
HRegion a = srcA;
HRegion b = srcB;
// Make sure that srcA comes first; important for key-ordering during
// write of the merged file.
if (srcA.getStartKey() == null) {
if (srcB.getStartKey() == null) {
throw new IOException("Cannot merge two regions with null start key");
}
// A's start key is null but B's isn't. Assume A comes before B
} else if ((srcB.getStartKey() == null) ||
(Bytes.compareTo(srcA.getStartKey(), srcB.getStartKey()) > 0)) {
a = srcB;
b = srcA;
}
if (!(Bytes.compareTo(a.getEndKey(), b.getStartKey()) == 0)) {
throw new IOException("Cannot merge non-adjacent regions");
}
return merge(a, b);
}
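/*
* A minimal merge sketch; both regions are flushed, compacted, and closed by
* the merge itself, so callers should not use them afterwards:
*
*   HRegion merged = HRegion.mergeAdjacent(regionA, regionB);
*/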
/**
* Merge two regions whether they are adjacent or not.
*
* @param a region a
* @param b region b
* @return new merged region
* @throws IOException
*/
public static HRegion merge(HRegion a, HRegion b) throws IOException {
if (!a.getRegionInfo().getTableDesc().getNameAsString().equals(
b.getRegionInfo().getTableDesc().getNameAsString())) {
throw new IOException("Regions do not belong to the same table");
}
FileSystem fs = a.getFilesystem();
// Make sure each region's cache is empty
a.flushcache();
b.flushcache();
// Compact each region so we only have one store file per family
a.compactStores(true);
if (LOG.isDebugEnabled()) {
LOG.debug("Files for region: " + a);
listPaths(fs, a.getRegionDir());
}
b.compactStores(true);
if (LOG.isDebugEnabled()) {
LOG.debug("Files for region: " + b);
listPaths(fs, b.getRegionDir());
}
Configuration conf = a.getConf();
HTableDescriptor tabledesc = a.getTableDesc();
HLog log = a.getLog();
Path tableDir = a.getTableDir();
// Presume both are of the same region type -- i.e. both user or catalog
// table regions. This way we can use the comparator.
final byte[] startKey =
(a.comparator.matchingRows(a.getStartKey(), 0, a.getStartKey().length,
HConstants.EMPTY_BYTE_ARRAY, 0, HConstants.EMPTY_BYTE_ARRAY.length)
|| b.comparator.matchingRows(b.getStartKey(), 0,
b.getStartKey().length, HConstants.EMPTY_BYTE_ARRAY, 0,
HConstants.EMPTY_BYTE_ARRAY.length))
? HConstants.EMPTY_BYTE_ARRAY
: (a.comparator.compareRows(a.getStartKey(), 0, a.getStartKey().length,
b.getStartKey(), 0, b.getStartKey().length) <= 0
? a.getStartKey()
: b.getStartKey());
final byte[] endKey =
(a.comparator.matchingRows(a.getEndKey(), 0, a.getEndKey().length,
HConstants.EMPTY_BYTE_ARRAY, 0, HConstants.EMPTY_BYTE_ARRAY.length)
|| a.comparator.matchingRows(b.getEndKey(), 0, b.getEndKey().length,
HConstants.EMPTY_BYTE_ARRAY, 0,
HConstants.EMPTY_BYTE_ARRAY.length))
? HConstants.EMPTY_BYTE_ARRAY
: (a.comparator.compareRows(a.getEndKey(), 0, a.getEndKey().length,
b.getEndKey(), 0, b.getEndKey().length) <= 0
? b.getEndKey()
: a.getEndKey());
HRegionInfo newRegionInfo = new HRegionInfo(tabledesc, startKey, endKey);
LOG.info("Creating new region " + newRegionInfo.toString());
String encodedName = newRegionInfo.getEncodedName();
Path newRegionDir = HRegion.getRegionDir(a.getTableDir(), encodedName);
if(fs.exists(newRegionDir)) {
throw new IOException("Cannot merge; target file collision at " +
newRegionDir);
}
fs.mkdirs(newRegionDir);
LOG.info("starting merge of regions: " + a + " and " + b +
" into new region " + newRegionInfo.toString() +
" with start key <" + Bytes.toString(startKey) + "> and end key <" +
Bytes.toString(endKey) + ">");
// Move HStoreFiles under new region directory
Map<byte [], List<StoreFile>> byFamily =
new TreeMap<byte [], List<StoreFile>>(Bytes.BYTES_COMPARATOR);
byFamily = filesByFamily(byFamily, a.close());
byFamily = filesByFamily(byFamily, b.close());
for (Map.Entry<byte [], List<StoreFile>> es : byFamily.entrySet()) {
byte [] colFamily = es.getKey();
makeColumnFamilyDirs(fs, tableDir, newRegionInfo, colFamily);
// Because we compacted the source regions, we should have no more than
// two HStoreFiles per family, and there will be no reference store files.
List<StoreFile> srcFiles = es.getValue();
if (srcFiles.size() == 2) {
long seqA = srcFiles.get(0).getMaxSequenceId();
long seqB = srcFiles.get(1).getMaxSequenceId();
if (seqA == seqB) {
// Can't have the same sequence id, since on open of a store this is
// what distinguishes the files (see how the map of store files is
// keyed by sequence id).
throw new IOException("Files have same sequenceid: " + seqA);
}
}
for (StoreFile hsf: srcFiles) {
StoreFile.rename(fs, hsf.getPath(),
StoreFile.getUniqueFile(fs, Store.getStoreHomedir(tableDir,
newRegionInfo.getEncodedName(), colFamily)));
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("Files for new region");
listPaths(fs, newRegionDir);
}
HRegion dstRegion = HRegion.newHRegion(tableDir, log, fs, conf, newRegionInfo, null);
dstRegion.initialize();
dstRegion.compactStores();
if (LOG.isDebugEnabled()) {
LOG.debug("Files for new region");
listPaths(fs, dstRegion.getRegionDir());
}
deleteRegion(fs, a.getRegionDir());
deleteRegion(fs, b.getRegionDir());
LOG.info("merge completed. New region is " + dstRegion);
return dstRegion;
}
/*
* Fills a map with lists of store files keyed by column family.
* @param byFamily Map to fill.
* @param storeFiles Store files to process.
* @return Returns <code>byFamily</code>
*/
private static Map<byte [], List<StoreFile>> filesByFamily(
Map<byte [], List<StoreFile>> byFamily, List<StoreFile> storeFiles) {
for (StoreFile src: storeFiles) {
byte [] family = src.getFamily();
List<StoreFile> v = byFamily.get(family);
if (v == null) {
v = new ArrayList<StoreFile>();
byFamily.put(family, v);
}
v.add(src);
}
return byFamily;
}
/**
* @return True if the region needs a major compaction.
* @throws IOException
*/
boolean isMajorCompaction() throws IOException {
for (Store store: this.stores.values()) {
if (store.isMajorCompaction()) {
return true;
}
}
return false;
}
/*
* List the files under the specified directory
*
* @param fs
* @param dir
* @throws IOException
*/
private static void listPaths(FileSystem fs, Path dir) throws IOException {
if (LOG.isDebugEnabled()) {
FileStatus[] stats = fs.listStatus(dir);
if (stats == null || stats.length == 0) {
return;
}
for (int i = 0; i < stats.length; i++) {
String path = stats[i].getPath().toString();
if (stats[i].isDir()) {
LOG.debug("d " + path);
listPaths(fs, stats[i].getPath());
} else {
LOG.debug("f " + path + " size=" + stats[i].getLen());
}
}
}
}
//
// HBASE-880
//
/**
* @param get get object
* @param lockid existing lock id, or null for no previous lock
* @return result
* @throws IOException read exceptions
*/
public Result get(final Get get, final Integer lockid) throws IOException {
// Verify families are all valid
if (get.hasFamilies()) {
for (byte [] family: get.familySet()) {
checkFamily(family);
}
} else { // Adding all families to scanner
for (byte[] family: regionInfo.getTableDesc().getFamiliesKeys()) {
get.addFamily(family);
}
}
List<KeyValue> result = get(get);
return new Result(result);
}
/**
* An optimized version of {@link #get(Get)} that checks MemStore first for
* the specified query.
* <p>
* This is intended for use by increment operations where we have the
* guarantee that versions are never inserted out-of-order so if a value
* exists in MemStore it is the latest value.
* <p>
* It only makes sense to use this method with the default TimeRange and
* maxVersions equal to 1.
* @param get
* @return result
* @throws IOException
*/
private List<KeyValue> getLastIncrement(final Get get) throws IOException {
InternalScan iscan = new InternalScan(get);
List<KeyValue> results = new ArrayList<KeyValue>();
// memstore scan
iscan.checkOnlyMemStore();
InternalScanner scanner = null;
try {
scanner = getScanner(iscan);
scanner.next(results);
} finally {
if (scanner != null)
scanner.close();
}
// count how many columns we're looking for
int expected = 0;
Map<byte[], NavigableSet<byte[]>> familyMap = get.getFamilyMap();
for (NavigableSet<byte[]> qfs : familyMap.values()) {
expected += qfs.size();
}
// found everything we were looking for, done
if (results.size() == expected) {
return results;
}
// still have more columns to find
if (results != null && !results.isEmpty()) {
// subtract what was found in memstore
for (KeyValue kv : results) {
byte [] family = kv.getFamily();
NavigableSet<byte[]> qfs = familyMap.get(family);
qfs.remove(kv.getQualifier());
if (qfs.isEmpty()) familyMap.remove(family);
expected--;
}
// make a new get for just what is left
Get newGet = new Get(get.getRow());
for (Map.Entry<byte[], NavigableSet<byte[]>> f : familyMap.entrySet()) {
byte [] family = f.getKey();
for (byte [] qualifier : f.getValue()) {
newGet.addColumn(family, qualifier);
}
}
newGet.setTimeRange(get.getTimeRange().getMin(),
get.getTimeRange().getMax());
iscan = new InternalScan(newGet);
}
// check store files for what is left
List<KeyValue> fileResults = new ArrayList<KeyValue>();
iscan.checkOnlyStoreFiles();
scanner = null;
try {
scanner = getScanner(iscan);
scanner.next(fileResults);
} finally {
if (scanner != null)
scanner.close();
}
// combine and return
results.addAll(fileResults);
Collections.sort(results, KeyValue.COMPARATOR);
return results;
}
/*
* Do a get based on the get parameter.
*/
private List<KeyValue> get(final Get get) throws IOException {
Scan scan = new Scan(get);
List<KeyValue> results = new ArrayList<KeyValue>();
InternalScanner scanner = null;
try {
scanner = getScanner(scan);
scanner.next(results);
} finally {
if (scanner != null)
scanner.close();
}
return results;
}
/**
* Perform one or more increment operations on a row.
* <p>
* Increments are done under a row lock, but reads do not take locks, so
* a get or scan can observe the operation partially complete.
* @param increment
* @param lockid
* @param writeToWAL
* @return new keyvalues after increment
* @throws IOException
*/
public Result increment(Increment increment, Integer lockid,
boolean writeToWAL)
throws IOException {
// TODO: Use RWCC to make this set of increments atomic to reads
byte [] row = increment.getRow();
checkRow(row);
TimeRange tr = increment.getTimeRange();
boolean flush = false;
WALEdit walEdits = null;
List<KeyValue> allKVs = new ArrayList<KeyValue>(increment.numColumns());
List<KeyValue> kvs = new ArrayList<KeyValue>(increment.numColumns());
long now = EnvironmentEdgeManager.currentTimeMillis();
long size = 0;
// Lock row
startRegionOperation();
try {
Integer lid = getLock(lockid, row, true);
this.updatesLock.readLock().lock();
try {
// Process each family
for (Map.Entry<byte [], NavigableMap<byte [], Long>> family :
increment.getFamilyMap().entrySet()) {
Store store = stores.get(family.getKey());
// Get previous values for all columns in this family
Get get = new Get(row);
for (Map.Entry<byte [], Long> column : family.getValue().entrySet()) {
get.addColumn(family.getKey(), column.getKey());
}
get.setTimeRange(tr.getMin(), tr.getMax());
List<KeyValue> results = getLastIncrement(get);
// Iterate the input columns and update existing values if they were
// found, otherwise add new column initialized to the increment amount
int idx = 0;
for (Map.Entry<byte [], Long> column : family.getValue().entrySet()) {
long amount = column.getValue();
if (idx < results.size() &&
results.get(idx).matchingQualifier(column.getKey())) {
amount += Bytes.toLong(results.get(idx).getValue());
idx++;
}
// Append new incremented KeyValue to list
KeyValue newKV = new KeyValue(row, family.getKey(), column.getKey(),
now, Bytes.toBytes(amount));
kvs.add(newKV);
// Append update to WAL
if (writeToWAL) {
if (walEdits == null) {
walEdits = new WALEdit();
}
walEdits.add(newKV);
}
}
// Write the KVs for this family into the store
size += store.upsert(kvs);
allKVs.addAll(kvs);
kvs.clear();
}
// Actually write to WAL now
if (writeToWAL) {
this.log.append(regionInfo, regionInfo.getTableDesc().getName(),
walEdits, now);
}
size = this.memstoreSize.addAndGet(size);
flush = isFlushSize(size);
} finally {
this.updatesLock.readLock().unlock();
releaseRowLock(lid);
}
} finally {
closeRegionOperation();
}
if (flush) {
// Request a cache flush. Do it outside update lock.
requestFlush();
}
return new Result(allKVs);
}
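/*
* A minimal increment sketch; the family and qualifier names below are
* hypothetical:
*
*   Increment inc = new Increment(row);
*   inc.addColumn(Bytes.toBytes("f"), Bytes.toBytes("counter"), 1L);
*   Result r = region.increment(inc, null, true); // no prior lock, use WAL
*/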
/**
*
* @param row
* @param family
* @param qualifier
* @param amount
* @param writeToWAL
* @return The new value.
* @throws IOException
*/
public long incrementColumnValue(byte [] row, byte [] family,
byte [] qualifier, long amount, boolean writeToWAL)
throws IOException {
checkRow(row);
boolean flush = false;
// Lock row
long result = amount;
startRegionOperation();
try {
Integer lid = obtainRowLock(row);
this.updatesLock.readLock().lock();
try {
Store store = stores.get(family);
// Get the old value:
Get get = new Get(row);
get.addColumn(family, qualifier);
List<KeyValue> results = getLastIncrement(get);
if (!results.isEmpty()) {
KeyValue kv = results.get(0);
byte [] buffer = kv.getBuffer();
int valueOffset = kv.getValueOffset();
result += Bytes.toLong(buffer, valueOffset, Bytes.SIZEOF_LONG);
}
// build the KeyValue now:
KeyValue newKv = new KeyValue(row, family,
qualifier, EnvironmentEdgeManager.currentTimeMillis(),
Bytes.toBytes(result));
// now log it:
if (writeToWAL) {
long now = EnvironmentEdgeManager.currentTimeMillis();
WALEdit walEdit = new WALEdit();
walEdit.add(newKv);
this.log.append(regionInfo, regionInfo.getTableDesc().getName(),
walEdit, now);
}
// Now apply the ICV to the store; this will set the timestamp
// appropriately depending on whether there is a value in the memstore
// or not. Returns the change in the memstore's size from the operation.
long size = store.updateColumnValue(row, family, qualifier, result);
size = this.memstoreSize.addAndGet(size);
flush = isFlushSize(size);
} finally {
this.updatesLock.readLock().unlock();
releaseRowLock(lid);
}
} finally {
closeRegionOperation();
}
if (flush) {
// Request a cache flush. Do it outside update lock.
requestFlush();
}
return result;
}
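/*
* The single-column equivalent of the sketch above, again with hypothetical
* family and qualifier names:
*
*   long newValue = region.incrementColumnValue(row, Bytes.toBytes("f"),
*     Bytes.toBytes("counter"), 1L, true);
*/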
//
// New HBASE-880 Helpers
//
private void checkFamily(final byte [] family)
throws NoSuchColumnFamilyException {
if(!regionInfo.getTableDesc().hasFamily(family)) {
throw new NoSuchColumnFamilyException("Column family " +
Bytes.toString(family) + " does not exist in region " + this
+ " in table " + regionInfo.getTableDesc());
}
}
public static final long FIXED_OVERHEAD = ClassSize.align(
(4 * Bytes.SIZEOF_LONG) + Bytes.SIZEOF_BOOLEAN +
(21 * ClassSize.REFERENCE) + ClassSize.OBJECT + Bytes.SIZEOF_INT);
public static final long DEEP_OVERHEAD = ClassSize.align(FIXED_OVERHEAD +
(ClassSize.OBJECT * 2) + (2 * ClassSize.ATOMIC_BOOLEAN) +
ClassSize.ATOMIC_LONG + ClassSize.ATOMIC_INTEGER +
// Using TreeMap for TreeSet
ClassSize.TREEMAP +
// Using TreeMap for HashMap
ClassSize.TREEMAP +
ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY +
ClassSize.align(ClassSize.OBJECT +
(5 * Bytes.SIZEOF_BOOLEAN)) +
(3 * ClassSize.REENTRANT_LOCK));
public long heapSize() {
long heapSize = DEEP_OVERHEAD;
for(Store store : this.stores.values()) {
heapSize += store.heapSize();
}
return heapSize;
}
/*
* This method calls System.exit.
* @param message Message to print out. May be null.
*/
private static void printUsageAndExit(final String message) {
if (message != null && message.length() > 0) System.out.println(message);
System.out.println("Usage: HRegion CATLALOG_TABLE_DIR [major_compact]");
System.out.println("Options:");
System.out.println(" major_compact Pass this option to major compact " +
"passed region.");
System.out.println("Default outputs scan of passed region.");
System.exit(1);
}
/*
* Process table.
* Do major compaction or list content.
* @param fs
* @param p
* @param log
* @param c
* @param majorCompact
* @throws IOException
*/
private static void processTable(final FileSystem fs, final Path p,
final HLog log, final Configuration c,
final boolean majorCompact)
throws IOException {
HRegion region = null;
String rootStr = Bytes.toString(HConstants.ROOT_TABLE_NAME);
String metaStr = Bytes.toString(HConstants.META_TABLE_NAME);
// Currently expects that tables have one region only.
if (p.getName().startsWith(rootStr)) {
region = HRegion.newHRegion(p, log, fs, c, HRegionInfo.ROOT_REGIONINFO, null);
} else if (p.getName().startsWith(metaStr)) {
region = HRegion.newHRegion(p, log, fs, c, HRegionInfo.FIRST_META_REGIONINFO,
null);
} else {
throw new IOException("Not a known catalog table: " + p.toString());
}
try {
region.initialize();
if (majorCompact) {
region.compactStores(true);
} else {
// Default behavior
Scan scan = new Scan();
// scan.addFamily(HConstants.CATALOG_FAMILY);
InternalScanner scanner = region.getScanner(scan);
try {
List<KeyValue> kvs = new ArrayList<KeyValue>();
boolean done = false;
do {
kvs.clear();
done = scanner.next(kvs);
if (kvs.size() > 0) LOG.info(kvs);
} while (done);
} finally {
scanner.close();
}
// System.out.println(region.getClosestRowBefore(Bytes.toBytes("GeneratedCSVContent2,E3652782193BC8D66A0BA1629D0FAAAB,9993372036854775807")));
}
} finally {
region.close();
}
}
/**
* For internal use in forcing splits ahead of file size limit.
* @param b
* @return previous value
*/
public boolean shouldSplit(boolean b) {
boolean old = this.splitRequest;
this.splitRequest = b;
return old;
}
/**
* Give the region a chance to prepare before it is split.
*/
protected void prepareToSplit() {
// nothing
}
/**
* @return The priority that this region should have in the compaction queue
*/
public int getCompactPriority() {
int count = Integer.MAX_VALUE;
for(Store store : stores.values()) {
count = Math.min(count, store.getCompactPriority());
}
return count;
}
/**
* Checks every store to see if one has too many
* store files
* @return true if any store has too many store files
*/
public boolean hasTooManyStoreFiles() {
for(Store store : stores.values()) {
if(store.hasTooManyStoreFiles()) {
return true;
}
}
return false;
}
/**
* This method needs to be called before any public call that reads or
* modifies data. It has to be called just before a try;
* {@link #closeRegionOperation()} must then be called in the try's finally
* block. Acquires a read lock and checks if the region is closing or closed.
* @throws NotServingRegionException when the region is closing or closed
*/
private void startRegionOperation() throws NotServingRegionException {
if (this.closing.get()) {
throw new NotServingRegionException(regionInfo.getRegionNameAsString() +
" is closing");
}
lock.readLock().lock();
if (this.closed.get()) {
lock.readLock().unlock();
throw new NotServingRegionException(regionInfo.getRegionNameAsString() +
" is closed");
}
}
/**
* Releases the read lock. This needs to be called in the finally block
* corresponding to the try block of {@link #startRegionOperation()}.
*/
private void closeRegionOperation(){
lock.readLock().unlock();
}
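/*
* The pattern every public read/write entry point in this class follows:
*
*   startRegionOperation();
*   try {
*     // ... read or mutate region state under the read lock ...
*   } finally {
*     closeRegionOperation();
*   }
*/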
/**
* A mocked list implementation -- discards all updates.
*/
private static final List<KeyValue> MOCKED_LIST = new AbstractList<KeyValue>() {
@Override
public void add(int index, KeyValue element) {
// do nothing
}
@Override
public boolean addAll(int index, Collection<? extends KeyValue> c) {
return false; // this list is never changed as a result of an update
}
@Override
public KeyValue get(int index) {
throw new UnsupportedOperationException();
}
@Override
public int size() {
return 0;
}
};
/**
* Facility for dumping and compacting catalog tables.
* Only handles catalog tables, since these are the only tables whose
* schema we know for sure. For usage run:
* <pre>
* ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion
* </pre>
* @param args
* @throws IOException
*/
public static void main(String[] args) throws IOException {
if (args.length < 1) {
printUsageAndExit(null);
}
boolean majorCompact = false;
if (args.length > 1) {
if (!args[1].toLowerCase().startsWith("major")) {
printUsageAndExit("ERROR: Unrecognized option <" + args[1] + ">");
}
majorCompact = true;
}
final Path tableDir = new Path(args[0]);
final Configuration c = HBaseConfiguration.create();
final FileSystem fs = FileSystem.get(c);
final Path logdir = new Path(c.get("hbase.tmp.dir"),
"hlog" + tableDir.getName()
+ EnvironmentEdgeManager.currentTimeMillis());
final Path oldLogDir = new Path(c.get("hbase.tmp.dir"),
HConstants.HREGION_OLDLOGDIR_NAME);
final HLog log = new HLog(fs, logdir, oldLogDir, c);
try {
processTable(fs, tableDir, log, c, majorCompact);
} finally {
log.close();
BlockCache bc = StoreFile.getBlockCache(c);
if (bc != null) bc.shutdown();
}
}
}