/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.sun.jini.outrigger;

import com.sun.jini.constants.TxnConstants;
import com.sun.jini.constants.ThrowableConstants;
import com.sun.jini.logging.Levels;
import com.sun.jini.thread.RetryTask;
import com.sun.jini.thread.TaskManager;
import com.sun.jini.thread.WakeupManager;

import java.io.IOException;
import java.rmi.RemoteException;
import java.rmi.UnmarshalException;
import java.rmi.NoSuchObjectException;
import java.util.Map;
import java.util.WeakHashMap;
import java.util.Iterator;
import java.util.Collection;
import java.util.Collections;
import java.util.logging.Level;
import java.util.logging.Logger;

import net.jini.core.transaction.TransactionException;
import net.jini.core.transaction.UnknownTransactionException;
import net.jini.core.transaction.server.ServerTransaction;
import net.jini.core.transaction.server.TransactionConstants;

/**
 * A task that will try to validate the state of a transaction.  This
 * uses weak references a good deal to let the other parts of the system
 * be GC'ed as necessary.
 * <p>
 * The retry mechanism is subtle, so bear with me.  The purpose is
 * to ensure that if any activity is being blocked by a given
 * transaction, that transaction will be tested at some point in
 * the future (if necessary, i.e., if it is still thought to be
 * active).  We assume it to be rare that a transaction that the
 * space thinks is active is, in fact, aborted, so the algorithm is
 * designed to guarantee the detection without a lot of overhead,
 * specifically without a lot of RMI calls.
 * <p>
 * Each task has three values: a <code>nextQuery</code> time, a
 * <code>mustQuery</code> boolean that forces the next query to be
 * made, and <code>deltaT</code>, the interval used to schedule the
 * following query.  When the task is awakened at its
 * <code>nextQuery</code> time, it checks to see if it must make an
 * actual query to the transaction manager, which it will do if either
 * <code>mustQuery</code> is <code>true</code>, or if we know about
 * any in-progress queries on the space that are blocked on the
 * transaction.  Whether or not an actual query is made,
 * <code>deltaT</code> is added to <code>nextQuery</code> to get the
 * new <code>nextQuery</code> time, <code>deltaT</code> is doubled, and
 * the <code>mustQuery</code> boolean is set to <code>false</code>.
 * <p>
 * There are two kinds of requests with which a transaction
 * can cause a conflict -- those with long timeouts (such as
 * blocking reads and takes) and those that are under short timeouts
 * (such as reads and takes with zero-length timeouts).  We will
 * treat them separately at several points of the algorithm.  A
 * short timeout is any query whose expiration time is sooner than
 * the <code>nextQuery</code> time.
 * Any other timeout is long.  If a short query arrives,
 * <code>mustQuery</code> is set to <code>true</code>.
 * <p>
 * The result is that any time a transaction causes a conflict, if
 * the query on the space has not ended by the time of the
 * <code>nextQuery</code> we will attempt to poll the transaction manager.
 * We will also poll the transaction manager if any conflict occurred
 * on a query on the space with a short timeout.
 * <p>
 * The first time a transaction causes a conflict, we schedule a
 * time in the future at which we will poll its status.  We do not
 * poll right away because often a transaction will complete on
 * its own before we get to that time, making the check
 * unnecessary.  An instant poll is, therefore, unnecessarily
 * aggressive, since giving an initial grace time will usually mean
 * no poll is made at all.  So if the first conflict occurs at
 * <i>T</i><sub>0</sub>, the <code>nextQuery</code> value will be
 * <i>T</i><sub>0</sub><code>+INITIAL_GRACE</code>, the boolean
 * will be <code>true</code> to force that poll to happen, and
 * <code>deltaT</code> will be set to <code>INITIAL_GRACE</code>.
 *
 * @author Sun Microsystems, Inc.
 *
 * @see TxnMonitor
 */
class TxnMonitorTask extends RetryTask
    implements TransactionConstants, com.sun.jini.constants.TimeConstants
{
    /** transaction being monitored */
    private final Txn txn;

    /** the monitor we were made by */
    private final TxnMonitor monitor;

    /**
     * All the queries on the space (not queries to the transaction
     * manager) waiting for <code>txn</code> to be resolved.
     * <code>null</code> until we have at least one.  Represented by
     * <code>QueryWatcher</code> objects.
     */
    private Map queries;

    /** count of RemoteExceptions */
    private int failCnt;

    /**
     * The next time we need to poll the transaction manager
     * to get <code>txn</code>'s actual state.
     */
    private long nextQuery;

    /**
     * When we're given an opportunity to poll the transaction manager
     * for the <code>txn</code>'s state, do so.
     */
    private boolean mustQuery;

    /** next value added to <code>nextQuery</code> */
    private long deltaT;

    /**
     * The initial grace period before the first query.
     */
    private static final long INITIAL_GRACE = 15 * SECONDS;

    /**
     * The retry time when we have encountered an exception.
     */
    private static final long BETWEEN_EXCEPTIONS = 15 * SECONDS;

    /**
     * The largest value that <code>deltaT</code> will reach.
     */
    private static final long MAX_DELTA_T = 1 * HOURS;

    /**
     * The maximum number of failures allowed in a row before we simply
     * give up on the transaction and consider it aborted.
     */
    private static final int MAX_FAILURES = 3;

    /** Logger for logging transaction related information */
    private static final Logger logger =
        Logger.getLogger(OutriggerServerImpl.txnLoggerName);

    /**
     * Create a new TxnMonitorTask.
     */
    TxnMonitorTask(Txn txn, TxnMonitor monitor,
                   TaskManager manager, WakeupManager wakeupMgr)
    {
        super(manager, wakeupMgr);
        this.txn = txn;
        this.monitor = monitor;
        nextQuery = startTime();        // retryTime will add INITIAL_GRACE
        deltaT = INITIAL_GRACE;
        mustQuery = true;
    }

    /**
     * Return the time of the next query, bumping <code>deltaT</code> as
     * necessary for the next iteration.  If the transaction has voted
     * <code>PREPARED</code> or the manager has been giving us a
     * <code>RemoteException</code>, we should retry on short times;
     * otherwise we back off quickly.
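     * <p>
     * For example, with the current constants the polls for a healthy,
     * still-active transaction land roughly 15 seconds, 45 seconds,
     * 1:45, 3:45, ... after the first conflict, with the interval
     * between polls doubling until it is capped at
     * <code>MAX_DELTA_T</code> (one hour).  While the transaction is
     * <code>PREPARED</code>, or while the manager is failing with
     * exceptions, we instead retry every <code>BETWEEN_EXCEPTIONS</code>
     * (15 seconds).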
     */
    public long retryTime() {
        if (failCnt == 0 && txn.getState() != PREPARED) {
            // no failures
            if (logger.isLoggable(Level.FINEST)) {
                logger.log(Level.FINEST, "{0} retryTime adds {1}",
                           new Object[]{this, new Long(deltaT)});
            }

            nextQuery += deltaT;
            if (deltaT < MAX_DELTA_T)
                deltaT = Math.min(deltaT * 2, MAX_DELTA_T);
        } else {
            if (logger.isLoggable(Level.FINEST)) {
                logger.log(Level.FINEST, "{0} retryTime adds {1} (for {2})",
                    new Object[]{this, new Long(BETWEEN_EXCEPTIONS),
                                 (failCnt != 0 ? "failure" : "PREPARED")});
            }

            nextQuery += BETWEEN_EXCEPTIONS;
        }
        return nextQuery;
    }

    /**
     * We can run in parallel with any task, so just return
     * <CODE>false</CODE>.
     */
    public boolean runAfter(java.util.List tasks, int size) {
        return false;
    }

    /**
     * Add a ``sibling'' transaction, one that is now blocking progress
     * on one of the same entries.  For example, if a client is blocked
     * on a <code>read</code>, another transaction can read the same
     * entry, thereby also blocking that same client.  This means that
     * the transaction for the second <code>read</code> must be
     * watched, too.  The list of queries for the second transaction
     * might be smaller than the list of those in this transaction, but
     * the process of figuring out the subset is too expensive, since
     * we have tried to make the checking process itself cheap anyway.
     * So we add all queries this task is currently monitoring
     * to the task monitoring the second transaction.  If there are
     * no queries, then the blocking occurred because of a short query
     * or all the queries have expired, in which case the second transaction
     * isn't currently in the way of anything, so this method does
     * nothing.
     * <p>
     * Of course, in order to avoid blocking the thread that is calling
     * this (which is trying to perform a <code>read</code>, after
     * all), we simply add each lease in this task to the monitor's
     * queue.
     *
     * @see TxnEntryHandle#monitor
     */
    //!! Would it be worth the overhead to make TxnEntryHandle.monitor
    //!! search for the transaction with the smallest set of leases? -arnold
    synchronized void addSibling(Txn txn) {
        if (queries == null || queries.size() == 0)
            return;
        Collection sibling = Collections.nCopies(1, txn);
        Iterator it = queries.keySet().iterator();
        while (it.hasNext()) {
            QueryWatcher query = (QueryWatcher) it.next();
            if (query != null)  // from a weak map, so might be null
                monitor.add(query, sibling);
        }
    }

    /**
     * Try to see if this transaction should be aborted.  This returns
     * <code>true</code> (don't repeat the task) if it knows that the
     * transaction is no longer interesting to anyone.
     */
    public boolean tryOnce() {
        if (logger.isLoggable(Level.FINEST)) {
            logger.log(Level.FINEST, "{0} attempt {1} mustQuery:{2}",
                       new Object[]{this, new Integer(attempt()),
                                    new Boolean(mustQuery)});
        }

        /*
         * The first time we do nothing, since RetryTask invokes run first,
         * but we want to wait a bit before testing the transaction.
         */
        if (attempt() == 0)
            return false;

        if (logger.isLoggable(Level.FINEST)) {
            logger.log(Level.FINEST, "{0} txn.getState() = {1}",
                       new Object[]{this, new Integer(txn.getState())});
        }

        // not active or prepared == no longer blocking
        int txnState = txn.getState();
        if (txnState != ACTIVE && txnState != PREPARED)
            return true;

        // if we're prepared, test every time -- this shouldn't take long
        mustQuery |= (txnState == PREPARED);

        /*
         * Go through the resources to see if we can find one still active
         * that cares.  Must be synchronized since we test, then clear --
         * another thread that set the flag between the test and clear
         * would have its requirements lost.
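         * (Note that add(), which sets mustQuery for short queries, is
         * synchronized on this same object, so a request it records
         * cannot be lost between the test and the clear below.)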
         */
        synchronized (this) {
            if (!mustQuery) {
                // then try resources
                if (queries == null)    // no resources, so nobody wants it
                    return false;       // try again next time

                Iterator it = queries.keySet().iterator();
                boolean foundNeed = false;

                if (logger.isLoggable(Level.FINEST)) {
                    logger.log(Level.FINEST, "{0} nextQuery {1}",
                               new Object[]{this, new Long(nextQuery)});
                }

                while (it.hasNext()) {
                    QueryWatcher query = (QueryWatcher) it.next();
                    if (query == null)  // gone -- the map will reap it
                        continue;

                    if (logger.isLoggable(Level.FINEST)) {
                        logger.log(Level.FINEST,
                                   "{0} query.getExpiration() {1}",
                                   new Object[]{this,
                                       new Long(query.getExpiration())});
                    }

                    if (query.getExpiration() < nextQuery ||
                        query.isResolved())
                    {
                        it.remove();    // expired, so we don't care about it
                    } else {
                        foundNeed = true;
                        break;
                    }
                }

                if (logger.isLoggable(Level.FINEST)) {
                    logger.log(Level.FINEST, "{0} foundNeed = {1}",
                               new Object[]{this, new Boolean(foundNeed)});
                }

                if (!foundNeed)         // nobody wants it
                    return false;       // try again next time
            }
            mustQuery = false;          // clear it for next time
        }

        /*
         * Now we know (a) the transaction itself is alive, and (b) some
         * lease still cares.  Make sure it's still active as far as the
         * space knows, and if it is, then ask the manager about it.
         */
        ServerTransaction tr;
        try {
            /* This may fix a broken Txn; if it does, it won't get moved
             * from the broken to the unbroken list.  It will get
             * moved eventually, but it does seem unfortunate it does
             * not happen immediately.
             */
            tr = txn.getTransaction(
                monitor.space().getRecoveredTransactionManagerPreparer());
        } catch (RemoteException e) {
            final int cat = ThrowableConstants.retryable(e);

            if (cat == ThrowableConstants.BAD_INVOCATION ||
                cat == ThrowableConstants.BAD_OBJECT)
            {
                // Not likely to get better, give up
                logUnpackingFailure("definite exception", Level.INFO,
                                    true, e);
                return true;
            } else if (cat == ThrowableConstants.INDEFINITE) {
                // try, try, again
                logUnpackingFailure("indefinite exception", Levels.FAILED,
                                    false, e);
                mustQuery = true;
                return false;
            } else if (cat == ThrowableConstants.UNCATEGORIZED) {
                // Same as above but log differently.
                mustQuery = true;
                logUnpackingFailure("uncategorized exception", Level.INFO,
                                    false, e);
                return false;
            } else {
                logger.log(Level.WARNING,
                    "ThrowableConstants.retryable returned out of range " +
                    "value, " + cat, new AssertionError(e));
                return false;
            }
        } catch (IOException e) {
            // Not likely to get better
            logUnpackingFailure("IOException", Level.INFO, true, e);
            return true;
        } catch (RuntimeException e) {
            // Not likely to get better
            logUnpackingFailure("RuntimeException", Level.INFO, true, e);
            return true;
        } catch (ClassNotFoundException e) {
            // codebase probably down, keep trying
            logUnpackingFailure("ClassNotFoundException", Levels.FAILED,
                                false, e);
            mustQuery = true;
            return false;
        }

        if (logger.isLoggable(Level.FINEST))
            logger.log(Level.FINEST, "{0} tr = {1}", new Object[]{this, tr});

        int trState;
        try {
            trState = tr.getState();
        } catch (TransactionException e) {
            if (logger.isLoggable(Level.INFO))
                logger.log(Level.INFO, "Got TransactionException when " +
                    "calling getState on " + tr + ", dropping transaction " +
                    tr.id, e);
            trState = ABORTED;
        } catch (NoSuchObjectException e) {
            /* It would be epsilon better to give up immediately
             * if we get a NoSuchObjectException and we are in the
             * active state; however, the code to do this would
             * be very complicated, since we need to hold a lock
             * while reading and acting on the state.
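             * Instead we just count the failure like any other
             * RemoteException: the transaction is only treated as
             * aborted once MAX_FAILURES such exceptions have been
             * seen in a row.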
             */
            if (++failCnt >= MAX_FAILURES) {
                if (logger.isLoggable(Level.INFO)) {
                    logger.log(Level.INFO, "Got NoSuchObjectException when " +
                        "calling getState on " + tr + ", this was the " +
                        failCnt + " RemoteException, dropping transaction " +
                        tr.id, e);
                }
                trState = ABORTED;
            } else {
                if (logger.isLoggable(Levels.FAILED)) {
                    logger.log(Levels.FAILED, "Got NoSuchObjectException " +
                        "when calling getState on " + tr + ", failCount = " +
                        failCnt + ", will retry", e);
                }
                mustQuery = true;       // keep on trying
                return false;           // try again next time
            }
        } catch (RemoteException e) {
            if (++failCnt >= MAX_FAILURES) {
                /* Abort if we are not prepared and not already
                 * aborted.  If prepared, retry; otherwise
                 * we're done.  Check state and make any abort() call
                 * atomically so we can't accidentally abort a prepared
                 * transaction.
                 */
                synchronized (txn) {
                    switch (txn.getState()) {
                    case ACTIVE:
                        // Safe to abort, give up
                        if (logger.isLoggable(Level.INFO)) {
                            logger.log(Level.INFO, "Got RemoteException " +
                                "when calling getState on " + tr + ", this " +
                                "was " + failCnt + " RemoteException, " +
                                "dropping active transaction " + tr.id, e);
                        }

                        try {
                            monitor.space().abort(tr.mgr, tr.id);
                            return true;
                        } catch (UnknownTransactionException ute) {
                            throw new AssertionError(ute);
                        } catch (UnmarshalException ume) {
                            throw new AssertionError(ume);
                        }

                    case PREPARED:
                        final Level l = (failCnt % MAX_FAILURES == 0) ?
                            Level.INFO : Levels.FAILED;
                        if (logger.isLoggable(l)) {
                            logger.log(l, "Got RemoteException when calling " +
                                "getState on " + tr + ", this was " +
                                failCnt + " RemoteException, will keep " +
                                "prepared transaction " + tr.id, e);
                        }

                        // Can't give up, keep on trying to find real state
                        mustQuery = true;
                        return false;

                    case ABORTED:
                    case COMMITTED:
                        // done
                        return true;

                    default:
                        throw new AssertionError("Txn in unreachable state");
                    }
                }
            } else {
                // Don't know, but not ready to give up
                if (logger.isLoggable(Levels.FAILED)) {
                    logger.log(Levels.FAILED, "Got RemoteException when " +
                        "calling getState on " + tr + ", failCount = " +
                        failCnt + ", will retry", e);
                }
                mustQuery = true;       // keep on trying
                return false;           // try again next time
            }
        }

        if (logger.isLoggable(Level.FINER)) {
            logger.log(Level.FINER, "{0} trState = {1}",
                       new Object[]{this, new Integer(trState)});
        }

        failCnt = 0;    // reset failures -- we got a response

        /*
         * If the two states aren't the same, the state changed and we
         * need to account for that locally here by calling the method
         * that would make the change (the one we should have gotten).
         * (We use the external forms of abort, commit, etc., because
         * they are what the manager would call, and therefore these
         * calls will always do exactly what the incoming manager
         * calls would have done.  I don't want this to be fragile by
         * bypassing those calls and going straight to the Txn
         * object's calls, which might skip something important in the
         * OutriggerServerImpl calls.)
         */
        if (trState != txnState) {
            if (logger.isLoggable(Level.FINER)) {
                logger.log(Level.FINER,
                           "{0} mgr state[{1}] != local state [{2}]",
                           new Object[]{this, TxnConstants.getName(trState),
                                        TxnConstants.getName(txnState)});
            }

            try {
                switch (trState) {
                case ABORTED:
                    logger.log(Level.FINER, "{0} moving to abort", this);
                    monitor.space().abort(tr.mgr, tr.id);
                    return true;

                case COMMITTED:
                    logger.log(Level.FINER, "{0} moving to commit", this);
                    monitor.space().commit(tr.mgr, tr.id);
                    return true;
                }
            } catch (UnknownTransactionException e) {
                // we must somehow have already gotten the abort() or
                // commit(), and have therefore forgotten about the
                // transaction, while this code was executing
                return true;
            } catch (UnmarshalException ume) {
                throw new AssertionError(ume);
            }

            // we can't fake anything else -- the manager will have to call
            // us
        }

        logger.log(Level.FINEST, "{0} return false", this);
        return false;   // now we know, so nothing more to do
    }

    /**
     * Add in a resource.  The query may already be in, in which case it is
     * ignored, or it may be <code>null</code>, in which case it was a
     * non-leased probe that was blocked and we simply set
     * <code>mustQuery</code> to <code>true</code>.
     */
    synchronized void add(QueryWatcher query) {
        if (query == null || query.getExpiration() <= nextQuery) {
            if (logger.isLoggable(Level.FINEST))
                logger.log(Level.FINEST, "adding resource to task -- SHORT");
            mustQuery = true;
        } else {
            if (logger.isLoggable(Level.FINEST))
                logger.log(Level.FINEST, "adding resource to task -- LONG");
            if (queries == null)
                queries = new WeakHashMap(); // we use it like a WeakHashSet
            queries.put(query, null);
        }
    }

    /** Log a failed unpacking attempt. */
    private void logUnpackingFailure(String exceptionDescription, Level level,
                                     boolean terminal, Throwable t)
    {
        if (logger.isLoggable(level)) {
            logger.log(level, "Encountered " + exceptionDescription +
                " while unpacking exception to check state, " +
                (terminal ? "dropping" : "keeping") + " monitoring task", t);
        }
    }
}