/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs; import com.google.common.collect.Iterators; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.hdfs.inotify.EventBatch; import org.apache.hadoop.hdfs.inotify.EventBatchList; import org.apache.hadoop.hdfs.inotify.MissingEventsException; import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.util.Time; import org.apache.htrace.Sampler; import org.apache.htrace.Trace; import org.apache.htrace.TraceScope; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.Iterator; import java.util.Random; import java.util.concurrent.TimeUnit; /** * Stream for reading inotify events. DFSInotifyEventInputStreams should not * be shared among multiple threads. */ @InterfaceAudience.Public @InterfaceStability.Unstable public class DFSInotifyEventInputStream { public static Logger LOG = LoggerFactory.getLogger(DFSInotifyEventInputStream .class); /** * The trace sampler to use when making RPCs to the NameNode. */ private final Sampler<?> traceSampler; private final ClientProtocol namenode; private Iterator<EventBatch> it; private long lastReadTxid; /** * The most recent txid the NameNode told us it has sync'ed -- helps us * determine how far behind we are in the edit stream. */ private long syncTxid; /** * Used to generate wait times in {@link DFSInotifyEventInputStream#take()}. */ private Random rng = new Random(); private static final int INITIAL_WAIT_MS = 10; DFSInotifyEventInputStream(Sampler<?> traceSampler, ClientProtocol namenode) throws IOException { // Only consider new transaction IDs. this(traceSampler, namenode, namenode.getCurrentEditLogTxid()); } DFSInotifyEventInputStream(Sampler traceSampler, ClientProtocol namenode, long lastReadTxid) throws IOException { this.traceSampler = traceSampler; this.namenode = namenode; this.it = Iterators.emptyIterator(); this.lastReadTxid = lastReadTxid; } /** * Returns the next batch of events in the stream or null if no new * batches are currently available. * * @throws IOException because of network error or edit log * corruption. Also possible if JournalNodes are unresponsive in the * QJM setting (even one unresponsive JournalNode is enough in rare cases), * so catching this exception and retrying at least a few times is * recommended. * @throws MissingEventsException if we cannot return the next batch in the * stream because the data for the events (and possibly some subsequent * events) has been deleted (generally because this stream is a very large * number of transactions behind the current state of the NameNode). It is * safe to continue reading from the stream after this exception is thrown * The next available batch of events will be returned. */ public EventBatch poll() throws IOException, MissingEventsException { TraceScope scope = Trace.startSpan("inotifyPoll", traceSampler); try { // need to keep retrying until the NN sends us the latest committed txid if (lastReadTxid == -1) { LOG.debug("poll(): lastReadTxid is -1, reading current txid from NN"); lastReadTxid = namenode.getCurrentEditLogTxid(); return null; } if (!it.hasNext()) { EventBatchList el = namenode.getEditsFromTxid(lastReadTxid + 1); if (el.getLastTxid() != -1) { // we only want to set syncTxid when we were actually able to read some // edits on the NN -- otherwise it will seem like edits are being // generated faster than we can read them when the problem is really // that we are temporarily unable to read edits syncTxid = el.getSyncTxid(); it = el.getBatches().iterator(); long formerLastReadTxid = lastReadTxid; lastReadTxid = el.getLastTxid(); if (el.getFirstTxid() != formerLastReadTxid + 1) { throw new MissingEventsException(formerLastReadTxid + 1, el.getFirstTxid()); } } else { LOG.debug("poll(): read no edits from the NN when requesting edits " + "after txid {}", lastReadTxid); return null; } } if (it.hasNext()) { // can be empty if el.getLastTxid != -1 but none of the // newly seen edit log ops actually got converted to events return it.next(); } else { return null; } } finally { scope.close(); } } /** * Return a estimate of how many transaction IDs behind the NameNode's * current state this stream is. Clients should periodically call this method * and check if its result is steadily increasing, which indicates that they * are falling behind (i.e. transaction are being generated faster than the * client is reading them). If a client falls too far behind events may be * deleted before the client can read them. * <p/> * A return value of -1 indicates that an estimate could not be produced, and * should be ignored. The value returned by this method is really only useful * when compared to previous or subsequent returned values. */ public long getTxidsBehindEstimate() { if (syncTxid == 0) { return -1; } else { assert syncTxid >= lastReadTxid; // this gives the difference between the last txid we have fetched to the // client and syncTxid at the time we last fetched events from the // NameNode return syncTxid - lastReadTxid; } } /** * Returns the next event batch in the stream, waiting up to the specified * amount of time for a new batch. Returns null if one is not available at the * end of the specified amount of time. The time before the method returns may * exceed the specified amount of time by up to the time required for an RPC * to the NameNode. * * @param time number of units of the given TimeUnit to wait * @param tu the desired TimeUnit * @throws IOException see {@link DFSInotifyEventInputStream#poll()} * @throws MissingEventsException * see {@link DFSInotifyEventInputStream#poll()} * @throws InterruptedException if the calling thread is interrupted */ public EventBatch poll(long time, TimeUnit tu) throws IOException, InterruptedException, MissingEventsException { TraceScope scope = Trace.startSpan("inotifyPollWithTimeout", traceSampler); EventBatch next = null; try { long initialTime = Time.monotonicNow(); long totalWait = TimeUnit.MILLISECONDS.convert(time, tu); long nextWait = INITIAL_WAIT_MS; while ((next = poll()) == null) { long timeLeft = totalWait - (Time.monotonicNow() - initialTime); if (timeLeft <= 0) { LOG.debug("timed poll(): timed out"); break; } else if (timeLeft < nextWait * 2) { nextWait = timeLeft; } else { nextWait *= 2; } LOG.debug("timed poll(): poll() returned null, sleeping for {} ms", nextWait); Thread.sleep(nextWait); } } finally { scope.close(); } return next; } /** * Returns the next batch of events in the stream, waiting indefinitely if * a new batch is not immediately available. * * @throws IOException see {@link DFSInotifyEventInputStream#poll()} * @throws MissingEventsException see * {@link DFSInotifyEventInputStream#poll()} * @throws InterruptedException if the calling thread is interrupted */ public EventBatch take() throws IOException, InterruptedException, MissingEventsException { TraceScope scope = Trace.startSpan("inotifyTake", traceSampler); EventBatch next = null; try { int nextWaitMin = INITIAL_WAIT_MS; while ((next = poll()) == null) { // sleep for a random period between nextWaitMin and nextWaitMin * 2 // to avoid stampedes at the NN if there are multiple clients int sleepTime = nextWaitMin + rng.nextInt(nextWaitMin); LOG.debug("take(): poll() returned null, sleeping for {} ms", sleepTime); Thread.sleep(sleepTime); // the maximum sleep is 2 minutes nextWaitMin = Math.min(60000, nextWaitMin * 2); } } finally { scope.close(); } return next; } }