/* * Copyright 2013 Rackspace * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.rackspacecloud.blueflood.service; import com.codahale.metrics.Meter; import com.codahale.metrics.Timer; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Ticker; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.rackspacecloud.blueflood.io.Constants; import com.rackspacecloud.blueflood.rollup.Granularity; import com.rackspacecloud.blueflood.rollup.SlotKey; import com.rackspacecloud.blueflood.utils.Clock; import com.rackspacecloud.blueflood.utils.DefaultClockImpl; import com.rackspacecloud.blueflood.utils.Metrics; import com.rackspacecloud.blueflood.utils.TimeValue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.management.MBeanServer; import javax.management.ObjectName; import java.lang.management.ManagementFactory; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.concurrent.TimeUnit; /** * * The {@code ScheduleContext} class coordinates access to the states of slots. * It mediates between the rollup service, the ingestion service, and the * database. * <p> * * The class's chief purpose is to coordinate access to slots' states and the * list of slots that need to be re-rolled between {@link RollupService}, * {@link IngestionService}, and the database. There's several other classes * and components involved, but those are the most important. * <p> * * When ingestion services receive incoming metrics, they will change the * associated slot's state to * {@link com.rackspacecloud.blueflood.service.UpdateStamp.State#Active Active} * (needs to be re-rolled) by calling {@link #update(long, int)}. That state * information will be persisted to the database by the * {@link ShardStatePusher} class. On the rollup nodes, the * {@link ShardStatePuller} will read the state info into memory by calling * {@link com.rackspacecloud.blueflood.service.ShardStateManager.SlotStateManager#updateSlotOnRead(SlotState)}. * The rollup service will then identity slots that need to be re-rolled (by * calling {@link #scheduleEligibleSlots(long, long, long)}), pick a slot to rollup and * mark it as * {@link com.rackspacecloud.blueflood.service.UpdateStamp.State#Running Running} * (via {@link #getNextScheduled()} ), do the rollup, and then mark the slot as * {@link com.rackspacecloud.blueflood.service.UpdateStamp.State#Rolled Rolled} * (via {@link #clearFromRunning(SlotKey)}). * <p> * * If doing the rollup fails for some reason, the rollup service will call * {@link #pushBackToScheduled(SlotKey, boolean)}, to return the slot to the * queue of slots to be rolled. * <p> * * There are additional methods for managing shards ({@link #addShard(int)} and * {@link #removeShard(int)}) which don't appear to be used. * <p> * * There are also methods which provide information about the state of things * (e.g. {@link #getScheduledCount()}, * {@link #getSlotStamps(Granularity, int)}, * {@link #getRecentlyScheduledShards()}, * {@link #getMetricsState(int, String, int)}) which either aren't used, are * only used for testing, or are used to provide information to external system * via JMX or yammer metrics. * <p> * * * Previous comments, left in case we need them: * Keeps track of dirty slots in memory. Operations must be threadsafe. * * todo: explore using ReadWrite locks (might not make a difference). * * Each node is responsible for sharded slots (time ranges) of rollups. This * class keeps track of the execution of those rollups and the states they are * in. * * When synchronizing multiple collections, do it in this order: scheduled -> running. */ public class ScheduleContext implements IngestionContext, ScheduleContextMBean { private static final Logger log = LoggerFactory.getLogger(ScheduleContext.class); private final Timer markSlotDirtyTimer = Metrics.timer(ScheduleContext.class, "Slot Mark Dirty Duration"); private final ShardStateManager shardStateManager; private transient long scheduleTime = 0L; /** these shards have been scheduled in the last 10 minutes. */ private final Cache<Integer, Long> recentlyScheduledShards = CacheBuilder.newBuilder() .maximumSize(Constants.NUMBER_OF_SHARDS) .expireAfterWrite(10, TimeUnit.MINUTES) .build(); // state // private final Meter shardOwnershipChanged = Metrics.meter(ScheduleContext.class, "Shard Change Before Running"); /** * these are all the slots that are scheduled to run in no particular order. * the collection is synchronized to control updates. */ private final Set<SlotKey> scheduledSlots = new HashSet<SlotKey>(); /** * same information as {@link #scheduledSlots}, but order is preserved. The * ordered property is only needed for getting the the next scheduled slot, * but most operations are concerned with if a slot is scheduled or not. * When you update one, you must update the other. */ private final List<SlotKey> orderedScheduledSlots = new ArrayList<SlotKey>(); /** slots that are running are not scheduled. */ private final Map<SlotKey, Long> runningSlots = new HashMap<SlotKey, Long>(); /** shard lock manager */ private final ShardLockManager lockManager; private final Clock clock; public ScheduleContext(long currentTimeMillis, Collection<Integer> managedShards, Clock clock) { this.scheduleTime = currentTimeMillis; this.shardStateManager = new ShardStateManager(managedShards, asMillisecondsSinceEpochTicker(), clock); this.lockManager = new NoOpShardLockManager(); this.clock = clock; registerMBean(); } public ScheduleContext(long currentTimeMillis, Collection<Integer> managedShards, String zookeeperCluster) { this.scheduleTime = currentTimeMillis; this.shardStateManager = new ShardStateManager(managedShards, asMillisecondsSinceEpochTicker()); ZKShardLockManager lockManager = new ZKShardLockManager(zookeeperCluster, new HashSet<Integer>(shardStateManager.getManagedShards())); lockManager.init(new TimeValue(5, TimeUnit.SECONDS)); this.lockManager = lockManager; this.clock = new DefaultClockImpl(); registerMBean(); } public ScheduleContext(long currentTimeMillis, Collection<Integer> managedShards) { this(currentTimeMillis, managedShards, new DefaultClockImpl()); } @VisibleForTesting public ScheduleContext(long currentTimeMillis, Collection<Integer> managedShards, Clock clock, ShardStateManager shardStateManager, ShardLockManager shardLockManager) { this.scheduleTime = currentTimeMillis; this.shardStateManager = shardStateManager; this.lockManager = shardLockManager; this.clock = clock; registerMBean(); } public void setCurrentTimeMillis(long millis){ scheduleTime = millis; } public long getCurrentTimeMillis() { return scheduleTime; } public ShardStateManager getShardStateManager() { return this.shardStateManager; } /** * {@inheritDoc} */ public void update(long millis, int shard) { // there are two update paths. for managed shards, we must guard the // scheduled and running collections. but for unmanaged shards, we just // let the update happen uncontested. final Timer.Context dirtyTimerCtx = markSlotDirtyTimer.time(); try { if (log.isTraceEnabled()) { log.trace("Updating {} to {}", shard, millis); } boolean isManaged = shardStateManager.contains(shard); for (Granularity g : Granularity.rollupGranularities()) { ShardStateManager.SlotStateManager slotStateManager = shardStateManager.getSlotStateManager(shard, g); int slot = g.slot(millis); if (isManaged) { synchronized (scheduledSlots) { //put SlotKey key = SlotKey.of(g, slot, shard); if (scheduledSlots.remove(key) && log.isDebugEnabled()) { // don't worry about orderedScheduledSlots log.debug("descheduled {}.", key); } } } slotStateManager.createOrUpdateForSlotAndMillisecond(slot, millis); } } finally { dirtyTimerCtx.stop(); } } /** * Loop through all slots that are eligible for rollup, at all * granularities, in all managed shards. If any are found that are not * already running or scheduled, then add them to the queue of scheduled * slots. * * Note that {@code maxAgeMillis}, {@code rollupDelayForMetricsWithShortDelay} * {@code rollupWaitForMetricsWithLongDelay} are age values, not a timestamp. * */ // only one thread should be calling in this puppy. void scheduleEligibleSlots(long maxAgeMillis, long rollupDelayForMetricsWithShortDelay, long rollupWaitForMetricsWithLongDelay) { long now = scheduleTime; ArrayList<Integer> shardKeys = new ArrayList<Integer>(shardStateManager.getManagedShards()); Collections.shuffle(shardKeys); for (int shard : shardKeys) { for (Granularity g : Granularity.rollupGranularities()) { // sync on map since we do not want anything added to or taken from it while we iterate. synchronized (scheduledSlots) { // read synchronized (runningSlots) { // read List<Integer> slotsToWorkOn = shardStateManager.getSlotStateManager(shard, g) .getSlotsEligibleForRollup(now, maxAgeMillis, rollupDelayForMetricsWithShortDelay, rollupWaitForMetricsWithLongDelay); if (slotsToWorkOn.size() == 0) { continue; } if (!canWorkOnShard(shard)) { continue; } for (Integer slot : slotsToWorkOn) { SlotKey slotKey = SlotKey.of(g, slot, shard); if (areChildKeysOrSelfKeyScheduledOrRunning(slotKey)) { continue; } SlotKey key = SlotKey.of(g, slot, shard); scheduledSlots.add(key); orderedScheduledSlots.add(key); recentlyScheduledShards.put(shard, scheduleTime); } } } } } } boolean isReroll(SlotKey slotKey) { return shardStateManager.getSlotStateManager(slotKey.getShard(), slotKey.getGranularity()) .isReroll(slotKey.getSlot(), scheduleTime); } boolean areChildKeysOrSelfKeyScheduledOrRunning(SlotKey slotKey) { // if any ineligible (children and self) keys are running or scheduled to run, we shouldn't work on this. Collection<SlotKey> ineligibleKeys = slotKey.getChildrenKeys(); if (runningSlots.keySet().contains(slotKey)) { return true; } if (scheduledSlots.contains(slotKey)) { return true; } // if any ineligible keys are running or scheduled to run, do not schedule this key. for (SlotKey childrenKey : ineligibleKeys) { if (runningSlots.keySet().contains(childrenKey)) { return true; } if (scheduledSlots.contains(childrenKey)) { return true; } } return false; } private boolean canWorkOnShard(int shard) { boolean canWork = lockManager.canWork(shard); if (!canWork) { if (log.isTraceEnabled()) log.trace("Skipping shard " + shard + " as lock could not be acquired"); } return canWork; } /** * Returns the next scheduled key. It has a few side effects: * 1) it resets update tracking for that slot * 2) it adds the key to the set of running rollups. * * @return */ @VisibleForTesting SlotKey getNextScheduled() { synchronized (scheduledSlots) { if (scheduledSlots.size() == 0) return null; synchronized (runningSlots) { SlotKey key = orderedScheduledSlots.remove(0); int slot = key.getSlot(); Granularity gran = key.getGranularity(); int shard = key.getShard(); // notice how we change the state, but the timestamp remained // the same. this is important. When the state is evaluated // (i.e., in Reader.getShardState()) we need to realize that // when timestamps are the same (this will happen), that a // remove always wins during the coalesce. scheduledSlots.remove(key); if (canWorkOnShard(shard)) { UpdateStamp stamp = shardStateManager.getSlotStateManager(shard, gran).getAndSetState(slot, UpdateStamp.State.Running); runningSlots.put(key, stamp.getTimestamp()); return key; } else { shardOwnershipChanged.mark(); return null; } } } } /** * Take the given slot out of the running group, and put it back into the * scheduled group. If {@code rescheduleImmediately} is true, the slot will * be the next slot returned by a call to {@link #getNextScheduled()}. If * {@code rescheduleImmediately} is false, then the given slot will go to * the end of the line, as when it was first scheduled by * {@link #scheduleEligibleSlots(long, long, long)}. * * @param key * @param rescheduleImmediately */ void pushBackToScheduled(SlotKey key, boolean rescheduleImmediately) { synchronized (scheduledSlots) { synchronized (runningSlots) { int slot = key.getSlot(); Granularity gran = key.getGranularity(); int shard = key.getShard(); // no need to set dirty/clean here. shardStateManager.getSlotStateManager(shard, gran).getAndSetState(slot, UpdateStamp.State.Active); scheduledSlots.add(key); log.debug("pushBackToScheduled -> added to scheduledSlots: " + key + " size:" + scheduledSlots.size()); if (rescheduleImmediately) { orderedScheduledSlots.add(0, key); } else { orderedScheduledSlots.add(key); } } } } /** * Remove the given slot from the running group after it has been * successfully re-rolled. * * @param slotKey */ void clearFromRunning(SlotKey slotKey) { synchronized (runningSlots) { runningSlots.remove(slotKey); UpdateStamp stamp = shardStateManager.getUpdateStamp(slotKey); shardStateManager.setAllCoarserSlotsDirtyForSlot(slotKey); //When state gets set to "X", before it got persisted, it might get scheduled for rollup //again, if we get delayed metrics. To prevent this we temporarily set last rollup time with current //time. This value wont get persisted. long currentTimeInMillis = clock.now().getMillis(); stamp.setLastRollupTimestamp(currentTimeInMillis); log.debug("SlotKey {} is marked in memory with last rollup time as {}", slotKey, currentTimeInMillis); // Update the stamp to Rolled state if and only if the current state // is running. If the current state is active, it means we received // a delayed put which toggled the status to Active. if (stamp.getState() == UpdateStamp.State.Running) { stamp.setState(UpdateStamp.State.Rolled); // Note: Rollup state will be updated to the last ACTIVE // timestamp which caused rollup process to kick in. stamp.setDirty(true); } } } /** * true if anything is scheduled. */ boolean hasScheduled() { return getScheduledCount() > 0; } /** * returns the number of scheduled rollups. */ int getScheduledCount() { synchronized (scheduledSlots) { return scheduledSlots.size(); } } /** * returns the number of currently running rollups. */ @VisibleForTesting int getRunningCount() { synchronized (runningSlots) { return runningSlots.size(); } } public Map<Integer, UpdateStamp> getSlotStamps(Granularity gran, int shard) { return shardStateManager.getSlotStateManager(shard, gran).getSlotStamps(); } // precondition: shard is unmanaged. void addShard(int shard) { shardStateManager.add(shard); lockManager.addShard(shard); } // precondition: shard is managed. void removeShard(int shard) { shardStateManager.remove(shard); lockManager.removeShard(shard); } Set<Integer> getRecentlyScheduledShards() { // Collections.unmodifiableSet(...) upsets JMX. return new TreeSet<Integer>(recentlyScheduledShards.asMap().keySet()); } /** * Normal {@link com.google.common.base.Ticker Ticker} behavior is to * return nanoseconds elapsed since VM started. This returns milliseconds * since the epoch based upon {@code ScheduleContext}'s internal * representation of time ({@link #scheduleTime}). * * @return an anonymous Ticker object */ // public Ticker asMillisecondsSinceEpochTicker() { return new Ticker() { @Override public long read() { return ScheduleContext.this.getCurrentTimeMillis(); } }; } @Override public Collection<String> getMetricsState(int shard, String gran, int slot) { final List<String> results = new ArrayList<String>(); Granularity granularity = Granularity.fromString(gran); if (granularity == null) return results; final Map<Integer, UpdateStamp> stateTimestamps = this.getSlotStamps(granularity, shard); if (stateTimestamps == null) return results; final UpdateStamp stamp = stateTimestamps.get(slot); if (stamp != null) { results.add(new SlotState(granularity, slot, stamp.getState()).withTimestamp(stamp.getTimestamp()).toString()); } return results; } private boolean isMbeanRegistered = false; private synchronized void registerMBean() { if (isMbeanRegistered) return; isMbeanRegistered = true; try { final MBeanServer mbs = ManagementFactory.getPlatformMBeanServer(); final String name = String.format("com.rackspacecloud.blueflood.io:type=%s", ScheduleContext.class.getSimpleName()); final ObjectName nameObj = new ObjectName(name); mbs.registerMBean(this, nameObj); } catch (Exception exc) { log.error("Unable to register mbean for " + ScheduleContext.class.getSimpleName(), exc); } } }