/* * Copyright 2013 Rackspace * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.rackspacecloud.blueflood.service; import com.codahale.metrics.Histogram; import com.codahale.metrics.Meter; import com.google.common.base.Ticker; import com.rackspacecloud.blueflood.exceptions.GranularityException; import com.rackspacecloud.blueflood.rollup.Granularity; import com.rackspacecloud.blueflood.rollup.SlotKey; import com.rackspacecloud.blueflood.utils.Clock; import com.rackspacecloud.blueflood.utils.DefaultClockImpl; import com.rackspacecloud.blueflood.utils.Metrics; import com.rackspacecloud.blueflood.utils.Util; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; public class ShardStateManager { private static final Logger log = LoggerFactory.getLogger(ShardStateManager.class); private static final Set<Integer> ALL_SHARDS = new HashSet<Integer>(Util.parseShards("ALL")); final Set<Integer> shards; // Managed shards final Map<Integer, ShardToGranularityMap> shardToGranularityStates = new HashMap<Integer, ShardToGranularityMap>(); private final Ticker serverTimeMillisecondTicker; private static final long millisInADay = 24 * 60 * 60 * 1000; private static final Histogram timeSinceUpdate = Metrics.histogram(RollupService.class, "Shard Slot Time Elapsed scheduleEligibleSlots"); // todo: CM_SPECIFIC verify changing metric class name doesn't break things. private static final Meter updateStampMeter = Metrics.meter(ShardStateManager.class, "Shard Slot Update Meter"); private final Meter parentBeforeChild = Metrics.meter(RollupService.class, "Parent slot executed before child"); private static final Map<Granularity, Meter> granToReRollMeters = new HashMap<Granularity, Meter>(); private static final Map<Granularity, Meter> reRollForShortDelayMetricsMeters = new HashMap<Granularity, Meter>(); private static final Map<Granularity, Meter> reRollForLongDelayMetricsMeters = new HashMap<Granularity, Meter>(); private static final Map<Granularity, Meter> granToDelayedMetricsMeter = new HashMap<Granularity, Meter>(); // If there are no delayed metrics, a slot should only get rolled up again after 14 days since its last rollup. Since we only allow // delayed data that is BEFORE_CURRENT_COLLECTIONTIME_MS (typically 3 days) old, we are assuming that if rollup happens again within // that time, its a re-roll because of delayed data. public static final long REROLL_TIME_SPAN_ASSUMED_VALUE = Configuration.getInstance().getLongProperty( CoreConfig.BEFORE_CURRENT_COLLECTIONTIME_MS ); private final Clock clock; static { for (Granularity rollupGranularity : Granularity.rollupGranularities()) { granToReRollMeters.put(rollupGranularity, Metrics.meter(RollupService.class, String.format("%s Re-rolling up because of delayed metrics", rollupGranularity.shortName()))); granToDelayedMetricsMeter.put(rollupGranularity, Metrics.meter(RollupService.class, String.format("Delayed metric that has a danger of TTLing", rollupGranularity.shortName()))); reRollForShortDelayMetricsMeters.put(rollupGranularity, Metrics.meter(RollupService.class, String.format("%s Slots waiting to be re-rolled because of short delay metrics", rollupGranularity.shortName()))); reRollForLongDelayMetricsMeters.put(rollupGranularity, Metrics.meter(RollupService.class, String.format("%s Slots waiting to be re-rolled because of long delay metrics", rollupGranularity.shortName()))); } } protected ShardStateManager(Collection<Integer> shards, Ticker ticker) { this(shards, ticker, new DefaultClockImpl()); } protected ShardStateManager(Collection<Integer> shards, Ticker ticker, Clock clock) { this.shards = new HashSet<Integer>(shards); for (Integer shard : ALL_SHARDS) { // Why not just do this for managed shards? shardToGranularityStates.put(shard, new ShardToGranularityMap(shard)); } this.serverTimeMillisecondTicker = ticker; this.clock = clock; } protected Collection<Integer> getManagedShards() { return Collections.unmodifiableCollection(this.shards); } protected Boolean contains(int shard) { return shards.size() != 0 && shards.contains(shard); } protected void add(int shard) { if (contains(shard)) return; shards.add(shard); } protected void remove(int shard) { if (!contains(shard)) return; this.shards.remove(shard); } public SlotStateManager getSlotStateManager(int shard, Granularity granularity) { return shardToGranularityStates.get(shard).granularityToSlots.get(granularity); } protected UpdateStamp getUpdateStamp(SlotKey slotKey) { SlotStateManager slotStateManager = this.getSlotStateManager(slotKey.getShard(), slotKey.getGranularity()); UpdateStamp stamp = slotStateManager.slotToUpdateStampMap.get(slotKey.getSlot()); return stamp; } // Side effect: mark dirty slots as clean protected Map<Granularity, Map<Integer, UpdateStamp>> getDirtySlotsToPersist(int shard) { Map<Granularity, Map<Integer, UpdateStamp>> slotTimes = new HashMap<Granularity, Map<Integer, UpdateStamp>>(); int numUpdates = 0; for (Granularity gran : Granularity.rollupGranularities()) { Map<Integer, UpdateStamp> dirty = getSlotStateManager(shard, gran).getDirtySlotStampsAndMarkClean(); slotTimes.put(gran, dirty); if (dirty.size() > 0) { numUpdates += dirty.size(); } } if (numUpdates > 0) { // for updates that come by way of scribe, you'll typically see 5 as the number of updates (one for // each granularity). On rollup slaves the situation is a bit different. You'll see only the slot // of the granularity just written to marked dirty (so 1). log.debug("Found {} dirty slots for shard {}", new Object[]{numUpdates, shard}); return slotTimes; } return null; } public void updateSlotOnRead(int shard, SlotState slotState) { getSlotStateManager(shard, slotState.getGranularity()).updateSlotOnRead(slotState); } public void setAllCoarserSlotsDirtyForSlot(SlotKey slotKey) { boolean done = false; Granularity coarserGran = slotKey.getGranularity(); int coarserSlot = slotKey.getSlot(); while (!done) { try { coarserGran = coarserGran.coarser(); coarserSlot = coarserGran.slotFromFinerSlot(coarserSlot); ConcurrentMap<Integer, UpdateStamp> updateStampsBySlotMap = getSlotStateManager(slotKey.getShard(), coarserGran).slotToUpdateStampMap; UpdateStamp coarseSlotStamp = updateStampsBySlotMap.get(coarserSlot); if (coarseSlotStamp == null) { log.debug("No stamp for coarser slot: {}; supplied slot: {}", SlotKey.of(coarserGran, coarserSlot, slotKey.getShard()), slotKey); updateStampsBySlotMap.putIfAbsent(coarserSlot, new UpdateStamp(serverTimeMillisecondTicker.read(), UpdateStamp.State.Active, true)); continue; } UpdateStamp.State coarseSlotState = coarseSlotStamp.getState(); if (coarseSlotState != UpdateStamp.State.Active) { parentBeforeChild.mark(); log.debug("Coarser slot not in active state when finer slot {} just got rolled up. Marking coarser slot {} dirty.", slotKey, SlotKey.of(coarserGran, coarserSlot, slotKey.getShard())); coarseSlotStamp.setState(UpdateStamp.State.Active); coarseSlotStamp.setDirty(true); coarseSlotStamp.setTimestamp(serverTimeMillisecondTicker.read()); } } catch (GranularityException ex) { done = true; } } } private class ShardToGranularityMap { final int shard; final Map<Granularity, SlotStateManager> granularityToSlots = new HashMap<Granularity, SlotStateManager>(); protected ShardToGranularityMap(int shard) { this.shard = shard; for (Granularity granularity : Granularity.rollupGranularities()) { granularityToSlots.put(granularity, new SlotStateManager(shard, granularity)); } } } protected class SlotStateManager { private final int shard; final Granularity granularity; final ConcurrentMap<Integer, UpdateStamp> slotToUpdateStampMap; protected SlotStateManager(int shard, Granularity granularity) { this.shard = shard; this.granularity = granularity; slotToUpdateStampMap = new ConcurrentHashMap<Integer, UpdateStamp>(granularity.numSlots()); } /** * * Imagine metrics are flowing in from multiple ingestor nodes. The * ingestion path updates schedule context while writing metrics to * cassandra.(See * {@link com.rackspacecloud.blueflood.inputs.processors.BatchWriter BatchWriter}). * We cannot make any ordering guarantees on the metrics. So every * metric that comes in updates the slot state to its collection time. * <p> * * This state gets pushed in cassandra by {@link ShardStatePusher} and read on * the rollup slave. Rollup slave is going to update its state to * {@link com.rackspacecloud.blueflood.service.UpdateStamp.State#Active ACTIVE} * as long as the timestamp does not match. Rollup slave shard * map can be in 3 states: * 1) {@link com.rackspacecloud.blueflood.service.UpdateStamp.State#Active Active} * 2) {@link com.rackspacecloud.blueflood.service.UpdateStamp.State#Rolled Rolled} * 3) {@link com.rackspacecloud.blueflood.service.UpdateStamp.State#Running Running}. * Every {@code ACTIVE} update is taken for Rolled and Running states, * but if the shard map is already in an {@code ACTIVE} state, then the * update happens only if the timestamp of update coming in is greater * than what we have. On Rollup slave it means eventually when it rolls * up data for the {@code ACTIVE} slot, it will be marked with the * collection time belonging to a metric which was generated later. * <p> * * For a case of multiple ingestors, it means eventually higher * timestamp will win, and will be updated even if that ingestor did * not receive metric with that timestamp and will stop triggering the * state to {@code ACTIVE} on rollup host. After this convergence is * reached the last rollup time match with the last active times on all * ingestor nodes. * * LastUpdateTime of an active slot is used as last ingest time and lastUpdateTime * of a rolled slot is used as last rollup time. UpdateStamp which is in memory * for each slot has the last ingest time and last rollup time of that slot. */ protected void updateSlotOnRead(SlotState slotState) { final int slot = slotState.getSlot(); final long timestamp = slotState.getTimestamp(); UpdateStamp.State state = slotState.getState(); //For slots in state "A", this would be last ingest time //For slots in state "X", this would be last rollup time final long lastUpdateTimestamp = slotState.getLastUpdatedTimestamp(); UpdateStamp stampInMemory = slotToUpdateStampMap.get(slot); if (stampInMemory == null) { // haven't seen this slot before, take the update. This happens when a blueflood service is just started. slotToUpdateStampMap.put(slot, new UpdateStamp(timestamp, state, false, 0, lastUpdateTimestamp)); } else if (stampInMemory.getTimestamp() != timestamp && state.equals(UpdateStamp.State.Active)) { // 1) new update coming in. We can be in 3 states 1) Active 2) Rolled 3) Running. Apply the update in all cases except when we are already active and // the triggering timestamp we have is greater or the stampInMemory is yet to be persisted i.e still dirty // This "if" is equivalent to: // if (current is not active) || (current is older && clean) if (!(stampInMemory.getState().equals(UpdateStamp.State.Active) && (stampInMemory.getTimestamp() > timestamp || stampInMemory.isDirty()))) { slotToUpdateStampMap.put(slot, new UpdateStamp(timestamp, state, false, stampInMemory.getLastRollupTimestamp(), lastUpdateTimestamp)); } else { // keep rewriting the newer timestamp, in case it has been overwritten: stampInMemory.setDirty(true); // This is crucial for convergence, we need to superimpose a higher timestamp which can be done only if we set it to dirty } } else if (stampInMemory.getTimestamp() == timestamp && state.equals(UpdateStamp.State.Rolled)) { // 2) if current value is same but value being applied is a remove, remove wins. stampInMemory.setState(UpdateStamp.State.Rolled); //For incoming update(from metrics_state) of "Rolled" status, we use its last updated time as the last rollup time. if (lastUpdateTimestamp > stampInMemory.getLastRollupTimestamp()) stampInMemory.setLastRollupTimestamp(lastUpdateTimestamp); } else if (state.equals(UpdateStamp.State.Rolled)) { //For incoming update(from metrics_state) of "Rolled" status, we use its last updated time as the last rollup time. if (lastUpdateTimestamp > stampInMemory.getLastRollupTimestamp()) stampInMemory.setLastRollupTimestamp(lastUpdateTimestamp); } } protected void createOrUpdateForSlotAndMillisecond(int slot, long millis) { long nowMillis = clock.now().getMillis(); if (slotToUpdateStampMap.containsKey(slot)) { UpdateStamp stamp = slotToUpdateStampMap.get(slot); stamp.setTimestamp(millis); // Temporarily setting last ingest time to current time until we get more accurate value from db. // This will not be persisted. stamp.setLastIngestTimestamp(nowMillis); stamp.setState(UpdateStamp.State.Active); stamp.setDirty(true); } else { slotToUpdateStampMap.put(slot, new UpdateStamp(millis, UpdateStamp.State.Active, true, 0, nowMillis)); } updateStampMeter.mark(); } protected Map<Integer, UpdateStamp> getDirtySlotStampsAndMarkClean() { HashMap<Integer, UpdateStamp> dirtySlots = new HashMap<Integer, UpdateStamp>(); for (Map.Entry<Integer, UpdateStamp> entry : slotToUpdateStampMap.entrySet()) { if (entry.getValue().isDirty()) { dirtySlots.put(entry.getKey(), entry.getValue()); entry.getValue().setDirty(false); } } return dirtySlots; } protected UpdateStamp getAndSetState(int slot, UpdateStamp.State state) { UpdateStamp stamp = slotToUpdateStampMap.get(slot); stamp.setState(state); return stamp; } // gets a snapshot of the last updates protected Map<Integer, UpdateStamp> getSlotStamps() { // essentially a copy on read map. return Collections.unmodifiableMap(slotToUpdateStampMap); } /** * Determines if a slot is being re-rolled or not. * * Since we only allow delayed metrics upto 3 days(BEFORE_CURRENT_COLLECTIONTIME_MS), a slot can be * identified as being re-rolled, if the last rollup is within those last 3 days. * * @param slot * @param now * @return */ protected boolean isReroll(int slot, long now) { final UpdateStamp updateStamp = slotToUpdateStampMap.get(slot); final long timeElapsedSinceLastRollup = now - updateStamp.getLastRollupTimestamp(); if (updateStamp.getLastRollupTimestamp() > 0 && timeElapsedSinceLastRollup < REROLL_TIME_SPAN_ASSUMED_VALUE) { return true; } return false; } /** * A slot will become eligible for rollup/re-roll based on the below three configs. * * 1) 1st rollup -> Eligible after ROLLUP_DELAY_MILLIS from collection time. * 2) 1st re-roll -> This happens for metrics with short delay (within SHORT_DELAY_METRICS_ROLLUP_DELAY_MILLIS). * Eligible after SHORT_DELAY_METRICS_ROLLUP_DELAY_MILLIS from collection time. * 3) nth re-roll -> This happens for metrics with long delay(more than SHORT_DELAY_METRICS_ROLLUP_DELAY_MILLIS). * Eligible after LONG_DELAY_METRICS_ROLLUP_WAIT_MILLIS from last ingest time. * This re-roll repeats as we keep getting delayed metrics. * * * |<-- SHORT_DELAY_METRICS_ROLLUP_DELAY--->| * | | * |<-- ROLLUP_DELAY--->| | |<--ROLLUP_WAIT-->| |<--ROLLUP_WAIT-->| * | | | | | | | * ------------------------------------------------------------------------------------------------------------- * |slot| ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ * | X | | | | | | | | | | | * | delayed metric | delayed metric | delayed metrics | * | | | | * 1st rollup 1st re-roll 2nd re-roll 3rd re-roll * * * @param now is current time * @param maxAgeMillis is ROLLUP_DELAY_MILLIS * @param rollupDelayForMetricsWithShortDelay is SHORT_DELAY_METRICS_ROLLUP_DELAY_MILLIS * @param rollupWaitForMetricsWithLongDelay is LONG_DELAY_METRICS_ROLLUP_WAIT_MILLIS * @return list of slots that are eligible for rollup */ protected List<Integer> getSlotsEligibleForRollup(long now, long maxAgeMillis, long rollupDelayForMetricsWithShortDelay, long rollupWaitForMetricsWithLongDelay) { List<Integer> outputKeys = new ArrayList<Integer>(); long nowMillis = clock.now().getMillis(); for (Map.Entry<Integer, UpdateStamp> entry : slotToUpdateStampMap.entrySet()) { final int slot = entry.getKey(); final UpdateStamp update = entry.getValue(); final long timeElapsed = now - update.getTimestamp(); timeSinceUpdate.update(timeElapsed); if (update.getState() == UpdateStamp.State.Rolled) { continue; } if (timeElapsed <= maxAgeMillis) { continue; } //Handling re-rolls: if (isReroll(slot, now)) { SlotKey slotKey = SlotKey.of(granularity, entry.getKey(), shard); //short delay if (timeElapsed <= rollupDelayForMetricsWithShortDelay) { reRollForShortDelayMetricsMeters.get(granularity).mark(); log.debug(String.format("Short delay: Delaying re-roll of slotKey [%s] as [%d] millis " + "haven't elapsed since collection time:[%d] now: [%d] time elapsed: [%d] last " + "rollup time: [%d]", slotKey, rollupDelayForMetricsWithShortDelay, update.getTimestamp(), now, timeElapsed, update.getLastRollupTimestamp())); continue; } if (update.getLastIngestTimestamp() > 0 ) { long delayOfLastIngestedMetric = update.getLastIngestTimestamp() - update.getTimestamp(); final long timeElapsedSinceLastIngest = now - update.getLastIngestTimestamp(); //long delay if (delayOfLastIngestedMetric > rollupDelayForMetricsWithShortDelay && timeElapsedSinceLastIngest <= rollupWaitForMetricsWithLongDelay) { reRollForLongDelayMetricsMeters.get(granularity).mark(); log.debug(String.format("Long delay: Delaying re-roll of slotKey [%s] as we received " + "delayed metrics within the last [%d] millis with rollup_wait of [%d] millis. last " + "ingest time: [%d]", slotKey, timeElapsedSinceLastIngest, rollupWaitForMetricsWithLongDelay, update.getLastIngestTimestamp())); continue; } } granToReRollMeters.get(granularity).mark(); if (nowMillis - update.getTimestamp() >= millisInADay) { granToDelayedMetricsMeter.get(granularity).mark(); } } outputKeys.add(entry.getKey()); } return outputKeys; } } }