/* * Copyright 2013 Rackspace * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.rackspacecloud.blueflood.service; import com.codahale.metrics.*; import com.codahale.metrics.Timer; import com.google.common.annotations.VisibleForTesting; import com.google.common.util.concurrent.ThreadFactoryBuilder; import com.rackspacecloud.blueflood.concurrent.InstrumentedThreadPoolExecutor; import com.rackspacecloud.blueflood.concurrent.ThreadPoolBuilder; import com.rackspacecloud.blueflood.rollup.Granularity; import com.rackspacecloud.blueflood.rollup.SlotKey; import com.rackspacecloud.blueflood.tools.jmx.JmxBooleanGauge; import com.rackspacecloud.blueflood.utils.Metrics; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.management.MBeanServer; import javax.management.ObjectName; import java.lang.management.ManagementFactory; import java.util.*; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicLong; public class RollupService implements Runnable, RollupServiceMBean { private static final Logger log = LoggerFactory.getLogger(RollupService.class); private final Timer polltimer = Metrics.timer(RollupService.class, "Poll Timer"); private final Meter rejectedSlotChecks = Metrics.meter(RollupService.class, "Rejected Slot Checks"); private final long rollupDelayMillis; private final long rollupDelayForMetricsWithShortDelay; private final long rollupWaitForMetricsWithLongDelay; private transient Thread thread; private final ScheduleContext context; private final ShardStateManager shardStateManager; private final ThreadPoolExecutor locatorFetchExecutors; private final ThreadPoolExecutor rollupReadExecutors; private final ThreadPoolExecutor rollupWriteExecutors; private long pollerPeriod; private final long configRefreshInterval; private long lastSlotCheckFinishedAt = 0L; private boolean active = true; private boolean keepingServerTime = true; private Gauge activeGauge; private Gauge inflightRollupGauge; private Gauge pollerPeriodGauge; private Gauge serverTimeGauge; private Gauge rollupConcurrencyGauge; private Gauge scheduledSlotCheckGauge; private Gauge secondsSinceLastSlotCheckGauge; private Gauge queuedRollupGauge; private Gauge slotCheckConcurrencyGauge; private Gauge recentlyScheduledShardGauge; private Gauge managedShardGauge; protected static final AtomicLong lastRollupTime = new AtomicLong(System.currentTimeMillis()); private static final Gauge<Long> timeSinceLastRollupGauge; static { timeSinceLastRollupGauge = new Gauge<Long>() { @Override public Long getValue() { return System.currentTimeMillis() - lastRollupTime.get(); } }; Metrics.getRegistry().register(MetricRegistry.name(RollupService.class, "Milliseconds Since Last Rollup"), timeSinceLastRollupGauge); } public RollupService(ScheduleContext context) { pollerPeriod = Configuration.getInstance().getIntegerProperty(CoreConfig.SCHEDULE_POLL_PERIOD); configRefreshInterval = Configuration.getInstance().getIntegerProperty(CoreConfig.CONFIG_REFRESH_PERIOD); this.context = context; this.shardStateManager = context.getShardStateManager(); // NOTE: higher locatorFetchConcurrency means that the queue used in rollupReadExecutors needs to be correspondingly // higher. Configuration config = Configuration.getInstance(); rollupDelayMillis = config.getLongProperty(CoreConfig.ROLLUP_DELAY_MILLIS); rollupDelayForMetricsWithShortDelay = config.getLongProperty(CoreConfig.SHORT_DELAY_METRICS_ROLLUP_DELAY_MILLIS); rollupWaitForMetricsWithLongDelay = config.getLongProperty(CoreConfig.LONG_DELAY_METRICS_ROLLUP_WAIT_MILLIS); log.info(String.format("Delay configs -> ROLLUP_DELAY_MILLIS: [%d] SHORT_DELAY_METRICS_ROLLUP_DELAY_MILLIS: [%d] " + "LONG_DELAY_METRICS_ROLLUP_WAIT_MILLIS: [%d]", rollupDelayMillis, rollupDelayForMetricsWithShortDelay, rollupWaitForMetricsWithLongDelay)); ThreadFactory locatorFetchThreadFactory = new ThreadFactoryBuilder().setNameFormat("locator-fetcher-%d").build(); final int locatorFetchConcurrency = config.getIntegerProperty(CoreConfig.MAX_LOCATOR_FETCH_THREADS); ThreadPoolExecutor _locatorFetchExecutors = new ThreadPoolExecutor( locatorFetchConcurrency, locatorFetchConcurrency, 30, TimeUnit.SECONDS, new ArrayBlockingQueue<Runnable>(locatorFetchConcurrency * 5), locatorFetchThreadFactory, new RejectedExecutionHandler() { public void rejectedExecution(Runnable r, ThreadPoolExecutor executor) { // in this case, we want to throw a RejectedExecutionException so that the slot can be removed // from the running queue. throw new RejectedExecutionException("Threadpool is saturated. unable to service this slot."); } } ) { @Override protected void afterExecute(Runnable r, Throwable t) { lastSlotCheckFinishedAt = RollupService.this.context.getCurrentTimeMillis(); super.afterExecute(r, t); } }; // unbounded work queue. ThreadFactory rollupReaderThreadFactory = new ThreadFactoryBuilder().setNameFormat("rollup-reader-%d").build(); final BlockingQueue<Runnable> rollupReadQueue = new LinkedBlockingQueue<Runnable>(); ThreadPoolExecutor _rollupReadExecutors = new ThreadPoolExecutor( // "RollupReadsThreadpool", config.getIntegerProperty(CoreConfig.MAX_ROLLUP_READ_THREADS), config.getIntegerProperty(CoreConfig.MAX_ROLLUP_READ_THREADS), 30, TimeUnit.SECONDS, rollupReadQueue, rollupReaderThreadFactory, new ThreadPoolExecutor.AbortPolicy() ); ThreadFactory rollupWriterThreadFactory = new ThreadFactoryBuilder().setNameFormat("rollup-writer-%d").build(); final BlockingQueue<Runnable> rollupWriteQueue = new LinkedBlockingQueue<Runnable>(); ThreadPoolExecutor _rollupWriteExecutors = new ThreadPoolExecutor( // "RollupWritesThreadpool", config.getIntegerProperty(CoreConfig.MAX_ROLLUP_WRITE_THREADS), config.getIntegerProperty(CoreConfig.MAX_ROLLUP_WRITE_THREADS), 30, TimeUnit.SECONDS, rollupWriteQueue, rollupWriterThreadFactory, new ThreadPoolExecutor.AbortPolicy() ); initializeGauges(); locatorFetchExecutors = _locatorFetchExecutors; InstrumentedThreadPoolExecutor.instrument(locatorFetchExecutors, "LocatorFetchThreadPool"); rollupReadExecutors = _rollupReadExecutors; InstrumentedThreadPoolExecutor.instrument(rollupReadExecutors, "RollupReadsThreadpool"); rollupWriteExecutors = _rollupWriteExecutors; InstrumentedThreadPoolExecutor.instrument(rollupWriteExecutors, "RollupWritesThreadpool"); } @VisibleForTesting public RollupService(ScheduleContext context, ShardStateManager shardStateManager, ThreadPoolExecutor locatorFetchExecutors, ThreadPoolExecutor rollupReadExecutors, ThreadPoolExecutor rollupWriteExecutors, long rollupDelayMillis, long rollupDelayForMetricsWithShortDelay, long rollupWaitForMetricsWithLongDelay, long pollerPeriod, long configRefreshInterval) { this.context = context; this.shardStateManager = shardStateManager; this.locatorFetchExecutors = locatorFetchExecutors; this.rollupReadExecutors = rollupReadExecutors; this.rollupWriteExecutors = rollupWriteExecutors; this.rollupDelayMillis = rollupDelayMillis; this.rollupDelayForMetricsWithShortDelay = rollupDelayForMetricsWithShortDelay; this.rollupWaitForMetricsWithLongDelay = rollupWaitForMetricsWithLongDelay; this.pollerPeriod = pollerPeriod; this.configRefreshInterval = configRefreshInterval; initializeGauges(); } public void initializeGauges() { try { final MBeanServer mbs = ManagementFactory.getPlatformMBeanServer(); final String name = String.format("com.rackspacecloud.blueflood.service:type=%s", getClass().getSimpleName()); final ObjectName nameObj = new ObjectName(name); mbs.registerMBean(this, nameObj); MetricRegistry reg = Metrics.getRegistry(); activeGauge = reg.register(MetricRegistry.name(RollupService.class, "Active"), new JmxBooleanGauge(nameObj, "Active")); inflightRollupGauge = reg.register(MetricRegistry.name(RollupService.class, "In Flight Rollup Count"), new JmxAttributeGauge(nameObj, "InFlightRollupCount")); pollerPeriodGauge = reg.register(MetricRegistry.name(RollupService.class, "Poller Period"), new JmxAttributeGauge(nameObj, "PollerPeriod")); queuedRollupGauge = reg.register(MetricRegistry.name(RollupService.class, "Queued Rollup Count"), new JmxAttributeGauge(nameObj, "QueuedRollupCount")); rollupConcurrencyGauge = reg.register(MetricRegistry.name(RollupService.class, "Rollup Concurrency"), new JmxAttributeGauge(nameObj, "RollupConcurrency")); scheduledSlotCheckGauge = reg.register(MetricRegistry.name(RollupService.class, "Scheduled Slot Check"), new JmxAttributeGauge(nameObj, "ScheduledSlotCheckCount")); secondsSinceLastSlotCheckGauge = reg.register(MetricRegistry.name(RollupService.class, "Seconds Since Last Slot Check"), new JmxAttributeGauge(nameObj, "SecondsSinceLastSlotCheck")); serverTimeGauge = reg.register(MetricRegistry.name(RollupService.class, "Server Time"), new JmxAttributeGauge(nameObj, "ServerTime")); slotCheckConcurrencyGauge = reg.register(MetricRegistry.name(RollupService.class, "Slot Check Concurrency"), new JmxAttributeGauge(nameObj, "SlotCheckConcurrency")); recentlyScheduledShardGauge = reg.register(MetricRegistry.name(RollupService.class, "Recently Scheduled Shards"), new Gauge<Integer>() { @Override public Integer getValue() { return getRecentlyScheduledShards().size(); } }); managedShardGauge = reg.register(MetricRegistry.name(RollupService.class, "Managed Shards"), new Gauge<Integer>() { @Override public Integer getValue() { return getManagedShards().size(); } }); } catch (Exception exc) { log.error("Unable to register mbean for " + getClass().getSimpleName(), exc); } } public void forcePoll() { thread.interrupt(); } final void poll() { Timer.Context timer = polltimer.time(); // schedule for rollup anything that has not been updated in ROLLUP_DELAY_SECS context.scheduleEligibleSlots(rollupDelayMillis, rollupDelayForMetricsWithShortDelay, rollupWaitForMetricsWithLongDelay); timer.stop(); } public void run() { thread = Thread.currentThread(); while (shouldKeepRunning()) { long startRun = System.currentTimeMillis(); poll(); // if there are schedules slots, run what we can. boolean rejected = false; while (context.hasScheduled() && !rejected && active) { final SlotKey slotKey = context.getNextScheduled(); if (slotKey == null) { continue; } try { UpdateStamp stamp = shardStateManager.getUpdateStamp(slotKey); long currentTimeMillis = context.getCurrentTimeMillis(); boolean isReroll = context.isReroll(slotKey); log.info("Scheduling slotKey {} @ {} last collection time: {} last rollup time: {} isReroll: {}", new Object[]{slotKey, currentTimeMillis, stamp.getTimestamp(), stamp.getLastRollupTimestamp(), isReroll}); locatorFetchExecutors.execute(new LocatorFetchRunnable(context, slotKey, rollupReadExecutors, rollupWriteExecutors)); } catch (RejectedExecutionException ex) { // puts it back at the top of the list of scheduled slots. When this happens it means that // there is too much rollup work to do. if the CPU cores are not tapped out, it means you don't // have enough threads allocated to processing rollups or slot checks. rejectedSlotChecks.mark(); context.pushBackToScheduled(slotKey, true); rejected = true; } } long endRun = System.currentTimeMillis(); if (endRun - startRun > pollerPeriod) log.error("It took longer than {} to poll for rollups.", pollerPeriod); else try { thread.sleep(Math.max(0, pollerPeriod - endRun + startRun)); } catch (Exception ex) { log.debug("RollupService poller woke up"); } } } private boolean keepRunning = true; @VisibleForTesting boolean shouldKeepRunning() { return keepRunning; } @VisibleForTesting void setShouldKeepRunning(boolean value) { keepRunning = value; } // // JMX exposure // // set the server time in millis. public synchronized void setServerTime(long millis) { log.info("Manually setting server time to {} {}", millis, new java.util.Date(millis)); context.setCurrentTimeMillis(millis); } // get the server time in seconds. public synchronized long getServerTime() { return context.getCurrentTimeMillis(); } public synchronized void setKeepingServerTime(boolean b) { keepingServerTime = b; } public synchronized boolean getKeepingServerTime() { return keepingServerTime; } public synchronized void setPollerPeriod(long l) { // todo: alter the design so that you don't have to keep a thread reference around. one way to do this is to // override the function in the caller (where the thread is accessible). pollerPeriod = l; if (thread != null) thread.interrupt(); } public synchronized long getPollerPeriod() { return pollerPeriod; } public synchronized int getScheduledSlotCheckCount() { return context.getScheduledCount(); } public synchronized int getSecondsSinceLastSlotCheck() { return (int)((context.getCurrentTimeMillis() - lastSlotCheckFinishedAt) / 1000); } public synchronized int getSlotCheckConcurrency() { return locatorFetchExecutors.getMaximumPoolSize(); } public synchronized void setSlotCheckConcurrency(int i) { locatorFetchExecutors.setCorePoolSize(i); locatorFetchExecutors.setMaximumPoolSize(i); } public synchronized int getRollupConcurrency() { return rollupReadExecutors.getMaximumPoolSize(); } public synchronized void setRollupConcurrency(int i) { rollupReadExecutors.setCorePoolSize(i); rollupReadExecutors.setMaximumPoolSize(i); } public synchronized int getQueuedRollupCount() { return rollupReadExecutors.getQueue().size(); } public synchronized int getInFlightRollupCount() { return rollupReadExecutors.getActiveCount(); } public synchronized boolean getActive() { return active; } public synchronized void setActive(boolean b) { active = b; if (active && thread != null) thread.interrupt(); } /** * Add a shard to be managed (via JMX) * * @param shard shard to be added */ public void addShard(Integer shard) { if (!shardStateManager.getManagedShards().contains(shard)) context.addShard(shard); } /** * Remove a shard from being managed (via JMX) * * @param shard shard to be removed */ public void removeShard(Integer shard) { if (shardStateManager.getManagedShards().contains(shard)) context.removeShard(shard); } /** * Get list of managed shards (via JMX) * * @return list of managed shards (unmodifiable collection) */ public Collection<Integer> getManagedShards() { return new TreeSet<Integer>(shardStateManager.getManagedShards()); } public synchronized Collection<Integer> getRecentlyScheduledShards() { // note: already sorted when it comes from the context. return context.getRecentlyScheduledShards(); } public synchronized Collection<String> getOldestUnrolledSlotPerGranularity(int shard) { final Set<String> results = new HashSet<String>(); for (Granularity g : Granularity.rollupGranularities()) { final Map<Integer, UpdateStamp> stateTimestamps = context.getSlotStamps(g, shard); if (stateTimestamps == null || stateTimestamps.isEmpty()) { continue; } // Iterate through the map of slot to UpdateStamp and find the oldest one SlotState minSlot = new SlotState().withTimestamp(System.currentTimeMillis()); boolean add = false; for (Map.Entry<Integer, UpdateStamp> entry : stateTimestamps.entrySet()) { final UpdateStamp stamp = entry.getValue(); if (stamp.getState() != UpdateStamp.State.Rolled && stamp.getTimestamp() < minSlot.getTimestamp()) { minSlot = new SlotState(g, entry.getKey(), stamp.getState()).withTimestamp(stamp.getTimestamp()); add = true; } } if (add) { results.add(minSlot.toString()); } } return results; } }