/* * Copyright MapR Technologies, $year * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.mapr; import backtype.storm.spout.SpoutOutputCollector; import backtype.storm.task.TopologyContext; import backtype.storm.topology.OutputFieldsDeclarer; import backtype.storm.topology.base.BaseRichSpout; import backtype.storm.tuple.Fields; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.mapr.storm.DirectoryScanner; import com.mapr.storm.PendingMessage; import com.mapr.storm.SpoutState; import com.mapr.storm.streamparser.StreamParser; import com.mapr.storm.streamparser.StreamParserFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Queue; import java.util.concurrent.atomic.AtomicBoolean; import java.util.regex.Pattern; /** * This is a spout that reads records from files. New records can be appended to * these files while this spout is running. This will cause these new records to * be emitted the next time around the event loop. It is also possible that we * will hit the end of a file of records and notice that a new file has been * started. In that case, we will move to that new file and start emitting the * records we read from that file. * <p/> * There are two modes in which a TailSpout can operate. In unreliable mode, * transactions are read, but ack's and fail's are ignored so that if * transactions are not processed, they are lost. The TailSpout will checkpoint * it's own progress periodically and on deactivation so that when the TailSpout * is restarted, it will start from close to where it left off. The restart will * be exact with a clean shutdown and can be made arbitrarily precise with an * unclean shutdown. * <p/> * In the reliable mode, an in-memory table of unacknowledged transactions is * kept. Any that fail are retransmitted and those that succeed are removed from * the table. When a checkpoint is done, the file name and offset of the * earliest unacknowledged transaction in that file will be retained. On * restart, each of the files with unacknowledged transactions will be replayed * from that point. Usually only one file will be replaced (the current one) or * possibly the current and previous file. This will generally replay a few * extra transactions, but if there is a cool-down period on shutdown, this will * should very few extra transactions transmitted on startup. * <p/> * One possible extension would be to allow the system to run in reliable mode, * but not save the offsets for unacked transactions. This would be good during * orderly shutdowns, but very bad in the event of an unorderly shutdown. */ public class TailSpout extends BaseRichSpout { private static final long serialVersionUID = -3911379881516049619L; private final Logger log = LoggerFactory.getLogger(TailSpout.class); private DirectoryScanner scanner; private FileInputStream currentInput = null; private AtomicBoolean replayFailedMessages = new AtomicBoolean(true); // these are set in the constructors private StreamParserFactory factory; private File statusFile; // all others are created on the fly private Map<Long, PendingMessage> ackBuffer = Maps.newTreeMap(); private long messageId = 0; // how often should we save our state? private long tuplesPerCheckpoint = 100; // time between forced checkpoints in seconds private double checkPointIntervalSeconds = 1.0; private Queue<PendingMessage> pendingReplays = Lists.newLinkedList(); private StreamParser parser = null; private SpoutOutputCollector collector; private double nextCheckPointTime = 0; public TailSpout(StreamParserFactory factory, File statusFile) throws IOException { this.factory = factory; scanner = SpoutState.restoreState(pendingReplays, statusFile); } public TailSpout(StreamParserFactory factory, File statusFile, File inputDirectory, final Pattern inputFileNamePattern) throws IOException { this.factory = factory; this.statusFile = statusFile; if (statusFile.exists() && statusFile.length() > 0) { scanner = SpoutState.restoreState(pendingReplays, statusFile); } else { scanner = new DirectoryScanner(inputDirectory, inputFileNamePattern); } } public boolean setReliableMode(boolean replayFailedMessages) { return this.replayFailedMessages.getAndSet(replayFailedMessages); } public void setTuplesPerCheckpoint(long tuplesPerCheckpoint) { this.tuplesPerCheckpoint = tuplesPerCheckpoint; } public void setCheckPointIntervalSeconds(double checkPointIntervalSeconds) { this.checkPointIntervalSeconds = checkPointIntervalSeconds; } @Override public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) { outputFieldsDeclarer.declare(new Fields(factory.getOutputFields())); } @SuppressWarnings("rawtypes") @Override public void open(Map map, TopologyContext context, SpoutOutputCollector collector) { this.collector = collector; } @Override public void close() { SpoutState.recordCurrentState(ackBuffer, scanner, parser, statusFile); } @Override public void deactivate() { SpoutState.recordCurrentState(ackBuffer, scanner, parser, statusFile); } @Override public void nextTuple() { if (currentInput == null) { currentInput = openNextInput(); } try { if (currentInput != null) { // read a record long position = parser.currentOffset(); List<Object> r = parser.nextRecord(); // assert currentInput != null if (r == null) { // reached end of current file. Try just once to // open next input (if it exists) and read again. // if we still get nothing, we stay with the current // file currentInput = openNextInput(); // assert r == null if (currentInput != null) { position = parser.currentOffset(); r = parser.nextRecord(); } } if (r != null) { if (replayFailedMessages.get()) { collector.emit(r, messageId); ackBuffer.put(messageId, new PendingMessage(scanner.getLiveFile(), position, r)); messageId++; } else { collector.emit(r); } // persist current reading state if it is time if (messageId % tuplesPerCheckpoint == 0 || System.nanoTime() / 1e9 > nextCheckPointTime) { SpoutState.recordCurrentState(ackBuffer, scanner, parser, statusFile); nextCheckPointTime = System.nanoTime() / 1e9 + checkPointIntervalSeconds; } } } // exit after emitting a tuple or when end of current input is found } catch (IOException e) { throw new RuntimeException(e); } } private FileInputStream openNextInput() { PendingMessage next = pendingReplays.poll(); while (next != null) { if (next.getFile().exists()) { return scanner.forceInput(next.getFile(), next.getOffset()); } else { log.error("Replay file {} has disappeared", next.getFile()); } next = pendingReplays.poll(); } // look for a new file FileInputStream r = scanner.nextInput(); if (r != null) { parser = factory.createParser(r); return r; } else { return currentInput; } } @Override public void ack(Object messageId) { if (messageId instanceof Long) { if (ackBuffer.remove(messageId) == null) { log.error("Unknown messageId {}", messageId); } } else { log.error("Incorrect message id {}", messageId); } } @Override public void fail(Object messageId) { if (messageId instanceof Long) { final PendingMessage message = ackBuffer.get(messageId); if (message != null) { collector.emit(message.getTuple(), messageId); } else { log.error("Unknown message id {}", messageId); } } else { log.error("Incorrect message id {}", messageId); } } }