/* * Copyright MapR Technologies, $year * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.mapr; import backtype.storm.spout.SpoutOutputCollector; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.mapr.storm.DirectoryScanner; import com.mapr.storm.PendingMessage; import com.mapr.storm.SpoutState; import com.mapr.storm.streamparser.StreamParser; import com.mapr.storm.streamparser.StreamParserFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.Closeable; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.Serializable; import java.util.List; import java.util.Map; import java.util.Queue; import java.util.regex.Pattern; /** * This is a spout that reads records from files. New records can be appended to * these files while this spout is running. This will cause these new records to * be emitted the next time around the event loop. It is also possible that we * will hit the end of a file of records and notice that a new file has been * started. In that case, we will move to that new file and start emitting the * records we read from that file. * <p/> * There are two modes in which a TailSpout can operate. In unreliable mode, * transactions are read, but ack's and fail's are ignored so that if * transactions are not processed, they are lost. The TailSpout will checkpoint * it's own progress periodically and on deactivation so that when the TailSpout * is restarted, it will start from close to where it left off. The restart will * be exact with a clean shutdown and can be made arbitrarily precise with an * unclean shutdown. * <p/> * In the reliable mode, an in-memory table of unacknowledged transactions is * kept. Any that fail are retransmitted and those that succeed are removed from * the table. When a checkpoint is done, the file name and offset of the * earliest unacknowledged transaction in that file will be retained. On * restart, each of the files with unacknowledged transactions will be replayed * from that point. Usually only one file will be replaced (the current one) or * possibly the current and previous file. This will generally replay a few * extra transactions, but if there is a cool-down period on shutdown, this will * should very few extra transactions transmitted on startup. * <p/> * One possible extension would be to allow the system to run in reliable mode, * but not save the offsets for unacked transactions. This would be good during * orderly shutdowns, but very bad in the event of an unorderly shutdown. */ public class DirectoryObserver implements Serializable, Closeable { private final Logger log = LoggerFactory.getLogger(DirectoryObserver.class); private DirectoryScanner scanner; private FileInputStream currentInput = null; // these are set in the constructors private StreamParserFactory factory; private File statusFile; // all others are created on the fly private Map<Long, PendingMessage> ackBuffer = Maps.newTreeMap(); private long messageId = 0; // how often should we save our state? private long tuplesPerCheckpoint = 100; // time between forced checkpoints in seconds private double checkPointIntervalSeconds = 1.0; private Queue<PendingMessage> pendingReplays = Lists.newLinkedList(); private StreamParser parser = null; private double nextCheckPointTime = 0; public DirectoryObserver(StreamParserFactory factory, File statusFile) throws IOException { this.factory = factory; scanner = SpoutState.restoreState(pendingReplays, statusFile); } public DirectoryObserver(StreamParserFactory factory, File statusFile, File inputDirectory, final Pattern inputFileNamePattern) { this.factory = factory; this.statusFile = statusFile; scanner = new DirectoryScanner(inputDirectory, inputFileNamePattern); } public void setTuplesPerCheckpoint(long tuplesPerCheckpoint) { this.tuplesPerCheckpoint = tuplesPerCheckpoint; } public void setCheckPointIntervalSeconds(double checkPointIntervalSeconds) { this.checkPointIntervalSeconds = checkPointIntervalSeconds; } public void close() { SpoutState.recordCurrentState(ackBuffer, scanner, parser, statusFile); } public Message nextMessage() { if (currentInput == null) { currentInput = openNextInput(); } try { // TODO need to persist current reading state somewhere while (currentInput != null) { // read a record long position = parser.currentOffset(); List<Object> r = parser.nextRecord(); // assert currentInput != null if (r == null) { // reached end of current file // (currentInput != null && r == null) so we enter loop at least // once while (currentInput != null && r == null) { currentInput = openNextInput(); // assert r == null if (currentInput != null) { position = parser.currentOffset(); r = parser.nextRecord(); } // r != null => currentInput != null } // post: r != null iff currentInput != null } // post: (r != null iff currentInput != null) || (r != null) // post: (r == null => currentInput == null) if (r != null) { messageId++; if (messageId % tuplesPerCheckpoint == 0 || System.nanoTime() / 1e9 > nextCheckPointTime) { SpoutState.recordCurrentState(ackBuffer, scanner, parser, statusFile); nextCheckPointTime = System.nanoTime() / 1e9 + checkPointIntervalSeconds; } return new Message(messageId, r); } } } catch (IOException e) { log.error("DirectoryObserver threw I/O exception", e); throw new RuntimeException(e); } // exit only when all files have been processed completely return null; } public static class Message { private final long messageId; private final List<Object> tuple; public Message(long messageId, List<Object> tuple) { this.messageId = messageId; this.tuple = tuple; } public long getMessageId() { return messageId; } public List<Object> getTuple() { return tuple; } } private FileInputStream openNextInput() { PendingMessage next = pendingReplays.poll(); while (next != null) { if (next.getFile().exists()) { return scanner.forceInput(next.getFile(), next.getOffset()); } else { log.error("Replay file {} has disappeared", next.getFile()); } next = pendingReplays.poll(); } // look for a new file FileInputStream r = scanner.nextInput(); parser = factory.createParser(r); return r; } }