package com.alibaba.jstorm.hdfs.spout;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.net.URI;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;

import backtype.storm.Config;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.alibaba.jstorm.hdfs.common.HdfsUtils;
import com.alibaba.jstorm.hdfs.common.security.HdfsSecurityUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Spout that emits the records of files found in an HDFS source directory.
 *
 * <p>A file is locked through a lock file in the lock directory, renamed with an
 * in-progress suffix, read record by record, and finally moved to the archive
 * directory. Files that fail to parse are moved to the bad-files directory instead.
 * Read progress is periodically written to the lock file so that an abandoned file
 * can be resumed from its last committed offset.
 *
 * <p>Settings read from the topology configuration in {@link #open}: HDFS URI, reader
 * type, source, archive and bad-files directories are required; lock directory, lock
 * timeout, commit frequencies, max outstanding tuples, clock-sync flag and ignore
 * suffix are optional.
 */
public class HdfsSpout extends BaseRichSpout {

  // user configurable
  private String hdfsUri;            // required
  private String readerType;         // required
  private Fields outputFields;       // required
  private Path sourceDirPath;        // required
  private Path archiveDirPath;       // required
  private Path badFilesDirPath;      // required
  private Path lockDirPath;

  private int commitFrequencyCount = Configs.DEFAULT_COMMIT_FREQ_COUNT;
  private int commitFrequencySec = Configs.DEFAULT_COMMIT_FREQ_SEC;
  private int maxOutstanding = Configs.DEFAULT_MAX_OUTSTANDING;
  private int lockTimeoutSec = Configs.DEFAULT_LOCK_TIMEOUT;
  private boolean clocksInSync = true;

  private String inProgressSuffix = ".inprogress";
  private String ignoreSuffix = ".ignore";

  // other members
  private static final Logger LOG = LoggerFactory.getLogger(HdfsSpout.class);

  private ProgressTracker tracker = null;

  private FileSystem hdfs;
  private FileReader reader;

  private SpoutOutputCollector collector;
  // tuples emitted in ACK mode that have been neither ACKed nor failed yet
  HashMap<MessageId, List<Object>> inflight = new HashMap<>();
  // failed tuples waiting to be re-emitted by nextTuple()
  LinkedBlockingQueue<HdfsUtils.Pair<MessageId, List<Object>>> retryList = new LinkedBlockingQueue<>();

  private Configuration hdfsConfig;

  private Map conf = null;
  private FileLock lock;
  private String spoutId = null;

  // used only when clocks are not in sync: the expired lock noted on the previous attempt
  HdfsUtils.Pair<Path, FileLock.LogEntry> lastExpiredLock = null;
  private long lastExpiredLockTime = 0;

  private long tupleCounter = 0;
  private boolean ackEnabled = false;
  private int acksSinceLastCommit = 0;
  private final AtomicBoolean commitTimeElapsed = new AtomicBoolean(false);
  private Timer commitTimer;
  private boolean fileReadCompletely = true;

  private String configKey = Configs.DEFAULT_HDFS_CONFIG_KEY; // key for hdfs Kerberos configs

  public HdfsSpout() {
  }

  /** Name of the output field names. Number of fields depends upon the reader type */
  public HdfsSpout withOutputFields(String... fields) {
    outputFields = new Fields(fields);
    return this;
  }

  /**
   * Set key name under which HDFS options are placed. (similar to HDFS bolt).
   * Default key name is 'hdfs.config'.
   */
  public HdfsSpout withConfigKey(String configKey) {
    this.configKey = configKey;
    return this;
  }

  public Path getLockDirPath() {
    return lockDirPath;
  }

  public SpoutOutputCollector getCollector() {
    return collector;
  }

  /**
   * Emits at most one tuple: a previously failed tuple if one is queued, otherwise the
   * next record of the current file (selecting a new file when none is open).
   * Returns without emitting when the outstanding-tuple limit is reached, when no file
   * is available, when the current file is waiting for ACKs, or on an I/O error.
   */
  @Override
  public void nextTuple() {
    LOG.trace("Next Tuple {}", spoutId);
    // 1) First re-emit any previously failed tuples (from retryList)
    if (!retryList.isEmpty()) {
      LOG.debug("Sending tuple from retry list");
      HdfsUtils.Pair<MessageId, List<Object>> pair = retryList.remove();
      emitData(pair.getValue(), pair.getKey());
      return;
    }

    if (ackEnabled && tracker.size() >= maxOutstanding) {
      LOG.warn("Waiting for more ACKs before generating new tuples. "
          + "Progress tracker size has reached limit {}, SpoutID {}", maxOutstanding, spoutId);
      // Don't emit anything .. allow configured spout wait strategy to kick in
      return;
    }

    // 2) If no failed tuples to be retried, then send tuples from hdfs
    while (true) {
      try {
        // 3) Select a new file if one is not open already
        if (reader == null) {
          reader = pickNextFile();
          if (reader == null) {
            LOG.debug("Currently no new files to process under : {}", sourceDirPath);
            return;
          }
          fileReadCompletely = false;
        }
        if (fileReadCompletely) { // wait for more ACKs before proceeding
          return;
        }

        // 4) Read record from file, emit to collector and record progress
        List<Object> tuple = reader.next();
        if (tuple != null) {
          fileReadCompletely = false;
          ++tupleCounter;
          MessageId msgId = new MessageId(tupleCounter, reader.getFilePath(), reader.getFileOffset());
          emitData(tuple, msgId);

          if (!ackEnabled) {
            ++acksSinceLastCommit; // assume message is immediately ACKed in non-ack mode
            commitProgress(reader.getFileOffset());
          } else {
            commitProgress(tracker.getCommitPosition());
          }
          return;
        }

        fileReadCompletely = true;
        // In ACK mode the file is normally completed from ack(). If nothing is in flight
        // at this point (e.g. an empty file) no further ACK will arrive, so complete the
        // file here; otherwise the spout would wait on this file forever.
        if (!ackEnabled || inflight.isEmpty()) {
          markFileAsDone(reader.getFilePath());
        }
      } catch (IOException e) {
        LOG.error("I/O Error processing at file location " + getFileProgress(reader), e);
        // don't emit anything .. allow configured spout wait strategy to kick in
        return;
      } catch (ParseException e) {
        LOG.error("Parsing error when processing at file location " + getFileProgress(reader)
            + ". Skipping remainder of file.", e);
        markFileAsBad(reader.getFilePath());
        // Note: We don't return from this method on ParseException to avoid triggering the
        // spout wait strategy (due to no emits). Instead we go back into the loop and
        // generate a tuple from next file
      }
    } // while
  }

  /** Writes {@code position} into the lock file if a lock is held and a commit threshold is reached. */
  private void commitProgress(FileOffset position) {
    if (position == null || lock == null || !canCommitNow()) {
      return;
    }
    try {
      String pos = position.toString();
      lock.heartbeat(pos);
      LOG.debug("{} Committed progress. {}", spoutId, pos);
      acksSinceLastCommit = 0;
      commitTimeElapsed.set(false);
      setupCommitElapseTimer();
    } catch (IOException e) {
      LOG.error("Unable to commit progress Will retry later. Spout ID = " + spoutId, e);
    }
  }

  /** Schedules a one-shot task that raises the "commit time elapsed" flag after commitFrequencySec. */
  private void setupCommitElapseTimer() {
    if (commitFrequencySec <= 0 || commitTimer == null) {
      return;
    }
    TimerTask timerTask = new TimerTask() {
      @Override
      public void run() {
        commitTimeElapsed.set(true);
      }
    };
    // long arithmetic: the int product overflows for large second counts
    commitTimer.schedule(timerTask, commitFrequencySec * 1000L);
  }

  private static String getFileProgress(FileReader reader) {
    return reader.getFilePath() + " " + reader.getFileOffset();
  }

  /** Moves the completed file to the archive directory, then closes the reader and releases the lock. */
  private void markFileAsDone(Path filePath) {
    try {
      Path newFile = renameCompletedFile(filePath);
      LOG.info("Completed processing {}. Spout Id = {}", newFile, spoutId);
    } catch (IOException e) {
      LOG.error("Unable to archive completed file" + filePath + " Spout ID " + spoutId, e);
    }
    closeReaderAndResetTrackers();
  }

  /** Moves the file to the bad-files directory, then closes the reader and releases the lock. */
  private void markFileAsBad(Path file) {
    String originalName = new Path(stripInProgressSuffix(file.toString())).getName();
    Path newFile = new Path(badFilesDirPath + Path.SEPARATOR + originalName);

    LOG.info("Moving bad file {} to {}. Processed it till offset {}. SpoutID= {}",
        originalName, newFile, tracker.getCommitPosition(), spoutId);
    try {
      if (!hdfs.rename(file, newFile)) { // seems this can fail by returning false or throwing exception
        throw new IOException("Move failed for bad file: " + file); // convert false ret value to exception
      }
    } catch (IOException e) {
      LOG.warn("Error moving bad file: " + file + " to destination " + newFile + " SpoutId =" + spoutId, e);
    }
    closeReaderAndResetTrackers();
  }

  /**
   * Returns {@code fileName} without a trailing in-progress suffix. A name that does not
   * end with the suffix is returned unchanged (a resumed abandoned file may not carry it).
   */
  private String stripInProgressSuffix(String fileName) {
    if (!fileName.endsWith(inProgressSuffix)) {
      return fileName;
    }
    return fileName.substring(0, fileName.length() - inProgressSuffix.length());
  }

  private void closeReaderAndResetTrackers() {
    inflight.clear();
    tracker.offsets.clear();
    retryList.clear();

    reader.close();
    reader = null;
    releaseLockAndLog(lock, spoutId);
    lock = null;
  }

  private static void releaseLockAndLog(FileLock fLock, String spoutId) {
    if (fLock == null) {
      return;
    }
    try {
      fLock.release();
      LOG.debug("Spout {} released FileLock. SpoutId = {}", fLock.getLockFile(), spoutId);
    } catch (IOException e) {
      LOG.error("Unable to delete lock file : " + fLock.getLockFile() + " SpoutId =" + spoutId, e);
    }
  }

  /** Emits the tuple anchored to {@code id}; in ACK mode also remembers it until it is ACKed or failed. */
  protected void emitData(List<Object> tuple, MessageId id) {
    LOG.trace("Emitting - {}", id);
    this.collector.emit(tuple, id);
    // Without ACKing nothing ever removes individual entries, so tracking them would
    // only accumulate every tuple of the file in memory.
    if (ackEnabled) {
      inflight.put(id, tuple);
    }
  }

  /**
   * Reads and validates the spout settings from {@code conf}, connects to HDFS and
   * creates the archive, bad-files and lock directories if they do not exist.
   *
   * @throws RuntimeException if a required setting is missing, a directory cannot be
   *     validated or created, the file system cannot be instantiated or login fails
   * @throws IllegalArgumentException if the configured reader type is not usable
   */
  @Override
  public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
    LOG.info("Opening HDFS Spout");
    this.conf = conf;
    this.commitTimer = new Timer();
    this.tracker = new ProgressTracker();
    this.hdfsConfig = new Configuration();
    this.collector = collector;
    this.tupleCounter = 0;

    // Hdfs related settings
    if (!conf.containsKey(Configs.HDFS_URI)) {
      throw new RuntimeException(Configs.HDFS_URI + " setting is required");
    }
    this.hdfsUri = conf.get(Configs.HDFS_URI).toString();

    // NOTE(review): the file system is obtained before the overrides below are applied
    // to hdfsConfig and before login - confirm this ordering is intended.
    try {
      this.hdfs = FileSystem.get(URI.create(hdfsUri), hdfsConfig);
    } catch (IOException e) {
      LOG.error("Unable to instantiate file system", e);
      throw new RuntimeException("Unable to instantiate file system", e);
    }

    if (conf.containsKey(configKey)) {
      @SuppressWarnings("unchecked") // the value under configKey is expected to be a String-keyed map
      Map<String, Object> map = (Map<String, Object>) conf.get(configKey);
      if (map != null) {
        for (Map.Entry<String, Object> entry : map.entrySet()) {
          String value = String.valueOf(entry.getValue());
          LOG.info("HDFS Config override : {} = {} ", entry.getKey(), value);
          this.hdfsConfig.set(entry.getKey(), value);
        }
        try {
          HdfsSecurityUtil.login(conf, hdfsConfig);
        } catch (IOException e) {
          LOG.error("HDFS Login failed ", e);
          throw new RuntimeException(e);
        }
      } // if(map != null)
    }

    // Reader type config. Fail fast here: a missing reader type would otherwise only
    // surface as a NullPointerException once the first file has already been renamed.
    if (!conf.containsKey(Configs.READER_TYPE)) {
      LOG.error(Configs.READER_TYPE + " setting is required");
      throw new RuntimeException(Configs.READER_TYPE + " setting is required");
    }
    readerType = conf.get(Configs.READER_TYPE).toString();
    checkValidReader(readerType);

    // -- source dir config
    if (!conf.containsKey(Configs.SOURCE_DIR)) {
      LOG.error(Configs.SOURCE_DIR + " setting is required");
      throw new RuntimeException(Configs.SOURCE_DIR + " setting is required");
    }
    this.sourceDirPath = new Path(conf.get(Configs.SOURCE_DIR).toString());

    // -- archive dir config
    if (!conf.containsKey(Configs.ARCHIVE_DIR)) {
      LOG.error(Configs.ARCHIVE_DIR + " setting is required");
      throw new RuntimeException(Configs.ARCHIVE_DIR + " setting is required");
    }
    this.archiveDirPath = new Path(conf.get(Configs.ARCHIVE_DIR).toString());
    validateOrMakeDir(hdfs, archiveDirPath, "Archive");

    // -- bad files dir config
    if (!conf.containsKey(Configs.BAD_DIR)) {
      LOG.error(Configs.BAD_DIR + " setting is required");
      throw new RuntimeException(Configs.BAD_DIR + " setting is required");
    }
    this.badFilesDirPath = new Path(conf.get(Configs.BAD_DIR).toString());
    validateOrMakeDir(hdfs, badFilesDirPath, "bad files");

    // -- ignore file names config
    if (conf.containsKey(Configs.IGNORE_SUFFIX)) {
      this.ignoreSuffix = conf.get(Configs.IGNORE_SUFFIX).toString();
    }

    // -- lock dir config
    String lockDir = conf.containsKey(Configs.LOCK_DIR)
        ? conf.get(Configs.LOCK_DIR).toString()
        : getDefaultLockDir(sourceDirPath);
    this.lockDirPath = new Path(lockDir);
    validateOrMakeDir(hdfs, lockDirPath, "locks");

    // -- lock timeout
    if (conf.get(Configs.LOCK_TIMEOUT) != null) {
      this.lockTimeoutSec = Integer.parseInt(conf.get(Configs.LOCK_TIMEOUT).toString());
    }

    // -- enable/disable ACKing
    Object ackers = conf.get(Config.TOPOLOGY_ACKER_EXECUTORS);
    if (ackers != null) {
      int ackerCount = Integer.parseInt(ackers.toString());
      this.ackEnabled = (ackerCount > 0);
      LOG.debug("ACKer count = {}", ackerCount);
    } else { // ackers==null when ackerCount not explicitly set on the topology
      this.ackEnabled = true;
      LOG.debug("ACK count not explicitly set on topology.");
    }

    LOG.info("ACK mode is {}", ackEnabled ? "enabled" : "disabled");

    // -- commit frequency - count
    if (conf.get(Configs.COMMIT_FREQ_COUNT) != null) {
      commitFrequencyCount = Integer.parseInt(conf.get(Configs.COMMIT_FREQ_COUNT).toString());
    }

    // -- commit frequency - seconds
    if (conf.get(Configs.COMMIT_FREQ_SEC) != null) {
      commitFrequencySec = Integer.parseInt(conf.get(Configs.COMMIT_FREQ_SEC).toString());
      if (commitFrequencySec <= 0) {
        throw new RuntimeException(Configs.COMMIT_FREQ_SEC + " setting must be greater than 0");
      }
    }

    // -- max outstanding tuples
    if (conf.get(Configs.MAX_OUTSTANDING) != null) {
      maxOutstanding = Integer.parseInt(conf.get(Configs.MAX_OUTSTANDING).toString());
    }

    // -- clocks in sync
    if (conf.get(Configs.CLOCKS_INSYNC) != null) {
      clocksInSync = Boolean.parseBoolean(conf.get(Configs.CLOCKS_INSYNC).toString());
    }

    // -- spout id
    spoutId = context.getThisComponentId();

    // setup timer for commit elapse time tracking
    setupCommitElapseTimer();
  }

  /**
   * Stops the commit timer so its thread does not outlive the spout. The file lock is
   * deliberately left in place: the file stays in progress and is resumed through the
   * expired-lock path.
   */
  @Override
  public void close() {
    if (commitTimer != null) {
      commitTimer.cancel();
      commitTimer = null;
    }
    super.close();
  }

  /** Ensures {@code dir} exists as a directory, creating it if needed; throws RuntimeException otherwise. */
  private static void validateOrMakeDir(FileSystem fs, Path dir, String dirDescription) {
    try {
      if (fs.exists(dir)) {
        if (!fs.isDirectory(dir)) {
          LOG.error(dirDescription + " directory is a file, not a dir. " + dir);
          throw new RuntimeException(dirDescription + " directory is a file, not a dir. " + dir);
        }
      } else if (!fs.mkdirs(dir)) {
        LOG.error("Unable to create " + dirDescription + " directory " + dir);
        throw new RuntimeException("Unable to create " + dirDescription + " directory " + dir);
      }
    } catch (IOException e) {
      LOG.error("Unable to create " + dirDescription + " directory " + dir, e);
      throw new RuntimeException("Unable to create " + dirDescription + " directory " + dir, e);
    }
  }

  private String getDefaultLockDir(Path sourceDirPath) {
    return sourceDirPath.toString() + Path.SEPARATOR + Configs.DEFAULT_LOCK_DIR;
  }

  /**
   * Accepts the built-in text and sequence-file reader types; any other value must name
   * a class on the classpath with a (FileSystem, Path, Map) constructor.
   *
   * @throws IllegalArgumentException if the class or the constructor cannot be found
   */
  private static void checkValidReader(String readerType) {
    if (readerType.equalsIgnoreCase(Configs.TEXT) || readerType.equalsIgnoreCase(Configs.SEQ)) {
      return;
    }
    try {
      Class<?> classType = Class.forName(readerType);
      classType.getConstructor(FileSystem.class, Path.class, Map.class);
    } catch (ClassNotFoundException e) {
      LOG.error(readerType + " not found in classpath.", e);
      throw new IllegalArgumentException(readerType + " not found in classpath.", e);
    } catch (NoSuchMethodException e) {
      LOG.error(readerType + " is missing the expected constructor for Readers.", e);
      throw new IllegalArgumentException(readerType + " is missing the expected constructor for Readers.", e);
    }
  }

  /**
   * Records the ACKed offset, commits progress if due, and archives the current file
   * once it has been read completely and no tuples remain in flight.
   */
  @Override
  public void ack(Object msgId) {
    LOG.trace("Ack received for msg {} on spout {}", msgId, spoutId);
    if (!ackEnabled) {
      return;
    }
    MessageId id = (MessageId) msgId;
    inflight.remove(id);
    ++acksSinceLastCommit;
    tracker.recordAckedOffset(id.offset);
    commitProgress(tracker.getCommitPosition());
    // reader is null once the file has been closed; a late ACK must not dereference it
    if (reader != null && fileReadCompletely && inflight.isEmpty()) {
      markFileAsDone(reader.getFilePath()); // also closes the reader and sets it to null
    }
    super.ack(msgId);
  }

  private boolean canCommitNow() {
    if (commitFrequencyCount > 0 && acksSinceLastCommit >= commitFrequencyCount) {
      return true;
    }
    return commitTimeElapsed.get();
  }

  /** Queues the failed tuple for re-emission by the next {@link #nextTuple()} call (ACK mode only). */
  @Override
  public void fail(Object msgId) {
    LOG.trace("Fail received for msg id {} on spout {}", msgId, spoutId);
    super.fail(msgId);
    if (!ackEnabled) {
      return;
    }
    MessageId id = (MessageId) msgId;
    List<Object> tuple = inflight.remove(id);
    if (tuple == null) {
      // Not in flight any more (e.g. its file was already closed and the trackers reset).
      // Queuing it would make nextTuple() emit a null tuple.
      LOG.debug("Ignoring fail for msg id {} which is not in flight. SpoutId = {}", msgId, spoutId);
      return;
    }
    HdfsUtils.Pair<MessageId, List<Object>> item = HdfsUtils.Pair.of(id, tuple);
    retryList.add(item);
  }

  /**
   * Selects the next file to read: first an abandoned file whose lock has expired,
   * otherwise the first lockable file from the source directory listing.
   *
   * @return a reader for the selected file, or null if no file could be selected
   */
  private FileReader pickNextFile() {
    try {
      // 1) If there are any abandoned files, pick oldest one
      lock = getOldestExpiredLock();
      if (lock != null) {
        LOG.debug("Spout {} now took over ownership of abandoned FileLock {}", spoutId, lock.getLockFile());
        Path file = getFileForLockFile(lock.getLockFile(), sourceDirPath);
        if (file != null) {
          String resumeFromOffset = lock.getLastLogEntry().fileOffset;
          LOG.info("Resuming processing of abandoned file : {}", file);
          return createFileReader(file, resumeFromOffset);
        }
        // The data file for this lock no longer exists; drop the stale lock and fall
        // through to the regular directory scan.
        LOG.warn("No data file found under {} for abandoned FileLock {}. Releasing it. SpoutId = {}",
            sourceDirPath, lock.getLockFile(), spoutId);
        releaseLockAndLog(lock, spoutId);
        lock = null;
      }

      // 2) If no abandoned files, then pick oldest file in sourceDirPath, lock it and rename it
      Collection<Path> listing = HdfsUtils.listFilesByModificationTime(hdfs, sourceDirPath, 0);

      for (Path file : listing) {
        String name = file.getName();
        if (name.endsWith(inProgressSuffix) || name.endsWith(ignoreSuffix)) {
          continue;
        }

        lock = FileLock.tryLock(hdfs, file, lockDirPath, spoutId);
        if (lock == null) {
          LOG.debug("Unable to get FileLock for {}, so skipping it.", file);
          continue; // could not lock, so try another file.
        }
        try {
          Path newFile = renameToInProgressFile(file);
          FileReader result = createFileReader(newFile);
          LOG.info("Processing : {} ", file);
          return result;
        } catch (Exception e) {
          LOG.error("Skipping file " + file, e);
          releaseLockAndLog(lock, spoutId);
          lock = null; // do not keep a reference to a released lock
        }
      }

      return null;
    } catch (IOException e) {
      LOG.error("Unable to select next file for consumption " + sourceDirPath, e);
      return null;
    }
  }

  /**
   * If clocks in sync, then acquires the oldest expired lock.
   * Else, on first call, just remembers the oldest expired lock, on next call check if the lock is updated.
   * If not updated then acquires the lock.
   *
   * @return a lock object, or null if no lock was acquired
   * @throws IOException if a lock operation fails
   */
  private FileLock getOldestExpiredLock() throws IOException {
    // 1 - acquire lock on dir
    DirLock dirlock = DirLock.tryLock(hdfs, lockDirPath);
    if (dirlock == null) {
      dirlock = DirLock.takeOwnershipIfStale(hdfs, lockDirPath, lockTimeoutSec);
      if (dirlock == null) {
        LOG.debug("Spout {} could not take over ownership of DirLock for {}", spoutId, lockDirPath);
        return null;
      }
      LOG.debug("Spout {} now took over ownership of abandoned DirLock for {}", spoutId, lockDirPath);
    } else {
      LOG.debug("Spout {} now owns DirLock for {}", spoutId, lockDirPath);
    }

    try {
      // 2 - if clocks are in sync then simply take ownership of the oldest expired lock
      if (clocksInSync) {
        return FileLock.acquireOldestExpiredLock(hdfs, lockDirPath, lockTimeoutSec, spoutId);
      }

      // 3 - if clocks are not in sync ..
      if (lastExpiredLock == null) {
        // just make a note of the oldest expired lock now and check if its still unmodified after lockTimeoutSec
        lastExpiredLock = FileLock.locateOldestExpiredLock(hdfs, lockDirPath, lockTimeoutSec);
        lastExpiredLockTime = System.currentTimeMillis();
        return null;
      }
      // keep waiting until lockTimeoutSec has elapsed since we last selected the lock file
      if (isWithinLockTimeout(lastExpiredLockTime)) {
        return null;
      }

      // If lock file is unchanged since it was noted, then own it
      FileLock.LogEntry lastEntry = FileLock.getLastEntry(hdfs, lastExpiredLock.getKey());
      if (lastEntry != null && lastEntry.equals(lastExpiredLock.getValue())) {
        FileLock result = FileLock.takeOwnership(hdfs, lastExpiredLock.getKey(), lastEntry, spoutId);
        lastExpiredLock = null;
        return result;
      }
      // if lock file has been updated since last time, then leave this lock file alone
      lastExpiredLock = null;
      return null;
    } finally {
      dirlock.release();
      LOG.debug("Released DirLock {}, SpoutID {} ", dirlock.getLockFile(), spoutId);
    }
  }

  /** Returns true while less than lockTimeoutSec has passed since {@code sinceMillis}. */
  private boolean isWithinLockTimeout(long sinceMillis) {
    return (System.currentTimeMillis() - sinceMillis) < lockTimeoutSec * 1000L;
  }

  /**
   * Creates a reader that reads from beginning of file.
   *
   * @param file file to read
   * @return a reader of the configured reader type
   * @throws IOException declared for the built-in reader constructors
   */
  private FileReader createFileReader(Path file) throws IOException {
    if (readerType.equalsIgnoreCase(Configs.SEQ)) {
      return new SequenceFileReader(this.hdfs, file, conf);
    }
    if (readerType.equalsIgnoreCase(Configs.TEXT)) {
      return new TextFileReader(this.hdfs, file, conf);
    }
    try {
      Class<?> clsType = Class.forName(readerType);
      Constructor<?> constructor = clsType.getConstructor(FileSystem.class, Path.class, Map.class);
      return (FileReader) constructor.newInstance(this.hdfs, file, conf);
    } catch (Exception e) {
      LOG.error(e.getMessage(), e);
      throw new RuntimeException("Unable to instantiate " + readerType + " reader", e);
    }
  }

  /**
   * Creates a reader that starts reading from 'offset'.
   *
   * @param file the file to read
   * @param offset the offset string should be understandable by the reader type being used
   * @return a reader of the configured reader type
   * @throws IOException declared for the built-in reader constructors
   */
  private FileReader createFileReader(Path file, String offset) throws IOException {
    if (readerType.equalsIgnoreCase(Configs.SEQ)) {
      return new SequenceFileReader(this.hdfs, file, conf, offset);
    }
    if (readerType.equalsIgnoreCase(Configs.TEXT)) {
      return new TextFileReader(this.hdfs, file, conf, offset);
    }
    try {
      Class<?> clsType = Class.forName(readerType);
      Constructor<?> constructor =
          clsType.getConstructor(FileSystem.class, Path.class, Map.class, String.class);
      return (FileReader) constructor.newInstance(this.hdfs, file, conf, offset);
    } catch (Exception e) {
      LOG.error(e.getMessage(), e);
      throw new RuntimeException("Unable to instantiate " + readerType, e);
    }
  }

  /**
   * Renames files with .inprogress suffix.
   *
   * @return path of renamed file
   * @throws IOException (a RenameException) if operation fails
   */
  private Path renameToInProgressFile(Path file) throws IOException {
    Path newFile = new Path(file.toString() + inProgressSuffix);
    boolean renamed;
    try {
      renamed = hdfs.rename(file, newFile);
    } catch (IOException e) {
      throw new RenameException(file, newFile, e);
    }
    // Checked outside the try so this RenameException (itself an IOException) is not
    // caught above and wrapped a second time.
    if (!renamed) {
      throw new RenameException(file, newFile);
    }
    return newFile;
  }

  /**
   * Returns the corresponding input file in the 'sourceDirPath' for the specified lock file.
   * If no such file is found then returns null.
   */
  private Path getFileForLockFile(Path lockFile, Path sourceDirPath) throws IOException {
    String lockFileName = lockFile.getName();
    Path dataFile = new Path(sourceDirPath + Path.SEPARATOR + lockFileName + inProgressSuffix);
    if (hdfs.exists(dataFile)) {
      return dataFile;
    }
    dataFile = new Path(sourceDirPath + Path.SEPARATOR + lockFileName);
    if (hdfs.exists(dataFile)) {
      return dataFile;
    }
    return null;
  }

  /** Moves the file into the archive directory under its original name and returns the new path. */
  private Path renameCompletedFile(Path file) throws IOException {
    String fileNameMinusSuffix = stripInProgressSuffix(file.toString());
    String newName = new Path(fileNameMinusSuffix).getName();

    Path newFile = new Path(archiveDirPath + Path.SEPARATOR + newName);
    LOG.info("Completed consuming file {}", fileNameMinusSuffix);
    if (!hdfs.rename(file, newFile)) {
      throw new IOException("Rename failed for file: " + file);
    }
    LOG.debug("Renamed file {} to {} ", file, newFile);
    return newFile;
  }

  @Override
  public void declareOutputFields(OutputFieldsDeclarer declarer) {
    declarer.declare(outputFields);
  }

  /** Identifies an emitted tuple by emission order, source file and offset within that file. */
  static class MessageId implements Comparable<MessageId> {
    public long msgNumber; // tracks order in which msg came in
    public String fullPath;
    public FileOffset offset;

    public MessageId(long msgNumber, Path fullPath, FileOffset offset) {
      this.msgNumber = msgNumber;
      this.fullPath = fullPath.toString();
      this.offset = offset;
    }

    @Override
    public String toString() {
      return "{'" + fullPath + "':" + offset + "}";
    }

    /** Orders message ids by the order in which they were emitted. */
    @Override
    public int compareTo(MessageId rhs) {
      return Long.compare(msgNumber, rhs.msgNumber);
    }
  }

  /** Signals that renaming {@code oldFile} to {@code newFile} failed. */
  private static class RenameException extends IOException {
    public final Path oldFile;
    public final Path newFile;

    public RenameException(Path oldFile, Path newFile) {
      super("Rename of " + oldFile + " to " + newFile + " failed");
      this.oldFile = oldFile;
      this.newFile = newFile;
    }

    public RenameException(Path oldFile, Path newFile, IOException cause) {
      super("Rename of " + oldFile + " to " + newFile + " failed", cause);
      this.oldFile = oldFile;
      this.newFile = newFile;
    }
  }
}