package storm.applications.spout; import java.io.File; import java.io.FileNotFoundException; import java.util.Arrays; import java.util.Comparator; import java.util.List; import java.util.Scanner; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import storm.applications.constants.BaseConstants.BaseConf; import storm.applications.spout.parser.Parser; import storm.applications.util.config.ClassLoaderUtils; import storm.applications.util.io.FileUtils; import storm.applications.util.stream.StreamValues; /** * * @author Maycon Viana Bordin <mayconbordin@gmail.com> */ public class FileSpout extends AbstractSpout { private static final Logger LOG = LoggerFactory.getLogger(FileSpout.class); protected Parser parser; protected File[] files; protected Scanner scanner; protected int curFileIndex = 0; protected int curLineIndex = 0; private boolean finished = false; protected int taskId; protected int numTasks; @Override public void initialize() { taskId = context.getThisTaskIndex();//context.getThisTaskId(); numTasks = config.getInt(getConfigKey(BaseConf.SPOUT_THREADS)); String parserClass = config.getString(getConfigKey(BaseConf.SPOUT_PARSER)); parser = (Parser) ClassLoaderUtils.newInstance(parserClass, "parser", LOG); parser.initialize(config); buildIndex(); openNextFile(); } protected void buildIndex() { String path = config.getString(getConfigKey(BaseConf.SPOUT_PATH)); if (StringUtils.isBlank(path)) { LOG.error("The source path has not been set"); throw new RuntimeException("The source path has to beeen set"); } LOG.info("Source path: {}", path); File dir = new File(path); if (!dir.exists()) { LOG.error("The source path {} does not exists", path); throw new RuntimeException("The source path '" + path + "' does not exists"); } if (dir.isDirectory()) { files = dir.listFiles(); } else { files = new File[1]; files[0] = dir; } Arrays.sort(files, new Comparator<File>() { @Override public int compare(File f1, File f2) { int res = f1.lastModified() < f2.lastModified() ? -1 : ( f1.lastModified() > f2.lastModified() ? 1 : 0); return res; } }); LOG.info("Number of files to read: {}", files.length); } @Override public void nextTuple() { String value = readFile(); if (value == null) return; List<StreamValues> tuples = parser.parse(value); if (tuples != null) { for (StreamValues values : tuples) { String msgId = String.format("%d%d", curFileIndex, curLineIndex); collector.emit(values.getStreamId(), values, msgId); } } } protected String readFile() { if (finished) return null; String record = null; if (scanner.hasNextLine()) { record = readLine(); } else { if ((curFileIndex+1) < files.length) { openNextFile(); if (scanner.hasNextLine()) { record = readLine(); } } else { LOG.info("No more files to read"); finished = true; } } return record; } /** * Read one line from the currently open file. If there's only one file, each * instance of the spout will read only a portion of the file. * @return The line */ protected String readLine() { if (files.length == 1) { while (scanner.hasNextLine() && ++curLineIndex % numTasks != taskId) scanner.nextLine(); } if (scanner.hasNextLine()) return scanner.nextLine(); else return null; } /** * Opens the next file from the index. If there's multiple instances of the * spout, it will read only a portion of the files. */ protected void openNextFile() { while ((curFileIndex+1) % numTasks != taskId) { curFileIndex++; } if (curFileIndex < files.length) { try { File file = files[curFileIndex]; scanner = new Scanner(file); curLineIndex = 0; LOG.info("Opened file {}, size {}", file.getName(), FileUtils.humanReadableByteCount(file.length())); } catch (FileNotFoundException ex) { LOG.error(String.format("File %s not found", files[curFileIndex]), ex); throw new IllegalStateException("File not found", ex); } } } }