/**
 * Licensed to DigitalPebble Ltd under one or more contributor license agreements.
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership. DigitalPebble licenses this file to You under
 * the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.spout;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.storm.spout.Scheme;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;

/**
 * Reads the lines from a UTF-8 file and uses them as a spout. Loads the entire
 * content into memory.
 */
@SuppressWarnings("serial")
public class FileSpout extends BaseRichSpout {

    public static final int BATCH_SIZE = 10000;

    public static final Logger LOG = LoggerFactory.getLogger(FileSpout.class);

    private SpoutOutputCollector _collector;
    private Queue<String> _inputFiles;
    private BufferedReader currentBuffer;
    private Scheme _scheme;
    private LinkedList<byte[]> buffer = new LinkedList<>();
    private boolean active;

    public FileSpout(String dir, String filter, Scheme scheme) {
        Path pdir = Paths.get(dir);
        _inputFiles = new LinkedList<>();
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(pdir, filter)) {
            for (Path entry : stream) {
                String inputFile = entry.toAbsolutePath().toString();
                _inputFiles.add(inputFile);
                LOG.info("Input : {}", inputFile);
            }
        } catch (IOException ioe) {
            LOG.error("IOException while listing input files", ioe);
        }
        _scheme = scheme;
    }

    public FileSpout(String file, Scheme scheme) {
        this(scheme, file);
    }

    public FileSpout(Scheme scheme, String... files) {
        if (files.length == 0) {
            throw new IllegalArgumentException("Must configure at least one inputFile");
        }
        _scheme = scheme;
        _inputFiles = new LinkedList<>();
        for (String f : files) {
            _inputFiles.add(f);
        }
    }

    private void populateBuffer() throws IOException {
        if (currentBuffer == null) {
            String file = _inputFiles.poll();
            if (file == null) {
                return;
            }
            Path inputPath = Paths.get(file);
            // read with an explicit UTF-8 charset, as promised by the class Javadoc,
            // instead of the platform default charset used by FileReader
            currentBuffer = Files.newBufferedReader(inputPath, StandardCharsets.UTF_8);
        }

        // no more files to read from
        if (currentBuffer == null) {
            return;
        }

        String line = null;
        int linesRead = 0;
        while (linesRead < BATCH_SIZE && (line = currentBuffer.readLine()) != null) {
            // skip blank lines and comments
            if (StringUtils.isBlank(line)) {
                continue;
            }
            if (line.startsWith("#")) {
                continue;
            }
            buffer.add(line.trim().getBytes(StandardCharsets.UTF_8));
            linesRead++;
        }

        // finished the file?
        if (line == null) {
            currentBuffer.close();
            currentBuffer = null;
        }
    }

    @SuppressWarnings("rawtypes")
    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        _collector = collector;
        try {
            populateBuffer();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public void nextTuple() {
        if (!active) {
            return;
        }
        if (buffer.isEmpty()) {
            try {
                populateBuffer();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        // still empty?
        if (buffer.isEmpty()) {
            return;
        }
        byte[] head = buffer.removeFirst();
        List<Object> fields = this._scheme.deserialize(ByteBuffer.wrap(head));
        this._collector.emit(fields, fields.get(0).toString());
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(_scheme.getOutputFields());
    }

    @Override
    public void close() {
    }

    @Override
    public void activate() {
        super.activate();
        active = true;
    }

    @Override
    public void deactivate() {
        super.deactivate();
        active = false;
    }
}
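/*
 * Usage sketch (illustrative, not part of the original source): one plausible way to
 * wire this spout into a Storm topology, feeding seed URLs from text files in a
 * directory. The StringTabScheme below is an assumption about which Scheme
 * implementation is used to deserialize each line; substitute whatever Scheme your
 * topology actually relies on.
 *
 *   TopologyBuilder builder = new TopologyBuilder();
 *   // read ./seeds/*.txt, one URL per line; blank lines and '#' comments are skipped
 *   builder.setSpout("filespout",
 *           new FileSpout("./seeds", "*.txt", new StringTabScheme()));
 */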