/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.storm.crawler.spout;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryIteratorException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import backtype.storm.spout.Scheme;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;

/**
 * Reads the lines of one or more UTF-8 text files and emits them as tuples.
 * The entire content of every input file is loaded into memory when the spout
 * is opened; blank lines are ignored.
 */
@SuppressWarnings("serial")
public class FileSpout extends BaseRichSpout {

    public static final Logger LOG = LoggerFactory.getLogger(FileSpout.class);

    private SpoutOutputCollector _collector;

    private String[] _inputFiles;

    private Scheme _scheme;

    /** Lines read from the input files, encoded as UTF-8, waiting to be emitted. */
    private LinkedList<byte[]> toPut = new LinkedList<>();

    /** Set by {@link #activate()} / {@link #deactivate()}; nothing is emitted while false. */
    private boolean active;

    /**
     * Uses every file of a directory whose name matches a glob as input.
     * Failures while listing the directory are logged and the files found so
     * far (possibly none) are kept.
     *
     * @param dir
     *            directory containing the input files
     * @param filter
     *            glob applied to the entries of {@code dir}
     * @param scheme
     *            converts each line into the fields of a tuple
     */
    public FileSpout(String dir, String filter, Scheme scheme) {
        Path pdir = Paths.get(dir);
        List<String> f = new LinkedList<>();
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(pdir,
                filter)) {
            for (Path entry : stream) {
                String inputFile = entry.toAbsolutePath().toString();
                f.add(inputFile);
                LOG.info("Input : {}", inputFile);
            }
        } catch (IOException | DirectoryIteratorException e) {
            // the iterator reports I/O failures as the unchecked
            // DirectoryIteratorException, so both must be caught here
            LOG.error("Unable to list input files in {} with filter {}", dir,
                    filter, e);
        }
        if (f.isEmpty()) {
            LOG.warn("No input file found in {} with filter {}", dir, filter);
        }
        _inputFiles = f.toArray(new String[f.size()]);
        _scheme = scheme;
    }

    /**
     * Uses a single file as input.
     *
     * @param file
     *            path of the input file
     * @param scheme
     *            converts each line into the fields of a tuple
     */
    public FileSpout(String file, Scheme scheme) {
        this(scheme, file);
    }

    /**
     * Uses an explicit list of files as input.
     *
     * @param scheme
     *            converts each line into the fields of a tuple
     * @param files
     *            paths of the input files
     * @throws IllegalArgumentException
     *             if no file is given
     */
    public FileSpout(Scheme scheme, String... files) {
        if (files.length == 0) {
            throw new IllegalArgumentException(
                    "Must configure at least one inputFile");
        }
        _scheme = scheme;
        // copy so that later changes to the caller's array do not affect us
        _inputFiles = files.clone();
    }

    /**
     * Stores the collector and loads every non-blank line of every input file
     * into memory. A file that cannot be read is logged and skipped; lines
     * already read from it are kept.
     */
    @SuppressWarnings("rawtypes")
    @Override
    public void open(Map conf, TopologyContext context,
            SpoutOutputCollector collector) {
        _collector = collector;

        for (String inputFile : _inputFiles) {
            Path inputPath = Paths.get(inputFile);
            try (BufferedReader reader = Files.newBufferedReader(inputPath,
                    StandardCharsets.UTF_8)) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (StringUtils.isBlank(line)) {
                        continue;
                    }
                    toPut.add(line.getBytes(StandardCharsets.UTF_8));
                }
            } catch (IOException x) {
                LOG.error("Unable to read input file {}", inputFile, x);
            }
        }
    }

    /**
     * Emits the next buffered line, if the spout is active and the buffer is
     * not empty. The string form of the first field produced by the scheme is
     * used as the message id. Lines for which the scheme yields no usable
     * first field are dropped with a warning.
     */
    @Override
    public void nextTuple() {
        if (!active || toPut.isEmpty()) {
            return;
        }
        byte[] head = toPut.removeFirst();
        List<Object> fields = this._scheme.deserialize(head);
        // the first field doubles as the message id, so it must exist
        if (fields == null || fields.isEmpty() || fields.get(0) == null) {
            LOG.warn("Skipping line which could not be deserialized : {}",
                    new String(head, StandardCharsets.UTF_8));
            return;
        }
        this._collector.emit(fields, fields.get(0).toString());
    }

    /** Declares the output fields defined by the scheme. */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(_scheme.getOutputFields());
    }

    /** Nothing to release. */
    @Override
    public void close() {
    }

    /** Allows {@link #nextTuple()} to emit. */
    @Override
    public void activate() {
        super.activate();
        active = true;
    }

    /** Stops {@link #nextTuple()} from emitting; buffered lines are kept. */
    @Override
    public void deactivate() {
        super.deactivate();
        active = false;
    }
}