/*
 * #!
 * %
 * Copyright (C) 2014 - 2016 Humboldt-Universität zu Berlin
 * %
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #_
 */
package de.hub.cs.dbis.aeolus.spouts;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.tuple.Values;





/**
 * {@link AbstractOrderedFileInputSpout} is an {@link AbstractOrderedInputSpout} that reads input tuples line by line
 * from multiple files (ie, each line must contain exactly one tuple). Each file is an <em>input partition</em> in
 * {@link AbstractOrderedInputSpout} terminology. The default file is {@code input}. Use {@link #INPUT_FILE_NAME} and
 * {@link #INPUT_FILE_SUFFIXES} to specify different file name(s) in the topology configuration (see
 * {@link backtype.storm.Config}).<br/>
 * <br/>
 * <strong>Output schema:</strong> {@code <ts:}{@link Long}{@code ,rawTuple:}{@link String}{@code >}<br/>
 * 
 * @author Leonardo Aniello (Sapienza Università di Roma, Roma, Italy)
 * @author Roberto Baldoni (Sapienza Università di Roma, Roma, Italy)
 * @author Leonardo Querzoni (Sapienza Università di Roma, Roma, Italy)
 * @author mjsax
 */
public abstract class AbstractOrderedFileInputSpout extends AbstractOrderedInputSpout<String> {
	private static final long serialVersionUID = -4690963122364704481L;
	
	private static final Logger logger = LoggerFactory.getLogger(AbstractOrderedFileInputSpout.class);
	
	/**
	 * Can be used to specify an input file name (or prefix together with {@link #INPUT_FILE_SUFFIXES}). The
	 * configuration value is expected to be of type {@link String}.
	 */
	public final static String INPUT_FILE_NAME = "OrderedFileInputSpout.input";
	
	/**
	 * Can be used to specify a list of file name suffixes (one suffix for each input file) if multiple input files are
	 * used. {@link #INPUT_FILE_NAME} is used as file prefix for each file. The configuration value is expected to be of
	 * type {@link List}.
	 */
	public final static String INPUT_FILE_SUFFIXES = "OrderedFileInputSpout.inputFileSuffixes";
	
	/** The prefix of all input file names. */
	private String prefix = "input";
	
	/** All input files to read from. A closed (exhausted) partition is represented by a {@code null} entry. */
	private final ArrayList<BufferedReader> inputFiles = new ArrayList<BufferedReader>();
	
	/** Emit-Round-Robin-Index. */
	private int emitIndex = -1;
	
	/**
	 * Map containing all tuples emitted by the last call of {@link #emitNextTuple(Integer, Long, Object)}. The map
	 * includes the task IDs each tuple was sent to.
	 */
	protected Map<Values, List<Integer>> emitted = new HashMap<Values, List<Integer>>();
	
	
	
	/**
	 * Creates a {@code AbstractOrderedFileInputSpout} which declares fields without explicit stream ID.
	 */
	public AbstractOrderedFileInputSpout() {}
	
	/**
	 * Creates a {@code AbstractOrderedFileInputSpout} which declares fields on stream with ID {@code streamID}.
	 * 
	 * @param streamID
	 *            the ID of the stream the fields ought to be declared on
	 */
	public AbstractOrderedFileInputSpout(String streamID) {
		super(streamID);
	}
	
	/**
	 * Opens all configured input files and distributes them over the component's tasks in a round-robin fashion.
	 * Files that cannot be found are logged and skipped; the number of successfully opened files is forwarded to the
	 * parent as {@code NUMBER_OF_PARTITIONS}.
	 */
	@SuppressWarnings("unchecked")
	@Override
	public void open(@SuppressWarnings("rawtypes") Map conf, TopologyContext context, SpoutOutputCollector collector) {
		String fileName = (String)conf.get(INPUT_FILE_NAME);
		if(fileName != null) {
			this.prefix = fileName;
		}
		
		List<Object> suffixes = (List<Object>)conf.get(INPUT_FILE_SUFFIXES);
		if(suffixes != null) {
			// distribute input files over all tasks using round robin
			int componentTaskCount = context.getComponentTasks(context.getThisComponentId()).size();
			for(int index = context.getThisTaskIndex(); index < suffixes.size(); index += componentTaskCount) {
				try {
					logger.debug("Adding partition input file {}", this.prefix + suffixes.get(index));
					// NOTE(review): FileReader uses the platform default charset — confirm input files match it
					this.inputFiles.add(new BufferedReader(new FileReader(this.prefix + suffixes.get(index))));
				} catch(FileNotFoundException e) {
					logger.error("Input file <{}> not found.", this.prefix + suffixes.get(index));
				}
			}
		} else {
			try {
				logger.debug("Adding single input file {}", this.prefix);
				this.inputFiles.add(new BufferedReader(new FileReader(this.prefix)));
			} catch(FileNotFoundException e) {
				logger.error("Input file <{}> not found.", this.prefix);
			}
		}
		
		@SuppressWarnings("rawtypes")
		Map newConfig = new HashMap(conf); // need to copy into new HashMap because given one is read-only
		newConfig.put(NUMBER_OF_PARTITIONS, Integer.valueOf(this.inputFiles.size()));
		super.open(newConfig, context, collector);
	}
	
	/**
	 * {@inheritDoc}
	 * 
	 * Reads a line from at least one input file, and emits all eligible tuples (including previously buffered ones).
	 * The emitted tuples can be retrieved via {@link #emitted}.
	 */
	@Override
	public void nextTuple() {
		final int numberOfFiles = this.inputFiles.size();
		// try each partition at most once per call, round-robin, starting after the last used one
		for(int i = 0; i < numberOfFiles; ++i) {
			this.emitIndex = (this.emitIndex + 1) % numberOfFiles;
			
			if(this.inputFiles.get(this.emitIndex) == null) { // check for closed partition
				continue;
			}
			
			String line = null;
			Integer emitIndexAsObject = Integer.valueOf(this.emitIndex);
			try {
				logger.trace("Read from partition {}", emitIndexAsObject);
				line = this.inputFiles.get(this.emitIndex).readLine();
			} catch(IOException e) {
				// preserve the stack trace instead of only the message
				logger.error("Reading from partition {} failed.", emitIndexAsObject, e);
			}
			
			if(line != null) {
				try {
					this.emitted = super.emitNextTuple(emitIndexAsObject, Long.valueOf(this.extractTimestamp(line)),
						line);
					logger.trace("Emitted the following tuples {}", this.emitted);
					if(this.emitted.size() != 0) {
						return;
					}
				} catch(ParseException e) {
					// skip lines whose timestamp cannot be extracted, but keep the cause in the log
					logger.error("Could not extract timestamp.", e);
				}
			} else {
				logger.debug("Try to close empty partition {}", emitIndexAsObject);
				if(super.closePartition(emitIndexAsObject)) {
					try {
						this.inputFiles.get(this.emitIndex).close();
					} catch(IOException e) {
						logger.error("Closing input file reader failed.", e);
					}
					// set to null but do not remove from list -> would change partition IDs in super.emitNextTuple
					this.inputFiles.set(this.emitIndex, null);
				} else {
					// we cannot put any more data into this partition -> try to flush buffered tuples
					this.emitted = super.emitNextTuple(null, null, null);
					logger.trace("Emitted the following tuples {}", this.emitted);
				}
			}
		}
	}
	
	/**
	 * Extracts the timestamp from the given tuple.
	 * 
	 * @param tuple
	 *            The tuple to be processed.
	 * 
	 * @return The tuple's timestamp.
	 * 
	 * @throws ParseException
	 *             if the timestamp could not be extracted
	 */
	protected abstract long extractTimestamp(String tuple) throws ParseException;
	
	/**
	 * Closes all still-open input file readers. Already closed partitions ({@code null} entries) are skipped.
	 */
	@Override
	public void close() {
		for(BufferedReader reader : this.inputFiles) {
			try {
				if(reader != null) {
					reader.close();
				}
			} catch(IOException e) {
				logger.error("Closing input file reader failed.", e);
			}
		}
	}
	
}