//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.collectionreaders; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import org.apache.commons.io.IOUtils; import org.apache.uima.UimaContext; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import uk.gov.dstl.baleen.core.utils.BaleenDefaults; import uk.gov.dstl.baleen.exceptions.InvalidParameterException; import uk.gov.dstl.baleen.types.metadata.Metadata; import uk.gov.dstl.baleen.uima.BaleenCollectionReader; import uk.gov.dstl.baleen.uima.IContentExtractor; /** * Reads a file, and treats each line as a separate document. * Once completed, the pipeline will stay active (because pipelines are persistent in Baleen), * but the file will not be monitored for changes and no further documents will be processed. * * This could be useful for processing a CSV, where you want each line of the CSV to be processed separately. * * @baleen.javadoc */ public class LineReader extends BaleenCollectionReader { /** * The file to process * * @baleen.config */ public static final String PARAM_FILE = "file"; @ConfigurationParameter(name = PARAM_FILE, defaultValue = "") private File file; /** * The content extractor to use to extract content from files * * @baleen.config Value of BaleenDefaults.DEFAULT_CONTENT_EXTRACTOR */ public static final String PARAM_CONTENT_EXTRACTOR = "contentExtractor"; @ConfigurationParameter(name = PARAM_CONTENT_EXTRACTOR, defaultValue=BaleenDefaults.DEFAULT_CONTENT_EXTRACTOR) private String contentExtractor; private BufferedReader br; private String line; private Integer lineNumber = 0; private IContentExtractor extractor; @Override protected void doInitialize(UimaContext context) throws ResourceInitializationException { if(file == null || !file.canRead() || !file.isFile()){ throw new ResourceInitializationException( new InvalidParameterException("Specified parameter '"+PARAM_FILE+"' was not valid") ); } try{ br = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8)); }catch(IOException ioe){ throw new ResourceInitializationException(ioe); } try{ extractor = getContentExtractor(contentExtractor); }catch(InvalidParameterException ipe){ throw new ResourceInitializationException(ipe); } extractor.initialize(context, getConfigParameters(context)); } @Override public boolean doHasNext() throws IOException, CollectionException { if(br == null) return false; while((line = br.readLine()) != null){ lineNumber++; line = line.trim(); if(!line.isEmpty()){ return true; } } try{ br.close(); }catch(IOException ioe){ getMonitor().debug("An error occurred when closing the BufferedReader", ioe); } br = null; return false; } @Override protected void doGetNext(JCas jCas) throws IOException, CollectionException { InputStream is = IOUtils.toInputStream(line, Charset.defaultCharset()); extractor.processStream(is, file.getPath() + "#" + lineNumber, jCas); Metadata md = new Metadata(jCas); md.setKey("lineNumber"); md.setValue(lineNumber.toString()); getSupport().add(md); } @Override protected void doClose() throws IOException { if(br != null){ try{ br.close(); }catch(IOException ioe){ getMonitor().debug("An error occurred when closing the BufferedReader", ioe); } br = null; } if(extractor != null) { extractor.destroy(); extractor = null; } } }