/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.source.extractor.filebased; import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.util.Iterator; import java.util.List; import lombok.Getter; import org.apache.commons.lang3.reflect.ConstructorUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.collect.Lists; import com.google.common.io.Closer; import gobblin.configuration.ConfigurationKeys; import gobblin.configuration.WorkUnitState; import gobblin.instrumented.extractor.InstrumentedExtractor; import gobblin.metrics.Counters; import gobblin.source.extractor.DataRecordException; import gobblin.source.workunit.WorkUnit; /** * Abstract class for file based extractors * * @author stakiar * * @param <S> * type of schema * @param <D> * type of data record */ public class FileBasedExtractor<S, D> extends InstrumentedExtractor<S, D> { private static final Logger LOG = LoggerFactory.getLogger(FileBasedExtractor.class); protected final WorkUnit workUnit; protected final WorkUnitState workUnitState; protected final List<String> filesToPull; protected final FileDownloader<D> fileDownloader; private final int statusCount; private long totalRecordCount = 0; private Iterator<D> currentFileItr; private String currentFile; private boolean hasNext = false; @Getter protected final Closer closer = Closer.create(); @Getter private final boolean shouldSkipFirstRecord; @Getter protected final SizeAwareFileBasedHelper fsHelper; protected enum CounterNames { FileBytesRead; } protected Counters<CounterNames> counters = new Counters<>(); @SuppressWarnings("unchecked") public FileBasedExtractor(WorkUnitState workUnitState, FileBasedHelper fsHelper) { super(workUnitState); this.workUnitState = workUnitState; this.workUnit = workUnitState.getWorkunit(); this.filesToPull = Lists.newArrayList(workUnitState.getPropAsList(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL, "")); this.statusCount = this.workUnit.getPropAsInt(ConfigurationKeys.FILEBASED_REPORT_STATUS_ON_COUNT, ConfigurationKeys.DEFAULT_FILEBASED_REPORT_STATUS_ON_COUNT); this.shouldSkipFirstRecord = this.workUnitState.getPropAsBoolean(ConfigurationKeys.SOURCE_SKIP_FIRST_RECORD, false); if (fsHelper instanceof SizeAwareFileBasedHelper) { this.fsHelper = (SizeAwareFileBasedHelper) fsHelper; } else { this.fsHelper = new SizeAwareFileBasedHelperDecorator(fsHelper); } try { this.fsHelper.connect(); } catch (FileBasedHelperException e) { throw new RuntimeException(e); } if (workUnitState.contains(ConfigurationKeys.SOURCE_FILEBASED_OPTIONAL_DOWNLOADER_CLASS)) { try { this.fileDownloader = (FileDownloader<D>) ConstructorUtils.invokeConstructor( Class.forName(workUnitState.getProp(ConfigurationKeys.SOURCE_FILEBASED_OPTIONAL_DOWNLOADER_CLASS)), this); } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | InstantiationException | ClassNotFoundException e) { throw new RuntimeException(e); } } else { this.fileDownloader = new SingleFileDownloader<>(this); } this.counters.initialize(getMetricContext(), CounterNames.class, this.getClass()); } /** * Initializes a list of files to pull on the first call to the method * Iterates through the file and returns a new record upon each call until * there are no more records left in the file, then it moves on to the next * file */ @Override public D readRecordImpl(@Deprecated D reuse) throws DataRecordException, IOException { this.totalRecordCount++; if (this.statusCount > 0 && this.totalRecordCount % this.statusCount == 0) { LOG.info("Total number of records processed so far: " + this.totalRecordCount); } // If records have been read, check the hasNext value, if not then get the next file to process if (this.currentFile != null && this.currentFileItr != null) { this.hasNext = this.currentFileItr.hasNext(); // If the current file is done, move to the next one if (!this.hasNext) { getNextFileToRead(); } } else { // If no records have been read yet, get the first file to process getNextFileToRead(); } if (this.hasNext) { return this.currentFileItr.next(); } LOG.info("Finished reading records from all files"); return null; } /** * If a previous file has been read, first close that file. Then search through {@link #filesToPull} to find the first * non-empty file. */ private void getNextFileToRead() throws IOException { if (this.currentFile != null && this.currentFileItr != null) { closeCurrentFile(); incrementBytesReadCounter(); } while (!this.hasNext && !this.filesToPull.isEmpty()) { this.currentFile = this.filesToPull.remove(0); this.currentFileItr = downloadFile(this.currentFile); this.hasNext = this.currentFileItr == null ? false : this.currentFileItr.hasNext(); LOG.info("Will start downloading file: " + this.currentFile); } } @SuppressWarnings("unchecked") @Override public S getSchema() { return (S) this.workUnit.getProp(ConfigurationKeys.SOURCE_SCHEMA); } /** * Gets a list of commands that will get the expected record count from the * source, executes the commands, and then parses the output for the count * * @return the expected record count */ @Override public long getExpectedRecordCount() { return -1; } /** * Gets a list of commands that will get the high watermark from the source, * executes the commands, and then parses the output for the watermark * * @return the high watermark */ @Override public long getHighWatermark() { LOG.info("High Watermark is -1 for file based extractors"); return -1; } /** * Downloads a file from the source * * @param file * is the file to download * @return an iterator over the file * TODO Add support for different file formats besides text e.g. avro iterator, byte iterator, json iterator. */ public Iterator<D> downloadFile(String file) throws IOException { return this.fileDownloader.downloadFile(file); } /** * Closes the current file being read. */ public void closeCurrentFile() { try { this.closer.close(); } catch (IOException e) { if (this.currentFile != null) { LOG.error("Failed to close file: " + this.currentFile, e); } } } @Override public void close() throws IOException { try { this.fsHelper.close(); } catch (IOException e) { LOG.error("Could not successfully close file system helper due to error: " + e.getMessage(), e); } } private void incrementBytesReadCounter() { try { this.counters.inc(CounterNames.FileBytesRead, this.fsHelper.getFileSize(this.currentFile)); } catch (FileBasedHelperException e) { LOG.info("Unable to get file size. Will skip increment to bytes counter " + e.getMessage()); LOG.debug(e.getMessage(), e); } catch (UnsupportedOperationException e) { LOG.info("Unable to get file size. Will skip increment to bytes counter " + e.getMessage()); LOG.debug(e.getMessage(), e); } } }