/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.source.extractor.filebased;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Iterator;
import java.util.regex.Pattern;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;
import com.opencsv.CSVReader;
import lombok.extern.slf4j.Slf4j;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
/**
 * A {@link FileDownloader} that opens a single CSV file and iterates over it row by row,
 * where each row is the {@code String[]} of fields produced by OpenCSV's {@link CSVReader}.
 *
 * <p>Behavior is tuned through the work unit state: {@link #DELIMITER} selects a custom
 * single-character field separator and {@link #SKIP_TOP_ROWS_REGEX} skips leading rows
 * (for example comment lines) before records are returned.
 */
@Slf4j
public class CsvFileDownloader extends FileDownloader<String[]> {
  /** Common prefix of all configuration keys understood by this downloader. */
  public static final String CSV_DOWNLOADER_PREFIX = "source.csv_file.";

  /**
   * Key of the regex used to skip leading rows: rows are dropped from the top of the file for as
   * long as their first column fully matches this regex.
   */
  public static final String SKIP_TOP_ROWS_REGEX = CSV_DOWNLOADER_PREFIX + "skip_top_rows_regex";

  /**
   * Key of the field delimiter. After trimming, the value must be exactly one character.
   * NOTE(review): because the value is trimmed, a whitespace delimiter such as a tab is rejected.
   */
  public static final String DELIMITER = CSV_DOWNLOADER_PREFIX + "delimiter";

  public CsvFileDownloader(FileBasedExtractor<?, ?> fileBasedExtractor) {
    super(fileBasedExtractor);
  }

  /**
   * Provides an iterator over the rows of {@code file} via OpenCSV's {@link CSVReader}.
   *
   * <p>If {@link #SKIP_TOP_ROWS_REGEX} is set, leading rows whose first column matches the regex
   * are skipped; this is useful when a CSV file starts with a variable number of comment rows.
   * Afterwards, if the extractor is configured to skip the first record, one more row is skipped.
   *
   * <p>The reader is registered with the extractor's closer, so this method does not close it.
   *
   * {@inheritDoc}
   *
   * @param file path of the file to read
   * @return an iterator positioned at the first row that should be emitted
   * @throws IOException if the underlying file stream cannot be obtained
   * @throws IllegalArgumentException if the configured delimiter is not a single character
   * @throws java.util.regex.PatternSyntaxException if the configured skip regex is invalid
   * @see gobblin.source.extractor.filebased.FileDownloader#downloadFile(java.lang.String)
   */
  @SuppressWarnings("unchecked")
  @Override
  public Iterator<String[]> downloadFile(String file) throws IOException {
    log.info("Beginning to download file: {}", file);
    final State state = fileBasedExtractor.workUnitState;

    CSVReader reader = openReader(file, state);
    PeekingIterator<String[]> iterator = Iterators.peekingIterator(reader.iterator());

    if (state.contains(SKIP_TOP_ROWS_REGEX)) {
      String regex = state.getProp(SKIP_TOP_ROWS_REGEX);
      log.info("Trying to skip with regex: {}", regex);
      skipMatchingTopRows(iterator, regex);
    }

    if (this.fileBasedExtractor.isShouldSkipFirstRecord() && iterator.hasNext()) {
      log.info("Skipping first record");
      iterator.next();
    }
    return iterator;
  }

  /**
   * Opens {@code file} as a {@link CSVReader}, honoring the optional {@link #DELIMITER} setting,
   * and registers the reader with the extractor's closer.
   */
  private CSVReader openReader(String file, State state) throws IOException {
    try {
      // Resolve and validate the delimiter first so that an invalid configuration fails
      // before any stream has been opened.
      Character delimiter = null;
      if (state.contains(DELIMITER)) {
        String delimiterStr = state.getProp(DELIMITER).trim();
        Preconditions.checkArgument(delimiterStr.length() == 1, "Delimiter should be a character.");
        delimiter = delimiterStr.charAt(0);
        log.info("Using {} as a delimiter.", delimiter);
      }

      InputStreamReader input = new InputStreamReader(
          this.fileBasedExtractor.getFsHelper().getFileStream(file),
          ConfigurationKeys.DEFAULT_CHARSET_ENCODING);
      CSVReader reader = (delimiter == null)
          ? new CSVReader(input)
          : new CSVReader(input, delimiter.charValue());
      return this.fileBasedExtractor.getCloser().register(reader);
    } catch (FileBasedHelperException e) {
      throw new IOException(e);
    }
  }

  /**
   * Advances {@code rows} past every leading row whose first column fully matches {@code regex},
   * stopping at the first empty row or the first row that does not match.
   */
  private static void skipMatchingTopRows(PeekingIterator<String[]> rows, String regex) {
    // Compile once instead of once per row (String#matches recompiles on every call).
    Pattern skipPattern = Pattern.compile(regex);
    while (rows.hasNext()) {
      String[] row = rows.peek();
      if (row.length == 0 || !skipPattern.matcher(row[0]).matches()) {
        return;
      }
      rows.next();
    }
  }
}