package org.embulk.standards; import com.google.common.base.Optional; import com.google.common.collect.ImmutableSet; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonValue; import org.embulk.config.Task; import org.embulk.config.Config; import org.embulk.config.ConfigDefault; import org.embulk.config.ConfigSource; import org.embulk.config.ConfigException; import org.embulk.config.TaskSource; import org.embulk.spi.time.TimestampParser; import org.embulk.spi.time.TimestampParseException; import org.embulk.spi.json.JsonParser; import org.embulk.spi.json.JsonParseException; import org.embulk.spi.Column; import org.embulk.spi.Schema; import org.embulk.spi.SchemaConfig; import org.embulk.spi.ColumnVisitor; import org.embulk.spi.PageBuilder; import org.embulk.spi.ParserPlugin; import org.embulk.spi.Exec; import org.embulk.spi.FileInput; import org.embulk.spi.PageOutput; import org.embulk.spi.DataException; import org.embulk.spi.util.LineDecoder; import org.embulk.spi.util.Timestamps; import org.slf4j.Logger; public class CsvParserPlugin implements ParserPlugin { private static final ImmutableSet<String> TRUE_STRINGS = ImmutableSet.of( "true", "True", "TRUE", "yes", "Yes", "YES", "t", "T", "y", "Y", "on", "On", "ON", "1"); public interface PluginTask extends Task, LineDecoder.DecoderTask, TimestampParser.Task { @Config("columns") SchemaConfig getSchemaConfig(); @Config("header_line") @ConfigDefault("null") Optional<Boolean> getHeaderLine(); @Config("skip_header_lines") @ConfigDefault("0") int getSkipHeaderLines(); void setSkipHeaderLines(int n); @Config("delimiter") @ConfigDefault("\",\"") String getDelimiter(); @Config("quote") @ConfigDefault("\"\\\"\"") Optional<QuoteCharacter> getQuoteChar(); @Config("escape") @ConfigDefault("\"\\\\\"") Optional<EscapeCharacter> getEscapeChar(); // Null value handling: if the CsvParser found 'non-quoted empty string's, // it replaces them to string that users specified like "\N", "NULL". @Config("null_string") @ConfigDefault("null") Optional<String> getNullString(); @Config("trim_if_not_quoted") @ConfigDefault("false") boolean getTrimIfNotQuoted(); @Config("max_quoted_size_limit") @ConfigDefault("131072") //128kB long getMaxQuotedSizeLimit(); @Config("comment_line_marker") @ConfigDefault("null") Optional<String> getCommentLineMarker(); @Config("allow_optional_columns") @ConfigDefault("false") boolean getAllowOptionalColumns(); @Config("allow_extra_columns") @ConfigDefault("false") boolean getAllowExtraColumns(); @Config("stop_on_invalid_record") @ConfigDefault("false") boolean getStopOnInvalidRecord(); } public static class QuoteCharacter { private final char character; public QuoteCharacter(char character) { this.character = character; } public static QuoteCharacter noQuote() { return new QuoteCharacter(CsvTokenizer.NO_QUOTE); } @JsonCreator public static QuoteCharacter ofString(String str) { if (str.length() >= 2) { throw new ConfigException("\"quote\" option accepts only 1 character."); } else if (str.isEmpty()) { Exec.getLogger(CsvParserPlugin.class).warn("Setting '' (empty string) to \"quote\" option is obsoleted. Currently it becomes '\"' automatically but this behavior will be removed. Please set '\"' explicitly."); return new QuoteCharacter('"'); } else { return new QuoteCharacter(str.charAt(0)); } } @JsonIgnore public char getCharacter() { return character; } @JsonValue public String getOptionalString() { return new String(new char[] { character }); } @Override public boolean equals(Object obj) { if (!(obj instanceof QuoteCharacter)) { return false; } QuoteCharacter o = (QuoteCharacter) obj; return character == o.character; } } public static class EscapeCharacter { private final char character; public EscapeCharacter(char character) { this.character = character; } public static EscapeCharacter noEscape() { return new EscapeCharacter(CsvTokenizer.NO_ESCAPE); } @JsonCreator public static EscapeCharacter ofString(String str) { if (str.length() >= 2) { throw new ConfigException("\"escape\" option accepts only 1 character."); } else if (str.isEmpty()) { Exec.getLogger(CsvParserPlugin.class).warn("Setting '' (empty string) to \"escape\" option is obsoleted. Currently it becomes null automatically but this behavior will be removed. Please set \"escape: null\" explicitly."); return noEscape(); } else { return new EscapeCharacter(str.charAt(0)); } } @JsonIgnore public char getCharacter() { return character; } @JsonValue public String getOptionalString() { return new String(new char[] { character }); } @Override public boolean equals(Object obj) { if (!(obj instanceof EscapeCharacter)) { return false; } EscapeCharacter o = (EscapeCharacter) obj; return character == o.character; } } private final Logger log; public CsvParserPlugin() { log = Exec.getLogger(CsvParserPlugin.class); } @Override public void transaction(ConfigSource config, ParserPlugin.Control control) { PluginTask task = config.loadConfig(PluginTask.class); // backward compatibility if (task.getHeaderLine().isPresent()) { if (task.getSkipHeaderLines() > 0) { throw new ConfigException("'header_line' option is invalid if 'skip_header_lines' is set."); } if (task.getHeaderLine().get()) { task.setSkipHeaderLines(1); } else { task.setSkipHeaderLines(0); } } control.run(task.dump(), task.getSchemaConfig().toSchema()); } @Override public void run(TaskSource taskSource, final Schema schema, FileInput input, PageOutput output) { PluginTask task = taskSource.loadTask(PluginTask.class); final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getSchemaConfig()); final JsonParser jsonParser = new JsonParser(); final CsvTokenizer tokenizer = new CsvTokenizer(new LineDecoder(input, task), task); final boolean allowOptionalColumns = task.getAllowOptionalColumns(); final boolean allowExtraColumns = task.getAllowExtraColumns(); final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord(); final int skipHeaderLines = task.getSkipHeaderLines(); try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) { while (tokenizer.nextFile()) { // skip the header lines for each file for (int skipHeaderLineNumber = skipHeaderLines; skipHeaderLineNumber > 0; skipHeaderLineNumber--) { if (!tokenizer.skipHeaderLine()) { break; } } if (!tokenizer.nextRecord()) { // empty file continue; } while (true) { boolean hasNextRecord; try { schema.visitColumns(new ColumnVisitor() { public void booleanColumn(Column column) { String v = nextColumn(); if (v == null) { pageBuilder.setNull(column); } else { pageBuilder.setBoolean(column, TRUE_STRINGS.contains(v)); } } public void longColumn(Column column) { String v = nextColumn(); if (v == null) { pageBuilder.setNull(column); } else { try { pageBuilder.setLong(column, Long.parseLong(v)); } catch (NumberFormatException e) { // TODO support default value throw new CsvRecordValidateException(e); } } } public void doubleColumn(Column column) { String v = nextColumn(); if (v == null) { pageBuilder.setNull(column); } else { try { pageBuilder.setDouble(column, Double.parseDouble(v)); } catch (NumberFormatException e) { // TODO support default value throw new CsvRecordValidateException(e); } } } public void stringColumn(Column column) { String v = nextColumn(); if (v == null) { pageBuilder.setNull(column); } else { pageBuilder.setString(column, v); } } public void timestampColumn(Column column) { String v = nextColumn(); if (v == null) { pageBuilder.setNull(column); } else { try { pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(v)); } catch (TimestampParseException e) { // TODO support default value throw new CsvRecordValidateException(e); } } } public void jsonColumn(Column column) { String v = nextColumn(); if (v == null) { pageBuilder.setNull(column); } else { try { pageBuilder.setJson(column, jsonParser.parse(v)); } catch (JsonParseException e) { // TODO support default value throw new CsvRecordValidateException(e); } } } private String nextColumn() { if (allowOptionalColumns && !tokenizer.hasNextColumn()) { //TODO warning return null; } return tokenizer.nextColumnOrNull(); } }); try { hasNextRecord = tokenizer.nextRecord(); } catch (CsvTokenizer.TooManyColumnsException ex) { if (allowExtraColumns) { String tooManyColumnsLine = tokenizer.skipCurrentLine(); // TODO warning hasNextRecord = tokenizer.nextRecord(); } else { // this line will be skipped at the following catch section throw ex; } } pageBuilder.addRecord(); } catch (CsvTokenizer.InvalidFormatException | CsvTokenizer.InvalidValueException | CsvRecordValidateException e) { String skippedLine = tokenizer.skipCurrentLine(); long lineNumber = tokenizer.getCurrentLineNumber(); if (stopOnInvalidRecord) { throw new DataException(String.format("Invalid record at line %d: %s", lineNumber, skippedLine), e); } log.warn(String.format("Skipped line %d (%s): %s", lineNumber, e.getMessage(), skippedLine)); //exec.notice().skippedLine(skippedLine); hasNextRecord = tokenizer.nextRecord(); } if (!hasNextRecord) { break; } } } pageBuilder.finish(); } } static class CsvRecordValidateException extends DataException { CsvRecordValidateException(Throwable cause) { super(cause); } } }