/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.source.extractor.utils;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.ArrayList;

import gobblin.configuration.ConfigurationKeys;


/**
 * Reads CSV data from an {@link InputStream}, {@link Reader}, {@link BufferedReader} or
 * {@link String} and returns each record as a list of field values.
 *
 * <p>Fields are split on a configurable separator (default {@code ','}) and may be wrapped in a
 * configurable enclosing character (default {@code '"'}). Inside an enclosed field the separator is
 * part of the value, every end-of-line reported by the tokenizer is appended as {@code "\n"}, and a
 * doubled enclosing character stands for one literal enclosing character. A field for which no text
 * was read is reported as {@code null}.
 *
 * @author nveeramr
 */
public class InputStreamCSVReader {
  private static final char DEFAULT_SEPARATOR = ',';
  private static final char DEFAULT_ENCLOSED_CHAR = '\"';

  private final StreamTokenizer parser;
  private final char separator;
  private final char enclosedChar;

  /** Largest field count seen so far; used only to pre-size the list of the next record. */
  private int maxFieldCount;
  private boolean atEOF;

  /** Creates a reader over {@code input} using the default separator and enclosing character. */
  public InputStreamCSVReader(Reader input) {
    this(new BufferedReader(input));
  }

  /**
   * Creates a reader over {@code input}, decoded with
   * {@link ConfigurationKeys#DEFAULT_CHARSET_ENCODING}, using the default separator and enclosing
   * character.
   */
  public InputStreamCSVReader(InputStream input) {
    this(new InputStreamReader(input, ConfigurationKeys.DEFAULT_CHARSET_ENCODING));
  }

  /** Creates a reader over {@code input} using the default separator and enclosing character. */
  public InputStreamCSVReader(BufferedReader input) {
    this(input, DEFAULT_SEPARATOR, DEFAULT_ENCLOSED_CHAR);
  }

  /**
   * Creates a reader over the text of {@code input} using the default separator and enclosing
   * character.
   */
  public InputStreamCSVReader(String input) {
    this(new InputStreamReader(
        new ByteArrayInputStream(input.getBytes(ConfigurationKeys.DEFAULT_CHARSET_ENCODING)),
        ConfigurationKeys.DEFAULT_CHARSET_ENCODING), DEFAULT_SEPARATOR, DEFAULT_ENCLOSED_CHAR);
  }

  /** Creates a reader over {@code input} with a custom separator and the default enclosing character. */
  public InputStreamCSVReader(Reader input, char customizedSeparator) {
    this(new BufferedReader(input), customizedSeparator, DEFAULT_ENCLOSED_CHAR);
  }

  /**
   * Creates a reader over {@code input}, decoded with
   * {@link ConfigurationKeys#DEFAULT_CHARSET_ENCODING}, with a custom separator and the default
   * enclosing character.
   */
  public InputStreamCSVReader(InputStream input, char customizedSeparator) {
    this(new InputStreamReader(input, ConfigurationKeys.DEFAULT_CHARSET_ENCODING), customizedSeparator,
        DEFAULT_ENCLOSED_CHAR);
  }

  /** Creates a reader over {@code input} with a custom separator and the default enclosing character. */
  public InputStreamCSVReader(BufferedReader input, char customizedSeparator) {
    this(input, customizedSeparator, DEFAULT_ENCLOSED_CHAR);
  }

  /**
   * Creates a reader over the text of {@code input} with a custom separator and the default
   * enclosing character.
   */
  public InputStreamCSVReader(String input, char customizedSeparator) {
    this(new InputStreamReader(
        new ByteArrayInputStream(input.getBytes(ConfigurationKeys.DEFAULT_CHARSET_ENCODING)),
        ConfigurationKeys.DEFAULT_CHARSET_ENCODING), customizedSeparator, DEFAULT_ENCLOSED_CHAR);
  }

  /** Creates a reader over {@code input} with a custom separator and enclosing character. */
  public InputStreamCSVReader(Reader input, char customizedSeparator, char enclosedChar) {
    this(new BufferedReader(input), customizedSeparator, enclosedChar);
  }

  /**
   * Creates a reader over {@code input}, decoded with
   * {@link ConfigurationKeys#DEFAULT_CHARSET_ENCODING}, with a custom separator and enclosing
   * character.
   */
  public InputStreamCSVReader(InputStream input, char customizedSeparator, char enclosedChar) {
    this(new InputStreamReader(input, ConfigurationKeys.DEFAULT_CHARSET_ENCODING), customizedSeparator,
        enclosedChar);
  }

  /** Creates a reader over the text of {@code input} with a custom separator and enclosing character. */
  public InputStreamCSVReader(String input, char customizedSeparator, char enclosedChar) {
    this(new InputStreamReader(
        new ByteArrayInputStream(input.getBytes(ConfigurationKeys.DEFAULT_CHARSET_ENCODING)),
        ConfigurationKeys.DEFAULT_CHARSET_ENCODING), customizedSeparator, enclosedChar);
  }

  /**
   * Creates a reader over {@code input}; every other constructor ends up here.
   *
   * @param input source of the CSV text
   * @param separator character that separates fields
   * @param enclosedChar character that opens and closes an enclosed (quoted) field
   */
  public InputStreamCSVReader(BufferedReader input, char separator, char enclosedChar) {
    this.separator = separator;
    // Remember the enclosing character so the parser compares against it rather than a fixed '"'.
    this.enclosedChar = enclosedChar;

    // parser settings for the separator and escape chars: everything in 0..255 is part of a word,
    // except the two structural characters (returned as their own tokens) and the line terminators.
    this.parser = new StreamTokenizer(input);
    this.parser.ordinaryChars(0, 255);
    this.parser.wordChars(0, 255);
    this.parser.ordinaryChar(enclosedChar);
    this.parser.ordinaryChar(separator);
    this.parser.eolIsSignificant(true);
    this.parser.whitespaceChars('\n', '\n');
    this.parser.whitespaceChars('\r', '\r');
    this.atEOF = false;
  }

  /**
   * Reads the next record from the stream without skipping empty records.
   *
   * @return the fields of the next record, or {@code null} if end of input was already reached
   * @throws IOException if reading fails or the input is malformed ({@link CSVParseException})
   */
  public ArrayList<String> splitRecord() throws IOException {
    return this.getNextRecordFromStream();
  }

  /**
   * Reads the next non-empty record. A record is skipped when it has no fields, or exactly one field
   * that is {@code null} or the empty string.
   *
   * @return the fields of the next non-empty record, or {@code null} once the input is exhausted
   * @throws IOException if reading fails or the input is malformed ({@link CSVParseException})
   */
  public ArrayList<String> nextRecord() throws IOException {
    ArrayList<String> record = this.getNextRecordFromStream();
    while (record != null && isEmptyRecord(record)) {
      record = this.getNextRecordFromStream();
    }
    return record;
  }

  /** Tells whether {@code record} has no fields, or a single {@code null}/empty field. */
  private static boolean isEmptyRecord(ArrayList<String> record) {
    if (record.isEmpty()) {
      return true;
    }
    if (record.size() == 1) {
      String val = record.get(0);
      return val == null || val.length() == 0;
    }
    return false;
  }

  /**
   * Consumes tokens up to and including the next end-of-line or end-of-file and assembles them into
   * one record.
   *
   * @return the parsed record, or {@code null} if end of file was reached by an earlier call
   */
  private ArrayList<String> getNextRecordFromStream() throws IOException {
    if (this.atEOF) {
      return null;
    }

    ArrayList<String> record = new ArrayList<>(this.maxFieldCount);
    StringBuilder fieldValue = null;

    while (true) {
      int token = this.parser.nextToken();

      if (token == StreamTokenizer.TT_EOF) {
        addField(record, fieldValue);
        this.atEOF = true;
        break;
      }

      if (token == StreamTokenizer.TT_EOL) {
        addField(record, fieldValue);
        break;
      }

      if (token == this.separator) {
        addField(record, fieldValue);
        fieldValue = null;
        continue;
      }

      if (token == StreamTokenizer.TT_WORD) {
        if (fieldValue != null) {
          throw new CSVParseException("Unknown error", this.parser.lineno());
        }
        fieldValue = new StringBuilder(this.parser.sval);
        continue;
      }

      if (token == this.enclosedChar) {
        if (fieldValue != null) {
          throw new CSVParseException("Found unescaped quote. A value with quote should be within a quote",
              this.parser.lineno());
        }
        fieldValue = readEnclosedField();
      }
    }

    if (record.size() > this.maxFieldCount) {
      this.maxFieldCount = record.size();
    }

    return record;
  }

  /**
   * Reads the remainder of an enclosed field; the opening enclosing character has already been
   * consumed. The token that follows the closing enclosing character is pushed back so the caller
   * sees it next.
   *
   * @return the field content, or {@code null} if nothing was read between the enclosing characters
   * @throws CSVParseException if end of file is reached before the field is closed, or if text
   *         directly follows the closing enclosing character
   */
  private StringBuilder readEnclosedField() throws IOException {
    StringBuilder fieldValue = null;

    while (true) {
      int token = this.parser.nextToken();

      if (token == StreamTokenizer.TT_EOF) {
        this.atEOF = true;
        throw new CSVParseException("EOF reached before closing an opened quote", this.parser.lineno());
      }

      if (token == this.separator) {
        // Inside an enclosed field the separator is ordinary content.
        fieldValue = appendFieldValue(fieldValue, token);
        continue;
      }

      if (token == StreamTokenizer.TT_EOL) {
        fieldValue = appendFieldValue(fieldValue, "\n");
        continue;
      }

      if (token == StreamTokenizer.TT_WORD) {
        fieldValue = appendFieldValue(fieldValue, this.parser.sval);
        continue;
      }

      if (token == this.enclosedChar) {
        int nextToken = this.parser.nextToken();

        if (nextToken == this.enclosedChar) {
          // A doubled enclosing character is an escaped literal one.
          fieldValue = appendFieldValue(fieldValue, nextToken);
          continue;
        }

        if (nextToken == StreamTokenizer.TT_WORD) {
          throw new CSVParseException("Not expecting more text after end quote", this.parser.lineno());
        }

        // Separator, end-of-line or end-of-file: hand it back to the record-level loop.
        this.parser.pushBack();
        return fieldValue;
      }
    }
  }

  /** Appends the character whose code is {@code token}, creating the builder if necessary. */
  private static StringBuilder appendFieldValue(StringBuilder fieldValue, int token) {
    return appendFieldValue(fieldValue, "" + (char) token);
  }

  /** Appends {@code token} to {@code fieldValue}, creating the builder if it is {@code null}. */
  private static StringBuilder appendFieldValue(StringBuilder fieldValue, String token) {
    if (fieldValue == null) {
      fieldValue = new StringBuilder();
    }

    fieldValue.append(token);
    return fieldValue;
  }

  /** Adds the field to {@code record}; a {@code null} builder is stored as a {@code null} field. */
  private static void addField(ArrayList<String> record, StringBuilder fieldValue) {
    record.add(fieldValue == null ? null : fieldValue.toString());
  }

  /**
   * Signals malformed CSV input. The number it carries is the tokenizer's line number at the point
   * where the problem was detected.
   */
  public static class CSVParseException extends IOException {
    private static final long serialVersionUID = 1L;
    final int recordNumber;

    CSVParseException(String message, int lineno) {
      super(message);
      this.recordNumber = lineno;
    }

    CSVParseException(int i) {
      this.recordNumber = i;
    }

    /** Returns the number passed at construction (the tokenizer line number for parser errors). */
    public int getRecordNumber() {
      return this.recordNumber;
    }
  }
}