package com.thinkbiganalytics.discovery.parsers.csv;
/*-
* #%L
* thinkbig-schema-discovery-default
* %%
* Copyright (C) 2017 ThinkBig Analytics
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.csv.QuoteMode;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
import java.util.stream.Collectors;
/**
 * Infers the CSV format (delimiter and quote character) of a sample of delimited text.
 *
 * <p>Detection works on per-line character statistics gathered from the first
 * {@value #MAX_SAMPLE_LINES} lines of the sample: the quote character is chosen by checking that
 * quotes are balanced on every line, and the delimiter is chosen from the delimiter characters
 * that occur on the first line.
 */
class CSVAutoDetect {

    private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(CSVAutoDetect.class);

    /** Maximum number of sample lines inspected when gathering per-line statistics. */
    private static final int MAX_SAMPLE_LINES = 100;

    /**
     * Returns a copy of the map whose iteration order is by descending value.
     * Entries with equal values keep the relative order they had in the source map.
     *
     * @param map the map to sort
     * @return a new insertion-ordered map sorted from highest to lowest value
     */
    private static <K, V extends Comparable<? super V>> Map<K, V> sortMapByValue(Map<K, V> map) {
        return map.entrySet()
            .stream()
            .sorted(Map.Entry.comparingByValue(Collections.reverseOrder()))
            .collect(Collectors.toMap(
                Map.Entry::getKey,
                Map.Entry::getValue,
                (first, second) -> first,
                LinkedHashMap::new
            ));
    }

    /**
     * Parses a sample file to allow schema specification when creating a new feed.
     *
     * @param sampleText the sample text; must not be {@code null}
     * @param headerRow  {@code true} if the first line of the sample holds the column names
     * @return a format configured with the detected delimiter and quote character
     * @throws IOException if no delimiter can be determined or the sample cannot be read or parsed
     */
    public CSVFormat detectCSVFormat(String sampleText, boolean headerRow) throws IOException {
        List<LineStats> lineStats;
        try (BufferedReader br = new BufferedReader(new StringReader(sampleText))) {
            lineStats = generateStats(br);
        }

        Character quote = guessQuote(lineStats);
        Character delim = guessDelimiter(lineStats, sampleText, quote, headerRow);
        if (delim == null) {
            throw new IOException("Unrecognized format");
        }
        LOG.debug("Detected CSV delimiter '{}' and quote '{}'", delim, quote);

        return CSVFormat.DEFAULT
            .withAllowMissingColumnNames()
            .withDelimiter(delim)
            .withQuoteMode(QuoteMode.MINIMAL)
            .withQuote(quote);
    }

    /**
     * Collects character statistics for at most {@value #MAX_SAMPLE_LINES} lines of the reader.
     *
     * <p>The reader is consumed and not rewound. An earlier version used {@code mark}/{@code reset},
     * which made {@code reset} fail with "Mark invalid" once the sampled lines exceeded the
     * read-ahead limit; nothing reads from the reader afterwards, so rewinding is unnecessary.
     *
     * @param br the reader positioned at the start of the sample
     * @return one {@link LineStats} per line read, in input order
     * @throws IOException if reading from the reader fails
     */
    private List<LineStats> generateStats(BufferedReader br) throws IOException {
        List<LineStats> lineStats = new ArrayList<>();
        String line;
        // Check the limit first so that no line is read only to be thrown away.
        while (lineStats.size() < MAX_SAMPLE_LINES && (line = br.readLine()) != null) {
            lineStats.add(new LineStats(line));
        }
        return lineStats;
    }

    /**
     * Reports whether any sampled line contained a backslash-escaped delimiter.
     *
     * @param lineStats the statistics of the sampled lines
     * @return {@code true} if at least one line recorded an escape
     */
    private boolean hasEscapes(List<LineStats> lineStats) {
        return lineStats.stream().anyMatch(stat -> stat.escapes);
    }

    /**
     * Picks the quote character used by the sample.
     *
     * <p>Double quote is tried before single quote. A quote type is accepted when it occurs on at
     * least one line and occurs an even number of times on every line.
     *
     * @param lineStats the statistics of the sampled lines
     * @return the accepted quote character, or the default quote of {@link CSVFormat#DEFAULT}
     */
    private Character guessQuote(List<LineStats> lineStats) {
        Character[] supportedQuotes = {Character.valueOf('"'), Character.valueOf('\'')};
        for (Character quoteType : supportedQuotes) {
            boolean present = lineStats.stream()
                .anyMatch(stat -> stat.containsNoDelimCharOfType(quoteType));
            boolean balanced = present && lineStats.stream()
                .allMatch(stat -> stat.hasLegalQuotedStringOfChar(quoteType));
            if (balanced) {
                return quoteType;
            }
        }
        return CSVFormat.DEFAULT.getQuoteCharacter();
    }

    /**
     * Picks the delimiter used by the sample.
     *
     * <p>Only delimiter characters present on the first line are considered, in order of
     * descending first-line count. For each of them:
     * <ol>
     *   <li>when a header row is expected, the whole sample is parsed with that delimiter and the
     *       delimiter is returned immediately if every record is as wide as the header;</li>
     *   <li>otherwise it becomes a candidate if every sampled line contains it exactly as often
     *       as the first line does.</li>
     * </ol>
     * If no delimiter passes the parse check, the highest-ranked candidate is returned.
     *
     * @param lineStats the statistics of the sampled lines
     * @param value     the complete sample text
     * @param quote     the quote character to use for trial parsing
     * @param headerRow {@code true} if the first line of the sample holds the column names
     * @return the detected delimiter, or {@code null} if none could be determined
     * @throws IOException if trial parsing of the sample fails
     */
    private Character guessDelimiter(List<LineStats> lineStats, String value, Character quote, boolean headerRow) throws IOException {
        // Assume the delimiter exists in the first line and compare to subsequent lines
        if (lineStats.isEmpty()) {
            return null;
        }
        Map<Character, Integer> firstLineDelimCounts = lineStats.get(0).calcDelimCountsOrdered();
        if (firstLineDelimCounts == null || firstLineDelimCounts.isEmpty()) {
            return null;
        }

        Character bestCandidate = null;
        for (Map.Entry<Character, Integer> entry : firstLineDelimCounts.entrySet()) {
            Character delim = entry.getKey();
            // Without a header row the parser exposes no header map, so there is nothing to compare.
            if (headerRow && parsesWithUniformWidth(value, delim, quote)) {
                return delim;
            }
            // Keys are visited from highest to lowest count, so the first match is the best one.
            if (bestCandidate == null && hasSameCountOnEveryLine(lineStats, delim, entry.getValue())) {
                bestCandidate = delim;
            }
        }
        return bestCandidate;
    }

    /**
     * Parses the sample with the first record as header and checks that every following record
     * has as many values as the header has columns.
     *
     * <p>Missing column names are allowed, matching the format that is finally returned by
     * {@link #detectCSVFormat(String, boolean)}.
     */
    private static boolean parsesWithUniformWidth(String value, Character delim, Character quote) throws IOException {
        CSVFormat format = CSVFormat.DEFAULT
            .withAllowMissingColumnNames()
            .withFirstRecordAsHeader()
            .withDelimiter(delim)
            .withQuote(quote);
        try (StringReader reader = new StringReader(value);
             CSVParser parser = format.parse(reader)) {
            Map<String, Integer> headerMap = parser.getHeaderMap();
            if (headerMap == null) {
                return false;
            }
            int width = headerMap.size();
            for (CSVRecord record : parser.getRecords()) {
                if (record.size() != width) {
                    return false;
                }
            }
            return true;
        }
    }

    /** Checks that every line after the first contains the delimiter exactly {@code expected} times. */
    private static boolean hasSameCountOnEveryLine(List<LineStats> lineStats, Character delim, Integer expected) {
        for (int i = 1; i < lineStats.size(); i++) {
            if (!expected.equals(lineStats.get(i).delimStats.get(delim))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Character statistics of a single line of the sample.
     */
    static class LineStats {

        /** Number of unescaped occurrences of each potential delimiter character. */
        Map<Character, Integer> delimStats = new HashMap<>();
        /** Number of occurrences of each tracked non-delimiter character (quotes, angle brackets, backslash). */
        Map<Character, Integer> nonDelimStats = new HashMap<>();
        /** The most recent character processed. */
        Character lastChar;
        /** Set when a delimiter character directly preceded by an unescaped backslash was seen. */
        boolean escapes;
        /**
         * The delimiter character whose first occurrence was seen most recently.
         * NOTE(review): the name suggests the first delimiter on the line, but the value is
         * overwritten each time a new delimiter type appears; not read in this file - confirm intent.
         */
        Character firstDelim;
        /** True while the previous character was a backslash that is not itself escaped. */
        private boolean escapePending;

        /**
         * Scans the line and counts delimiter and tracked non-delimiter characters.
         *
         * @param line a single line of the sample, without line terminator
         */
        public LineStats(String line) {
            for (char c : line.toCharArray()) {
                switch (c) {
                    case ' ':
                    case ':':
                    case ';':
                    case ',':
                    case '|':
                    case '\t':
                    case '+':
                        increment(c, true);
                        break;
                    case '"':
                    case '\'':
                    case '<':
                    case '>':
                    case '\\':
                        increment(c, false);
                        break;
                    default:
                        // Any other character ends a pending escape, so a backslash only
                        // escapes the character that immediately follows it.
                        escapePending = false;
                        lastChar = c;
                        break;
                }
            }
        }

        /**
         * @param quoteType the character to look for
         * @return {@code true} if the character occurred at least once as a tracked non-delimiter
         */
        boolean containsNoDelimCharOfType(Character quoteType) {
            return nonDelimStats.containsKey(quoteType);
        }

        /**
         * @param quoteType the quote character to check
         * @return {@code true} if the character is absent or occurs an even number of times
         */
        boolean hasLegalQuotedStringOfChar(Character quoteType) {
            return nonDelimStats.getOrDefault(quoteType, 0) % 2 == 0;
        }

        /**
         * Records one occurrence of a tracked character.
         *
         * <p>A delimiter directly preceded by an unescaped backslash is not counted; it only sets
         * {@link #escapes}. A backslash preceded by an unescaped backslash is itself escaped and
         * therefore does not escape the character that follows it.
         *
         * @param c     the character seen
         * @param delim {@code true} if the character is a potential delimiter
         */
        void increment(Character c, boolean delim) {
            boolean escaped = escapePending;
            escapePending = !escaped && c == '\\';
            if (delim) {
                if (escaped) {
                    escapes = true;
                } else {
                    int count = delimStats.merge(c, 1, Integer::sum);
                    if (count == 1) {
                        firstDelim = c;
                    }
                }
            } else {
                nonDelimStats.merge(c, 1, Integer::sum);
            }
            lastChar = c;
        }

        /**
         * @return the delimiter counts of this line ordered from most to least frequent
         */
        public Map<Character, Integer> calcDelimCountsOrdered() {
            return sortMapByValue(delimStats);
        }
    }
}