/* * Apache License * Version 2.0, January 2004 * http://www.apache.org/licenses/ * * Copyright 2013 Aurelian Tutuianu * Copyright 2014 Aurelian Tutuianu * Copyright 2015 Aurelian Tutuianu * Copyright 2016 Aurelian Tutuianu * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package rapaio.io; import rapaio.data.*; import rapaio.util.func.SPredicate; import java.io.*; import java.text.DecimalFormat; import java.util.*; import java.util.function.Predicate; import java.util.zip.GZIPInputStream; import static java.util.stream.Collectors.toSet; /** * Comma separated file reader and writer utility. * * @author <a href="mailto:padreati@yahoo.com">Aurelian Tutuianu</a> */ public class Csv { private boolean trimSpaces = true; private boolean header = true; private boolean quotes = false; private char separatorChar = ','; private char escapeChar = '\"'; private HashMap<String, VarType> typeFieldHints = new HashMap<>(); private HashSet<String> naValues = new HashSet<>(); private VarType[] defaultTypes = new VarType[]{VarType.BINARY, VarType.INDEX, VarType.NUMERIC, VarType.NOMINAL}; private int startRow = 0; private int endRow = Integer.MAX_VALUE; private Predicate<Integer> skipRows = row -> false; private Predicate<Integer> skipCols = row -> false; private Frame template; public Csv() { naValues.add("?"); } public Csv withHeader(boolean hasHeader) { this.header = hasHeader; return this; } public Csv withSeparatorChar(char separator) { this.separatorChar = separator; return this; } public Csv withQuotes(boolean quotes) { this.quotes = quotes; return this; } public Csv withEscapeChar(char escapeChar) { this.escapeChar = escapeChar; return this; } public Csv withTrimSpaces(boolean trimSpaces) { this.trimSpaces = trimSpaces; return this; } public Csv withStartRow(int startRow) { this.startRow = startRow; return this; } public Csv withEndRow(int endRow) { this.endRow = endRow; return this; } public Csv withRows(int... rows) { final Set<Integer> skip = Arrays.stream(rows).boxed().collect(toSet()); skipRows = row -> !skip.contains(row); return this; } public Csv withRows(Predicate<Integer> p) { skipRows = p.negate(); return this; } public Csv withSkipRows(int... rows) { final Set<Integer> skip = Arrays.stream(rows).boxed().collect(toSet()); skipRows = skip::contains; return this; } public Csv withSkipRows(Predicate<Integer> p) { skipRows = p; return this; } public Csv withCols(int... cols) { final Set<Integer> skip = Arrays.stream(cols).boxed().collect(toSet()); skipCols = row -> !skip.contains(row); return this; } public Csv withCols(Predicate<Integer> p) { skipCols = p.negate(); return this; } public Csv withSkipCols(int... cols) { Set<Integer> skip = Arrays.stream(cols).boxed().collect(toSet()); skipCols = skip::contains; return this; } public Csv withSkipCols(SPredicate<Integer> p) { skipCols = p; return this; } public Csv withTypes(VarType varType, String... fields) { Arrays.stream(fields).forEach(field -> typeFieldHints.put(field, varType)); return this; } public Csv withDefaultTypes(VarType... defaultTypes) { this.defaultTypes = defaultTypes; return this; } public Csv withNAValues(String... values) { this.naValues = new HashSet<>(); Collections.addAll(naValues, values); return this; } public Csv withTemplate(Frame template) { this.template = template; return this; } public Frame read(File file) { try { return read(new FileInputStream(file)); } catch (IOException e) { throw new RuntimeException("error at reading file: " + file.getAbsolutePath(), e); } } public Frame readGz(File file) { try { return read(new GZIPInputStream(new FileInputStream(file))); } catch (IOException e) { throw new RuntimeException("error at reading file", e); } } public Frame read(String fileName) { try { return read(new FileInputStream(fileName)); } catch (IOException e) { throw new RuntimeException("error at reading file", e); } } public Frame read(Class<?> clazz, String resource) throws IOException { InputStream is = clazz.getResourceAsStream(resource); if (is == null) { throw new IOException("resource: " + resource + " not found in the path of given class: " + clazz.getCanonicalName()); } return read(is); } public Frame read(InputStream inputStream) throws IOException { int rows = 0; int allRowsNum = 0; List<String> names = new ArrayList<>(); List<VarSlot> varSlots = new ArrayList<>(); try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) { if (header) { String line = reader.readLine(); if (line == null) { return null; } names = parseLine(line); } while (skipRows.test(allRowsNum)) { reader.readLine(); allRowsNum += 1; } boolean first = true; while (true) { String line = reader.readLine(); if (line == null) { break; } allRowsNum += 1; if (skipRows.test(allRowsNum - 1)) { continue; } // build vectors with initial types if (first) { List<String> row = parseLine(line); first = false; for (int i = names.size(); i < row.size(); i++) { names.add("V" + (i + 1)); } for (String colName : names) { if (template != null) { String[] vn = template.varNames(); boolean found = false; for (String name : vn) { if (name.equals(colName)) { found = true; break; } } if (found) { varSlots.add(new VarSlot(this, template.var(colName), 0)); continue; } } if (typeFieldHints.containsKey(colName)) { varSlots.add(new VarSlot(this, typeFieldHints.get(colName), 0)); } else { // default type varSlots.add(new VarSlot(this, 0)); } } } if (rows < startRow) { rows++; continue; } if (rows == endRow) break; List<String> row = parseLine(line); rows++; int len = Math.max(row.size(), names.size()); for (int i = 0; i < len; i++) { // we have a value in row for which we did not defined a var slot if (i >= varSlots.size()) { names.add("V" + (i + 1)); varSlots.add(new VarSlot(this, varSlots.get(0).var.rowCount())); continue; } // we have missing values at the end of the row if (i >= row.size()) { varSlots.get(i).addValue("?"); continue; } // gaussian behavior varSlots.get(i).addValue(row.get(i)); } } } List<Var> variables = new ArrayList<>(); for (int i = 0; i < varSlots.size(); i++) { String name = names.size() > i ? names.get(i) : "V" + (i + 1); variables.add(varSlots.get(i).var().withName(name)); } return SolidFrame.byVars(rows - startRow, variables); } List<String> parseLine(String line) { List<String> data = new ArrayList<>(); int start = 0; int colNum = 0; int end; while (start < line.length()) { end = start; boolean inQuotas = false; while (end < line.length()) { char ch = line.charAt(end++); if (!inQuotas && ch == '"') { inQuotas = true; continue; } if (inQuotas && ch == escapeChar) { if (end < line.length() && line.charAt(end) == '\"') { end++; continue; } } if (inQuotas && ch == '"') { if (escapeChar == '\"') { if (end < line.length() && line.charAt(end) == '\"') { end++; continue; } } inQuotas = false; continue; } if (!inQuotas && (ch == separatorChar)) { end--; break; } } if (!skipCols.test(colNum)) { data.add(clean(line.substring(start, end))); } start = end + 1; colNum += 1; } return data; } /** * Clean the string token. - remove trailing and leading spaces, before and * after removing quotes - remove leading and trailing quotes - remove * escape quota character * * @param tok if (trimSpaces) { * @return string cleaned */ private String clean(String tok) { if (trimSpaces) { tok = tok.trim(); } if (quotes && !tok.isEmpty()) { if (tok.charAt(0) == '\"') { tok = tok.substring(1); } if (tok.charAt(tok.length() - 1) == '\"') { tok = tok.substring(0, tok.length() - 1); } } if (quotes) { char[] line = new char[tok.length()]; int len = 0; for (int i = 0; i < tok.length(); i++) { if (len < tok.length() - 1 && tok.charAt(i) == escapeChar && tok.charAt(i + 1) == '\"') { line[len++] = '\"'; i++; continue; } line[len++] = tok.charAt(i); } tok = String.valueOf(line, 0, len); } if (trimSpaces) { tok = tok.trim(); } return tok; } public void write(Frame df, File file) throws IOException { try (OutputStream os = new FileOutputStream(file)) { write(df, os); } } public void write(Frame df, String fileName) { try { try (OutputStream os = new FileOutputStream(fileName)) { write(df, os); } } catch (IOException e) { throw new RuntimeException("error at writing file", e); } } public void write(Frame df, OutputStream os) throws IOException { try (PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(os)))) { if (header) { for (int i = 0; i < df.varNames().length; i++) { if (i != 0) { writer.append(separatorChar); } writer.append(df.varNames()[i]); } writer.append("\n"); } DecimalFormat format = new DecimalFormat("0.###############################"); for (int i = 0; i < df.rowCount(); i++) { for (int j = 0; j < df.varCount(); j++) { if (j != 0) { writer.append(separatorChar); } if (df.var(j).missing(i)) { writer.append("?"); continue; } if (df.var(j).type().isNominal() || df.var(j).type().equals(VarType.TEXT)) { writer.append(unclean(df.label(i, j))); } else { writer.append(format.format(df.value(i, j))); } } writer.append("\n"); } writer.flush(); } } private String unclean(String label) { char[] line = new char[label.length() * 2]; int len = 0; for (int i = 0; i < label.length(); i++) { if (label.charAt(i) == '\"') { line[len++] = escapeChar; } line[len++] = label.charAt(i); } label = String.valueOf(line, 0, len); if (quotes) { label = "\"" + label + "\""; } return label; } static class VarSlot { private final Csv parent; private final VarType type; private Var var; private Text text; /** * Constructor for slot which does not have a predefined type, it tries the best by using default types */ public VarSlot(Csv parent, int rows) { this.parent = parent; this.type = null; this.var = parent.defaultTypes[0].newInstance(rows); this.text = Text.empty(); } public VarSlot(Csv parent, VarType varType, int rows) { this.parent = parent; this.type = varType; this.var = varType.newInstance(rows); this.text = null; } public VarSlot(Csv parent, Var template, int rows) { this.parent = parent; this.type = template.type(); this.var = template.newInstance(rows); this.text = null; } public void addValue(String value) { if (parent.naValues.contains(value)) { value = "?"; } if (type == null) { // for default values while (true) { // try first to add value to the current default type try { var.addLabel(value); if (text != null) { text.addLabel(value); } return; } catch (Throwable th) { // if it's the last default type, than nothing else could be done if (var.type() == parent.defaultTypes[parent.defaultTypes.length - 1]) { throw new IllegalArgumentException( String.format("Could not parse value %s in type %s. Error: %s", value, var.type(), th.getMessage())); } } // have to find an upgrade // find current default type position int pos = 0; for (int i = 0; i < parent.defaultTypes.length; i++) { if (!parent.defaultTypes[i].equals(var.type())) continue; pos = i + 1; break; } // try successive default type upgrades, if the last available fails also than throw an exception for (int i = pos; i < parent.defaultTypes.length; i++) { try { var = parent.defaultTypes[i].newInstance(); if (text != null && text.rowCount() > 0) text.stream().forEach(s -> var.addLabel(s.label())); if (i == parent.defaultTypes.length - 1) text = null; break; } catch (Exception th) { if (i == parent.defaultTypes.length - 1) { throw new IllegalArgumentException( String.format("Could not parse value %s in type %s. Error: %s", value, var.type(), th.getMessage())); } } } } } else { // for non-default values try { var.addLabel(value); } catch (Throwable th) { throw new IllegalArgumentException( String.format("Could not parse value %s in type %s for variable with name: %s. Error: %s", value, var.type(), var.name(), th.getMessage())); } } } public Var var() { return var; } } }