/* This file is part of VoltDB. * Copyright (C) 2008-2017 VoltDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with VoltDB. If not, see <http://www.gnu.org/licenses/>. */ package org.voltdb.utils; import java.io.IOException; import java.util.EnumMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicLong; import org.supercsv.exception.SuperCsvException; import org.supercsv.io.ICsvListReader; import org.voltcore.logging.VoltLogger; import org.voltdb.VoltType; import org.voltdb.client.Client; import org.voltdb.common.Constants; import com.google_voltpatches.common.collect.BiMap; import com.google_voltpatches.common.collect.HashBiMap; /** * * This is a single thread reader which feeds the lines after validating syntax * to CSVDataLoader. * */ class CSVFileReader implements Runnable { private static final String COLUMN_COUNT_ERROR = "Incorrect number of columns. %d found, %d expected. Please check the table schema " + "and the line content"; private static final String HEADER_COUNT_ERROR = "Incorrect number of columns. %d found, %d expected. Please check the csv file header " + "and the line content"; private static final String BLANK_ERROR = "A blank value is detected in column %d while \"--blank error\" is used. " + "To proceed, either fill in the blank column or use \"--blank {null|empty}\"."; private static final String WHITESPACE_ERROR = "Whitespace detected in column %d while --nowhitespace is used. " + "To proceed, either remove the whitespaces from the column or remove --nowhitespace."; static AtomicLong m_totalRowCount = new AtomicLong(0); static AtomicLong m_totalLineCount = new AtomicLong(0); static CSVLoader.CSVConfig m_config = null; static Client m_csvClient = null; static ICsvListReader m_listReader = null; long m_parsingTime = 0; private static final Map<VoltType, String> m_blankStrings = new EnumMap<VoltType, String>(VoltType.class); private static final VoltLogger m_log = new VoltLogger("CSVLOADER"); private final CSVDataLoader m_loader; private final BulkLoaderErrorHandler m_errHandler; private final VoltType[] m_columnTypes; private final int m_columnCount; private int headerlen; private Integer[] order; static { m_blankStrings.put(VoltType.TINYINT, "0"); m_blankStrings.put(VoltType.SMALLINT, "0"); m_blankStrings.put(VoltType.INTEGER, "0"); m_blankStrings.put(VoltType.BIGINT, "0"); m_blankStrings.put(VoltType.FLOAT, "0.0"); m_blankStrings.put(VoltType.TIMESTAMP, null); m_blankStrings.put(VoltType.STRING, ""); m_blankStrings.put(VoltType.DECIMAL, "0.0"); m_blankStrings.put(VoltType.VARBINARY, ""); } public static void initializeReader(CSVLoader.CSVConfig config, Client csvClient, ICsvListReader reader) { m_config = config; m_csvClient = csvClient; m_listReader = reader; } public CSVFileReader(CSVDataLoader loader, BulkLoaderErrorHandler errorHandler) { m_loader = loader; m_errHandler = errorHandler; m_columnTypes = m_loader.getColumnTypes(); m_columnCount = m_columnTypes.length; } @Override public void run() { List<String> lineList; //if header option is true, check whether csv first line is valid if (m_config.header) { if (!checkHeader()) { m_log.error("In the CSV file " + m_config.file + ", the header "+ m_listReader.getUntokenizedRow() +" does not match " + "an existing column in the table " + m_config.table + "."); System.exit(-1); } } while ((m_config.limitrows-- > 0)) { if (m_errHandler.hasReachedErrorLimit()) { break; } try { //Initial setting of m_totalLineCount if (m_listReader.getLineNumber() == 0) { m_totalLineCount.set(m_config.skip); } else { m_totalLineCount.set(m_listReader.getLineNumber()); } long st = System.nanoTime(); lineList = m_listReader.read(); long end = System.nanoTime(); m_parsingTime += (end - st); if (lineList == null) { if (m_totalLineCount.get() > m_listReader.getLineNumber()) { m_totalLineCount.set(m_listReader.getLineNumber()); } break; } m_totalRowCount.incrementAndGet(); if (lineList.isEmpty()) { continue; } String[] lineValues = lineList.toArray(new String[0]); String lineCheckResult; String[] reorderValues = new String[m_columnCount]; if ((lineCheckResult = checkparams_trimspace_reorder(lineValues, reorderValues)) != null) { final RowWithMetaData metaData = new RowWithMetaData(m_listReader.getUntokenizedRow(), m_totalLineCount.get() + 1); if (m_errHandler.handleError(metaData, null, lineCheckResult)) { break; } continue; } RowWithMetaData lineData = new RowWithMetaData(m_listReader.getUntokenizedRow(), m_listReader.getLineNumber()); m_loader.insertRow(lineData, reorderValues); } catch (SuperCsvException e) { //Catch rows that can not be read by superCSV m_listReader. // e.g. items without quotes when strictquotes is enabled. final RowWithMetaData metaData = new RowWithMetaData(m_listReader.getUntokenizedRow(), m_totalLineCount.get() + 1); if (m_errHandler.handleError(metaData, null, e.getMessage())) { break; } } catch (IOException ex) { m_log.error("Failed to read CSV line from file: " + ex); break; } catch (InterruptedException e) { m_log.error("CSVLoader interrupted: " + e); break; } } //Now wait for processors to see endOfData and count down. After that drain to finish all callbacks try { m_log.debug("Waiting for CSVDataLoader to finish."); m_loader.close(); m_log.debug("CSVDataLoader Done."); } catch (Exception ex) { m_log.warn("Stopped processing because of connection error. " + "A report will be generated with what we processed so far. Error: " + ex); } } private boolean checkHeader() { try { String[] firstline = m_listReader.getHeader(false); Set<String> firstset = new HashSet<String>(); BiMap<Integer, String> colNames = HashBiMap.create(m_loader.getColumnNames()); headerlen = firstline.length; // remove duplicate. for (String name : firstline) { if (name != null) { firstset.add(name.toUpperCase()); } else { return false; } } // whether column num matches. if (headerlen < m_columnCount) { return false; } else { // whether column name has according table column. int matchColCount = 0; for (String name : firstset) { if (colNames.containsValue(name.trim())) { matchColCount++; } } if (matchColCount != m_columnCount) { return false; } } // get the mapping from file column num to table column num. order = new Integer[headerlen]; for (int fileCol = 0; fileCol < headerlen; fileCol++) { String name = firstline[fileCol]; Integer tableCol = colNames.inverse().get(name.trim().toUpperCase()); order[fileCol] = tableCol; } } catch (IOException ex) { m_log.error("Failed to read CSV line from file: " + ex); } return true; } private String checkparams_trimspace_reorder(String[] lineValues, String[] reorderValues) { if (lineValues.length != m_columnCount && !m_config.header) { return String.format(COLUMN_COUNT_ERROR, lineValues.length, m_columnCount); } if (lineValues.length != headerlen && m_config.header) { return String.format(HEADER_COUNT_ERROR, lineValues.length, headerlen); } for (int fileCol = 0; fileCol<lineValues.length; fileCol++) { int i = fileCol; if (m_config.header) { if (order[fileCol] != null) { i = order[fileCol]; } else { continue; } } reorderValues[i] = lineValues[fileCol]; //supercsv read "" to null if (reorderValues[i] == null) { if (m_config.blank.equalsIgnoreCase("error")) { return String.format(BLANK_ERROR, i + 1); } else if (m_config.blank.equalsIgnoreCase("empty")) { reorderValues[i] = m_blankStrings.get(m_columnTypes[i]); } //else m_config.blank == null which is already the case } // trim white space in this correctedLine. SuperCSV preserves all the whitespace by default else { if (m_config.nowhitespace && (reorderValues[i].charAt(0) == ' ' || reorderValues[i].charAt(reorderValues[i].length() - 1) == ' ')) { return String.format(WHITESPACE_ERROR, i + 1); } else { reorderValues[i] = reorderValues[i].trim(); } if(!m_config.customNullString.isEmpty()){ if(lineValues[i].equals(m_config.customNullString)){ reorderValues[i] = null; } } // treat NULL, \N and "\N" as actual null value else if (reorderValues[i].equals("NULL") || reorderValues[i].equals(Constants.CSV_NULL) || reorderValues[i].equals(Constants.QUOTED_CSV_NULL)) { reorderValues[i] = null; } } } return null; } }