/** * Global Sensor Networks (GSN) Source Code * Copyright (c) 2006-2016, Ecole Polytechnique Federale de Lausanne (EPFL) * * This file is part of GSN. * * GSN is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * GSN is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with GSN. If not, see <http://www.gnu.org/licenses/>. * * File: src/ch/epfl/gsn/wrappers/general/CSVHandler.java * * @author Ali Salehi * @author Mehdi Riahi * @author Sofiane Sarni * */ package ch.epfl.gsn.wrappers.general; import java.io.File; import java.io.IOException; import java.io.Reader; import java.io.Serializable; import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import java.util.TreeMap; import org.apache.commons.io.FileUtils; import org.apache.commons.lang.StringUtils; import org.slf4j.LoggerFactory; import org.slf4j.Logger; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import au.com.bytecode.opencsv.CSVReader; import ch.epfl.gsn.beans.DataField; import ch.epfl.gsn.utils.CaseInsensitiveComparator; /** * possible formats for the timestamp fields are available @ http://joda-time.sourceforge.net/api-release/org/joda/time/format/DateTimeFormat.html * Possible timezone : http://joda-time.sourceforge.net/timezones.html */ public class CSVHandler { public static final String LOCAL_TIMEZONE_ID = DateTimeZone.getDefault().getID(); private static Logger logger = LoggerFactory.getLogger(CSVHandler.class); private static final String TIMESTAMP = "timed"; public static DateTime parseTimeStamp(String format, String value) throws IllegalArgumentException { DateTimeFormatter fmt = DateTimeFormat.forPattern(format); return fmt.parseDateTime(value); } private char stringSeparator, separator; private String dataFile; private DateTimeZone timeZone; private int skipFirstXLines; private String[] fields, formats, nulls; private String checkPointFile; public boolean initialize(String dataFile, String inFields, String inFormats, char separator, char stringSeparator, int skipFirstXLines, String nullValues) { return initialize(dataFile, inFields, inFormats, separator, stringSeparator, skipFirstXLines, nullValues, LOCAL_TIMEZONE_ID, "check-poin/" + (new File(dataFile).getName() + ".chk-point")); } public boolean initialize(String dataFile, String inFields, String inFormats, char separator, char stringSeparator, int skipFirstXLines, String nullValues, String timeZone, String checkpointFile) { this.stringSeparator = stringSeparator; // default to , this.skipFirstXLines = skipFirstXLines;// default to 0 this.dataFile = dataFile; // check if it exist. this.separator = separator; this.timeZone = DateTimeZone.forID(timeZone); this.checkPointFile = checkpointFile; File file = new File(dataFile); if (!file.isFile()) { logger.error("The specified CSV data file: " + dataFile + " doesn't exists."); return false; } try { setupCheckPointFileIfNeeded(); this.fields = generateFieldIdx(inFields, true); this.formats = generateFieldIdx(inFormats, false); this.nulls = generateFieldIdx(nullValues, true); //////////////////////// // TODO: Check that the lengths are the same //////////////////////// } catch (IOException e) { logger.error(e.getMessage(), e); return false; } if (!validateFormats(this.formats)) return false; if (fields.length != formats.length) { logger.error("loading the csv-wrapper failed as the length of fields(" + fields.length + ") doesn't match the length of formats(" + formats.length + ")"); return false; } return true; } public void setupCheckPointFileIfNeeded() throws IOException { String chkPointDir = new File(new File(getCheckPointFile()).getParent()).getAbsolutePath(); new File(chkPointDir).mkdirs(); new File(getCheckPointFile()).createNewFile(); } public static boolean validateFormats(String[] formats) { for (int i = 0; i < formats.length; i++) { if (formats[i].equalsIgnoreCase("numeric") || formats[i].equalsIgnoreCase("string") || formats[i].equalsIgnoreCase("bigint")) continue; else if (isTimeStampFormat(formats[i])) { try { String tmp = DateTimeFormat.forPattern(getTimeStampFormat(formats[i])).print(System.currentTimeMillis()); } catch (IllegalArgumentException e) { logger.error("Validating the time-format(" + formats[i] + ") used by the CSV-wrapper failed. "); return false; } } else { logger.error("The format (" + formats[i] + ") used by the CSV-Wrapper doesn't exist."); return false; } } return true; } /** * Removes the space from the fields. * Split the rawFields using comma as the separator. * * @param rawFields * @param toLowerCase, if false, the case is preserved. if true, the actual outputs will be in lower-case. * @return * @throws IOException */ public static String[] generateFieldIdx(String rawFields, boolean toLowerCase) throws IOException { String[] toReturn = new CSVReader(new StringReader(rawFields)).readNext(); if (toReturn == null) return new String[0]; for (int i = 0; i < toReturn.length; i++) { toReturn[i] = toReturn[i].trim(); if (toLowerCase) toReturn[i] = toReturn[i].toLowerCase(); } return toReturn; } public ArrayList<TreeMap<String, Serializable>> work(Reader dataFile, String checkpointDir) throws IOException { ArrayList<TreeMap<String, Serializable>> items = null; setupCheckPointFileIfNeeded(); String val = FileUtils.readFileToString(new File(checkPointFile), "UTF-8"); long lastItem = 0; if (val != null && val.trim().length() > 0) lastItem = Long.parseLong(val.trim()); items = parseValues(dataFile, lastItem); return items; } public void updateCheckPointFile(long timestamp) throws IOException { FileUtils.writeStringToFile(new File(checkPointFile), Long.toString(timestamp), "UTF-8"); } private boolean loggedNoChange = false; // to avoid duplicate logging messages when there is no change public ArrayList<TreeMap<String, Serializable>> parseValues(Reader datainput, long previousCheckPoint) throws IOException { ArrayList<TreeMap<String, Serializable>> toReturn = new ArrayList<TreeMap<String, Serializable>>(); CSVReader reader = new CSVReader(datainput, getSeparator(), getStringSeparator(), getSkipFirstXLines()); String[] values = null; long currentLine = 0; while ((values = reader.readNext()) != null) { TreeMap<String, Serializable> se = convertTo(formats, fields, getNulls(), values, getSeparator()); if (isEmpty(se)) continue; if (se.containsKey(TIMESTAMP)) { //System.out.println("times "+se.get(TIMESTAMP)+"--"+previousCheckPoint); if (((Long) se.get(TIMESTAMP)) <= previousCheckPoint) continue; } else {// assuming useCounterForCheckPoint = true if (logger.isDebugEnabled()) { String symbol = (currentLine < previousCheckPoint) ? " < " : " >= "; logger.debug("currentLine=" + currentLine + symbol + "checkpoint=" + previousCheckPoint); } if (currentLine < previousCheckPoint) {// skipping already read lines, based on line count logger.debug("skipping"); currentLine++; continue; } } toReturn.add(se); currentLine++; loggedNoChange = false; if (toReturn.size() > 250) break; // Move outside the loop as in each call we only read 250 values; } if (logger.isDebugEnabled() && toReturn.size() == 0 && loggedNoChange == false) { logger.debug("There is no new item after most recent checkpoint(previousCheckPoint:" + new DateTime(previousCheckPoint) + ")."); loggedNoChange = true; } reader.close(); return toReturn; } private boolean isEmpty(Map<String, Serializable> se) { for (Object o : se.values()) if (o != null) return false; return true; } public TreeMap<String, Serializable> convertTo(String[] formats, String[] fields, String nullValues[], String[] values, char separator) { TreeMap<String, Serializable> streamElement = new TreeMap<String, Serializable>(new CaseInsensitiveComparator()); for (String field : fields) streamElement.put(field, null); HashMap<String, String> timeStampFormats = new HashMap<String, String>(); for (int i = 0; i < Math.min(fields.length, values.length); i++) { if (isNull(nullValues, values[i])) { continue; } else if (formats[i].equalsIgnoreCase("numeric")) { try { streamElement.put(fields[i], Double.parseDouble(values[i])); } catch (java.lang.NumberFormatException e) { logger.error("Parsing to Numeric failed: Value to parse=" + values[i]+ " in"+getDataFile()); throw e; } } else if (formats[i].equalsIgnoreCase("string")) { streamElement.put(fields[i], values[i]); } else if (formats[i].equalsIgnoreCase("bigint")) { try { streamElement.put(fields[i], Long.parseLong(values[i])); } catch (java.lang.NumberFormatException e) { logger.error("Parsing to BigInt failed: Value to parse=" + values[i]+ " in"+getDataFile()); throw e; } } else if (isTimeStampFormat(formats[i])) { String value = ""; String format = ""; if (streamElement.get(fields[i]) != null) { value = (String) streamElement.get(fields[i]); format = timeStampFormats.get(fields[i]); value += separator; format += separator; } if (isTimeStampLeftPaddedFormat(formats[i])) values[i] = StringUtils.leftPad(values[i], getTimeStampFormat(formats[i]).length(), '0'); value += values[i]; format += getTimeStampFormat(formats[i]); streamElement.put(fields[i], value); timeStampFormats.put(fields[i], format); } } for (String timeField : timeStampFormats.keySet()) { String timeFormat = timeStampFormats.get(timeField); String timeValue = (String) streamElement.get(timeField); try { DateTime x = DateTimeFormat.forPattern(timeFormat).withZone(getTimeZone()).parseDateTime(timeValue); streamElement.put(timeField, x.getMillis()); } catch (IllegalArgumentException e) { logger.error("Parsing error: TimeFormat=" + timeFormat + " , TimeValue=" + timeValue+ " in"+getDataFile()); logger.error(e.getMessage(), e); throw e; } } return streamElement; } public static String getTimeStampFormat(String input) { if (input.indexOf("timestampl(") >= 0) return input.substring("timestampl(".length(), input.indexOf(")")).trim(); else return input.substring("timestamp(".length(), input.indexOf(")")).trim(); } public static boolean isTimeStampFormat(String input) { return (input.toLowerCase().startsWith("timestamp(") || input.toLowerCase().startsWith("timestampl(")) && input.endsWith(")"); } public static boolean isTimeStampLeftPaddedFormat(String input) { return input.toLowerCase().startsWith("timestampl(") && input.endsWith(")"); } public char getSeparator() { return separator; } public char getStringSeparator() { return stringSeparator; } public int getSkipFirstXLines() { return skipFirstXLines; } public static boolean isNull(String[] possibleNullValues, String value) { if (value == null || value.length() == 0) return true; for (int i = 0; i < possibleNullValues.length; i++) if (possibleNullValues[i].equalsIgnoreCase(value.trim())) return true; return false; } public String[] getFields() { return fields; } public DataField[] getDataFields() { HashMap<String, String> fields = new HashMap<String, String>(); for (int i = 0; i < getFields().length; i++) { String field = getFields()[i]; String type = getFormats()[i]; if (isTimeStampFormat(type)) { //GSN doesn't support timestamp data type, all timestamp values are supposed to be bigint. fields.put(field, "bigint"); } else if (type.equalsIgnoreCase("numeric")) fields.put(field, "numeric"); else if (type.equalsIgnoreCase("bigint")) fields.put(field, "bigint"); else fields.put(field, "string"); } DataField[] toReturn = new DataField[fields.size()]; int i = 0; for (String key : fields.keySet()) toReturn[i++] = new DataField(key, fields.get(key)); return toReturn; } public String[] getFormats() { return formats; } public String getDataFile() { return dataFile; } public String[] getNulls() { return nulls; } public void setSkipFirstXLines(int skipFirstXLines) { this.skipFirstXLines = skipFirstXLines; } public DateTimeZone getTimeZone() { return timeZone; } public String getCheckPointFile() { return checkPointFile; } }