/* --------------------------------------------------------------------- * Numenta Platform for Intelligent Computing (NuPIC) * Copyright (C) 2014, Numenta, Inc. Unless you have an agreement * with Numenta, Inc., for a separate license for this software code, the * following terms and conditions apply: * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero Public License version 3 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU Affero Public License for more details. * * You should have received a copy of the GNU Affero Public License * along with this program. If not, see http://www.gnu.org/licenses. * * http://numenta.org/licenses/ * --------------------------------------------------------------------- */ package org.numenta.nupic.network.sensor; import gnu.trove.map.TObjectIntMap; import gnu.trove.map.hash.TObjectIntHashMap; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import org.numenta.nupic.FieldMetaType; import org.numenta.nupic.datagen.ResourceLocator; import org.numenta.nupic.encoders.MultiEncoder; /** * Simple abstraction to hold the contents of a list of csv records indicated by * a specified path. There are methods to retrieve the header and the * body separately. The header size is assumed to be 3. * * Additionally, there is a specialized iterator which returns the * {@link MultiEncoder}'s required Map of input field names to objects. * Instead of hashing a new entry for every row of the input file or stream * we simply access this class' internal array by overriding {@link Map#get(Object)} * and {@link Iterator#next()} and accessing this class' internal data structures - * thus adhering to the Map and Iterator interface without the overhead of mappings * and hashing. * * @author David Ray * */ public class CSVSource implements MetaSource { private List<String[]> header; private List<String[]> body; private List<List<String[]>> file; private TObjectIntMap<String> fieldIndexMap = new TObjectIntHashMap<>(); private FieldMetaType[] fieldTypes; private String datePattern; private DateTimeFormatter format; public static final int HEADER_SIZE = 3; public static final int HEADER_IDX = 0; public static final int BODY_IDX = 1; /** * Constructs a new CSVFile object using the specified file name. * The file indicated is expected to be on the application's classpath. * * @param fileName */ public CSVSource(String fileName) { this(fileName, null); } /** * Constructs a new CSVFile object using the specified file name. * The file indicated is expected to be on the application's classpath. * Additionally, this constructor prepares a {@link DateTimeFormatter} * using the specified format pattern, which must be set if the encoding * is to encode a DateTime field. * * * @param fileName * @param datePattern */ public CSVSource(String fileName, String datePattern) { String s = ResourceLocator.path(fileName); this.file = createSource(new File(s)); this.header = file.get(HEADER_IDX); this.body = file.get(BODY_IDX); this.datePattern = datePattern; this.format = DateTimeFormat.forPattern(this.datePattern); // Convenience mapping for fast field lookup for(int i = 0;i < this.header.get(0).length;i++) { fieldIndexMap.put(this.header.get(0)[i], i); } fieldTypes = new FieldMetaType[this.header.get(0).length]; for(int i = 0;i < fieldTypes.length;i++) { fieldTypes[i] = FieldMetaType.fromString(this.header.get(1)[i]); } } /** * Returns a List of string array lists of size 2. * The zero'th index being the header (of size 3), * and the first index of size = FILE_SIZE. * * @param f * @return */ public List<List<String[]>> createSource(File f) { List<String[]> body = new ArrayList<>(); List<String[]> header = new ArrayList<>(); List<List<String[]>> file = new ArrayList<>(); BufferedReader br = null; try { br = new BufferedReader(new FileReader(f)); String line = null; int headerIdx = 0; while((line = br.readLine()) != null) { if(headerIdx++ < HEADER_SIZE) { header.add(line.split("[\\s]*\\,[\\s]*")); }else{ body.add(line.split("[\\s]*\\,[\\s]*")); } } } catch(Exception e) { e.printStackTrace(); } finally{ try { br.close(); }catch(Exception ignore){} } file.add(header); file.add(body); return file; } /** * Returns the List of string arrays comprising the header * @return */ public List<String[]> getHeader() { return header; } /** * Returns the List of string arrays comprising the body * @return */ public List<String[]> getBody() { return body; } /** * Returns specialized iterator which avoids resetting map entries, thus * supporting very fast iteration and resource savings without the need to * rehash every single row. */ @Override public Iterator<Map<String, Object>> multiIterator() { return new Iterator<Map<String, Object>>() { int idx = -1; int size = body.size(); @SuppressWarnings("serial") Map<String, Object> map = new HashMap<String, Object>() { /** * Overridden to access this class' internal array instead of having * to rehash map entries on every record access and store. */ @Override public Object get(Object name) { int typeIndex = fieldIndexMap.get(name); // Return Date Time type if(fieldTypes[typeIndex] == FieldMetaType.DATETIME) { if(format == null) { throw new IllegalStateException( "DateField requires pattern configuration on construction."); } return format.parseDateTime(body.get(idx)[typeIndex]); }else if(fieldTypes[typeIndex] == FieldMetaType.FLOAT || fieldTypes[typeIndex] == FieldMetaType.INTEGER) { // Return any numeric type return Double.parseDouble(body.get(idx)[fieldIndexMap.get(name)]); } // Return String type (i.e. category) return body.get(idx)[fieldIndexMap.get(name)]; } }; @Override public boolean hasNext() { return idx < size - 1; } @Override public Map<String, Object> next() { idx++; return map; } @Override public void remove() { throw new UnsupportedOperationException("Remove not supported"); } }; } }