package hex.genmodel.tools; import hex.genmodel.GenMunger; import hex.genmodel.easy.RowData; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; /** * Simple driver program for reading a CSV file and munging it. * * This driver program is used as a test harness by several tests in the testdir_javamunge directory. * <p></p> * See the top-of-tree master version of this file <a href="https://github.com/h2oai/h2o-3/blob/master/h2o-genmodel/src/main/java/hex/genmodel/tools/MungeCsv.java" target="_blank">here on github</a>. */ public class MungeCsv { private static String assemblyClassName; private static String inputCSVFileName; private static String outputCSVFileName; private static int haveHeaders = -1; private static void usage() { System.out.println(""); System.out.println("usage: java [...java args...] hex.genmodel.tools.MungeCsv --header --model modelClassName --input inputCSVFileName --output outputCSVFileName"); System.out.println(""); System.out.println(" assembly class name is something like AssemblyPojo_bleehbleehbleeh."); System.out.println(""); System.out.println(" inputCSVFileName is the test data set."); System.out.println(" Specifying --header is required for h2o-3."); System.out.println(""); System.out.println(" outputCSVFileName is the munged data set (one row per data set row)."); System.out.println(""); System.exit(1); } private static void parseArgs(String[] args) { for (int i = 0; i < args.length; i++) { String s = args[i]; switch( s ) { case "--munger": i++; if (i >= args.length) usage(); assemblyClassName = args[i]; break; case "--input": i++; if (i >= args.length) usage(); inputCSVFileName = args[i]; break; case "--output": i++; if (i >= args.length) usage(); outputCSVFileName = args[i]; break; case "--header": haveHeaders = 1; break; default: // skip System.out.println("bad param... skipping."); } } if (haveHeaders != 1) { System.out.println("ERROR: header not specified"); usage(); } if (assemblyClassName == null) { System.out.println("ERROR: model not specified"); usage(); } if (inputCSVFileName == null) { System.out.println("ERROR: input not specified"); usage(); } if (outputCSVFileName == null) { System.out.println("ERROR: output not specified"); usage(); } } /** * This CSV parser is as bare bones as it gets. * Our test data doesn't have funny quoting, spacing, or other issues. * Can't handle cases where the number of data columns is less than the number of header columns. */ private static RowData parseDataRow(String line, GenMunger munger) { if( line.isEmpty() || line.equals("") ) return null; String[] inputData = line.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)|(,)", -1); for(int i=0;i<inputData.length;++i) inputData[i]=inputData[i]==null?"":inputData[i]; if( inputData.length != munger.inNames().length ) return null; return munger.fillDefault(inputData); } /** * CSV reader and predictor test program. * * @param args Command-line args. * @throws Exception */ public static void main(String[] args) throws Exception { parseArgs(args); GenMunger rawMunger; rawMunger = (hex.genmodel.GenMunger) Class.forName(assemblyClassName).newInstance(); BufferedReader input = new BufferedReader(new FileReader(inputCSVFileName)); BufferedWriter output = new BufferedWriter(new FileWriter(outputCSVFileName)); // Emit outputCSV column names. String[] rawHeader = rawMunger.outNames(); StringBuilder header = new StringBuilder(); for(int i=0;i<rawHeader.length;++i) { header.append("\"").append(rawHeader[i]).append("\""); if( i < rawHeader.length - 1 ) header.append(","); } output.write(header.toString()); output.write("\n"); // Loop over inputCSV one row at a time. int lineNum = 0; String line; try { while ((line = input.readLine()) != null) { lineNum++; // skip the header. if (lineNum == 1) continue; // Parse the CSV line. Somewhat handles quoted commas. But this ain't no parser test! RowData row; try { row = parseDataRow(line, rawMunger); } catch( NumberFormatException nfe) { nfe.printStackTrace(); System.out.println("Failed to parse row: " + lineNum ); throw new RuntimeException(); } RowData mungedRow = rawMunger.fit(row); for(int i=0; i<rawMunger.outNames().length;++i) { Object val = mungedRow==null?Double.NaN:mungedRow.get(rawMunger.outNames()[i]); if( val instanceof Double ) output.write(String.valueOf(val)); else output.write("\"" + val + "\""); if( i < rawMunger.outNames().length - 1) output.write(","); } output.write("\n"); } } catch (Exception e) { System.out.println("Caught exception on line " + lineNum); System.out.println(""); e.printStackTrace(); System.exit(1); } finally { // Clean up. output.close(); input.close(); } // Predictions were successfully generated. Calling program can now compare them with something. System.exit(0); } }