package water.parser;

import java.util.ArrayList;

import water.Key;
import water.fvec.Vec;

import static water.parser.DefaultParserProviders.ARFF_INFO;

/**
 * Parser for ARFF input: a header of {@code @ATTRIBUTE} declarations, followed by a data
 * section that is tokenized with the CSV machinery inherited from {@link CsvParser}.
 */
class ARFFParser extends CsvParser {
  private static final String TAG_ATTRIBUTE = "@ATTRIBUTE";
  private static final byte GUESS_SEP = ParseSetup.GUESS_SEP;

  ARFFParser(ParseSetup ps, Key jobKey) {
    super(ps, jobKey);
  }

  /**
   * Try to parse the bytes as ARFF format.
   *
   * <p>Reads the header (every line up to the {@code @DATA} tag), derives column labels,
   * types and categorical domains from the {@code @ATTRIBUTE} lines, and, if any bytes follow
   * the header, tokenizes up to 10 data lines as a preview.
   *
   * @param bits         raw bytes from the beginning of the input
   * @param sep          field separator, or {@code ParseSetup.GUESS_SEP} to detect it from the data lines
   * @param singleQuotes passed through to the tokenizer and to the resulting setup
   * @param columnNames  must be {@code null}; column names always come from the ARFF header
   * @param naStrings    passed through to the resulting setup
   * @return the guessed parse setup
   * @throws UnsupportedOperationException if {@code columnNames} is not {@code null}, or a
   *                                       RELATIONAL attribute is declared
   * @throws ParseDataset.H2OParseException if the header is empty or malformed, if data follows
   *                                        the header but contains no usable line, or if the
   *                                        separator cannot be detected from a single data line
   */
  static ParseSetup guessSetup(byte[] bits, byte sep, boolean singleQuotes, String[] columnNames, String[][] naStrings) {
    if (columnNames != null)
      throw new UnsupportedOperationException("ARFFParser doesn't accept columnNames.");

    // header section: all lines until EOF or @DATA
    ArrayList<String> header = new ArrayList<>();
    int offset = readArffHeader(0, header, bits, singleQuotes);
    // more than just the header
    boolean haveData = offset < bits.length && !CsvParser.isEOL(bits[offset]);
    if (header.size() == 0)
      throw new ParseDataset.H2OParseException("No data!");
    String[] headerlines = header.toArray(new String[0]);

    // process header: one column per retained header line
    int ncols = headerlines.length;
    String[] labels = new String[ncols];
    String[][] domains = new String[ncols][];
    byte[] ctypes = new byte[ncols];
    processArffHeader(ncols, headerlines, labels, domains, ctypes);

    // data section (for preview)
    String[][] data = new String[0][];
    if (haveData) {
      ArrayList<String> datablock = new ArrayList<>();
      while (offset < bits.length) {
        int lineStart = offset;
        while (offset < bits.length && !CsvParser.isEOL(bits[offset])) ++offset;
        int lineEnd = offset;
        ++offset;
        // For Windows line endings, skip a trailing LF after CR
        if ((offset < bits.length) && (bits[offset] == CsvParser.CHAR_LF)) ++offset;
        if (bits[lineStart] == '#') continue; // Ignore comment lines
        if (bits[lineStart] == '%') continue; // Ignore ARFF comment lines
        if (lineEnd > lineStart) {
          String str = new String(bits, lineStart, lineEnd - lineStart).trim();
          if (!str.isEmpty()) datablock.add(str);
        }
      }
      if (datablock.size() == 0)
        throw new ParseDataset.H2OParseException("Unexpected line.");
      String[] datalines = datablock.toArray(new String[0]);

      // process data section: preview at most 10 lines
      int nlines2 = Math.min(10, datalines.length);
      data = new String[nlines2][];

      // First guess the field separator by counting occurrences in first few lines
      if (nlines2 == 1) {
        if (sep == GUESS_SEP) {
          if (datalines[0].split(",").length > 2) sep = (byte) ',';
          else if (datalines[0].split(" ").length > 2) sep = ' ';
          else throw new ParseDataset.H2OParseException("Failed to detect separator.");
        }
        data[0] = determineTokens(datalines[0], sep, singleQuotes);
        ncols = (ncols > 0) ? ncols : data[0].length;
        // NOTE(review): the labels parsed from the header are discarded when only a single
        // data line is available for preview; kept as-is, confirm this is intended.
        labels = null;
      } else { // 2 or more lines
        if (sep == GUESS_SEP) {
          // first guess the separator from pairs of the first three lines
          sep = guessSeparator(datalines[0], datalines[1], singleQuotes);
          if (sep == GUESS_SEP && nlines2 > 2) {
            sep = guessSeparator(datalines[1], datalines[2], singleQuotes);
            if (sep == GUESS_SEP) sep = guessSeparator(datalines[0], datalines[2], singleQuotes);
          }
          if (sep == GUESS_SEP) sep = (byte) ' '; // Bail out, go for space
        }
        for (int i = 0; i < nlines2; ++i) {
          data[i] = determineTokens(datalines[i], sep, singleQuotes);
        }
      }
    }

    // Return the final setup
    return new ParseSetup(ARFF_INFO, sep, singleQuotes, ParseSetup.NO_HEADER, ncols, labels, ctypes, domains, naStrings, data);
  }

  /**
   * Collects the ARFF header lines found in {@code bits}, starting at {@code offset}.
   *
   * <p>Empty lines, lines starting with {@code #} or {@code %}, and the {@code @RELATION} line
   * are skipped. Scanning stops after the line that starts with {@code @DATA}
   * (case-insensitive) or at the end of the buffer.
   *
   * @param offset       position in {@code bits} to start reading from
   * @param header       output list receiving the trimmed header lines
   * @param bits         raw input bytes
   * @param singleQuotes passed through to the tokenizer
   * @return the offset of the first byte after the {@code @DATA} line, or a value
   *         {@code >= bits.length} if the buffer ended first
   */
  private static int readArffHeader(int offset, ArrayList<String> header, byte[] bits, boolean singleQuotes) {
    while (offset < bits.length) {
      int lineStart = offset;
      while (offset < bits.length && !CsvParser.isEOL(bits[offset])) ++offset;
      int lineEnd = offset;
      ++offset;
      // For Windows line endings, skip a trailing LF after CR
      if ((offset < bits.length) && (bits[offset] == CsvParser.CHAR_LF)) ++offset;
      if (bits[lineStart] == '#') continue; // Ignore comment lines
      if (bits[lineStart] == '%') continue; // Ignore ARFF comment lines
      if (lineEnd > lineStart) {
        if (isDataTag(bits, lineStart, lineEnd)) break; // end of header
        String str = new String(bits, lineStart, lineEnd - lineStart).trim();
        String[] tok = determineTokens(str, CHAR_SPACE, singleQuotes);
        if (tok.length > 0 && tok[0].equalsIgnoreCase("@RELATION")) continue; // Ignore name of dataset
        if (!str.isEmpty()) header.add(str);
      }
    }
    return offset;
  }

  /**
   * Tells whether the line {@code [lineStart, lineEnd)} starts with {@code @DATA}, ignoring case.
   * Lines shorter than the tag never match, so no byte outside the line (or the buffer) is read.
   */
  private static boolean isDataTag(byte[] bits, int lineStart, int lineEnd) {
    if (lineEnd - lineStart < 5) return false;
    return bits[lineStart] == '@'
        && (bits[lineStart + 1] == 'D' || bits[lineStart + 1] == 'd')
        && (bits[lineStart + 2] == 'A' || bits[lineStart + 2] == 'a')
        && (bits[lineStart + 3] == 'T' || bits[lineStart + 3] == 't')
        && (bits[lineStart + 4] == 'A' || bits[lineStart + 4] == 'a');
  }

  /**
   * Parses {@code @ATTRIBUTE <name> <type>} header lines into labels, column types and domains.
   *
   * <p>Recognized types (case-insensitive): NUMERIC/REAL/INTEGER/INT, DATE/TIME, ENUM, STRING,
   * UUID, and a nominal specification of the form <code>{A,B,C}</code>. A label wrapped in
   * single quotes has the quotes removed.
   *
   * @param ncols       number of header lines to process
   * @param headerlines header lines, each expected to start with {@code @ATTRIBUTE}
   * @param labels      output: attribute name per column
   * @param domains     output: categorical levels per column, {@code null} for non-nominal types
   * @param ctypes      output: {@code Vec.T_*} type code per column
   * @throws ParseDataset.H2OParseException if a line is not a well-formed attribute declaration
   *                                        or its type is not recognized
   * @throws UnsupportedOperationException  if a RELATIONAL attribute is declared
   */
  static void processArffHeader(int ncols, String[] headerlines, String[] labels, String[][] domains, byte[] ctypes) {
    for (int i = 0; i < ncols; ++i) {
      String[] line = headerlines[i].split("\\s+", 2);
      if (!line[0].equalsIgnoreCase(TAG_ATTRIBUTE)) {
        throw new ParseDataset.H2OParseException("Expected line to start with @ATTRIBUTE.");
      }
      // normalize separators; one-for-one replacement keeps indices aligned with line[1]
      final String spec = (line.length == 2) ? line[1].replaceAll("\\s", " ") : "";
      int sepIdx = spec.lastIndexOf(' ');
      if (sepIdx < 0) {
        throw new ParseDataset.H2OParseException("Expected @ATTRIBUTE to be followed by <attribute-name> <datatype>");
      }
      final String type = spec.substring(sepIdx + 1).trim();
      domains[i] = null;
      ctypes[i] = Vec.T_BAD;
      if (type.equalsIgnoreCase("NUMERIC") || type.equalsIgnoreCase("REAL")
          || type.equalsIgnoreCase("INTEGER") || type.equalsIgnoreCase("INT")) {
        ctypes[i] = Vec.T_NUM;
      } else if (type.equalsIgnoreCase("DATE") || type.equalsIgnoreCase("TIME")) {
        ctypes[i] = Vec.T_TIME;
      } else if (type.equalsIgnoreCase("ENUM")) {
        ctypes[i] = Vec.T_CAT;
      } else if (type.equalsIgnoreCase("STRING")) {
        ctypes[i] = Vec.T_STR;
      } else if (type.equalsIgnoreCase("UUID")) { // extension of ARFF
        ctypes[i] = Vec.T_UUID;
      } else if (type.equalsIgnoreCase("RELATIONAL")) {
        throw new UnsupportedOperationException("Relational ARFF format is not supported.");
      } else if (type.endsWith("}")) {
        int domainSpecStart = spec.lastIndexOf('{');
        if (domainSpecStart < 0)
          throw new ParseDataset.H2OParseException("Invalid type specification.");
        if (domainSpecStart == 0) // nothing precedes the domain spec, i.e. the attribute name is missing
          throw new ParseDataset.H2OParseException("Expected @ATTRIBUTE to be followed by <attribute-name> <datatype>");
        // The label ends where the domain spec begins. Whitespace between them (if any) is
        // stripped below, so a '{' directly following the name does not cut the label short.
        sepIdx = domainSpecStart;
        String domainSpec = spec.substring(domainSpecStart + 1, spec.length() - 1);
        domains[i] = domainSpec.split(",");
        for (int j = 0; j < domains[i].length; j++)
          domains[i][j] = domains[i][j].trim();
        // split() yields an empty array for specs made only of commas, hence the length guard
        if (domains[i].length > 0 && domains[i][0].length() > 0)
          ctypes[i] = Vec.T_CAT; // case of {A,B,C} (valid list of factors)
      }
      if (ctypes[i] == Vec.T_BAD)
        throw new ParseDataset.H2OParseException("Unexpected line, type not recognized. Attribute specification: " + type);

      // remove the whitespaces separating the label and the type specification
      while ((sepIdx > 0) && (spec.charAt(sepIdx - 1) == ' ')) sepIdx--;
      String label = line[1].substring(0, sepIdx); // use the raw string before whitespace normalization
      // remove quotes
      if (label.length() >= 2 && label.startsWith("'") && label.endsWith("'"))
        label = label.substring(1, label.length() - 1);
      labels[i] = label;
    }
  }
}