package water.api;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import water.DKV;
import water.Key;
import water.api.schemas3.ParseSetupV3;
import water.exceptions.H2OIllegalArgumentException;
import water.parser.ParseDataset;
import water.parser.ParseSetup;
import water.util.DistributedException;
import water.util.PojoUtils;
import static water.parser.DefaultParserProviders.GUESS_INFO;
/** A class holding parser-setup flags: kind of parser, field separator, column
* header labels, whether or not to allow single-quotes to quote, number of
* columns discovered.
*/
public class ParseSetupHandler extends Handler {
public ParseSetupV3 guessSetup(int version, ParseSetupV3 p) {
if (p.source_frames == null)
throw new H2OIllegalArgumentException("No file names given for parsing.");
Key[] fkeys = new Key[p.source_frames.length];
for(int i=0; i < p.source_frames.length; i++) {
fkeys[i] = p.source_frames[i].key();
if (DKV.get(fkeys[i]) == null) throw new IllegalArgumentException("Key not loaded: "+ p.source_frames[i]);
}
// corrects for json putting in empty strings in the place of empty sub-arrays
if (p.na_strings != null)
for(int i = 0; i < p.na_strings.length; i++)
if (p.na_strings[i] != null && p.na_strings[i].length == 0) p.na_strings[i] = null;
ParseSetup ps;
try{
ps = ParseSetup.guessSetup(fkeys, new ParseSetup(p));
} catch(Throwable ex) {
Throwable ex2 = ex;
if(ex instanceof DistributedException)
ex2 = ex.getCause();
if(ex2 instanceof ParseDataset.H2OParseException)
throw new H2OIllegalArgumentException(ex2.getMessage());
throw ex;
}
if(ps._errs != null && ps._errs.length > 0) {
p.warnings = new String[ps._errs.length];
for (int i = 0; i < ps._errs.length; ++i)
p.warnings[i] = ps._errs[i].toString();
}
// TODO: ParseSetup throws away the srcs list. . .
if ((null == p.column_name_filter || "".equals(p.column_name_filter)) && (0 == p.column_offset) && (0 == p.column_count)) {
// return the entire data preview
PojoUtils.copyProperties(p, ps, PojoUtils.FieldNaming.ORIGIN_HAS_UNDERSCORES, new String[]{"destination_key", "source_keys", "column_types", "parse_type"});
p.total_filtered_column_count = p.number_columns;
} else {
// have to manually copy the desired parts of p.data to apply either column_name_filter or column pagination or both
PojoUtils.copyProperties(p, ps, PojoUtils.FieldNaming.ORIGIN_HAS_UNDERSCORES, new String[]{"destination_key", "source_keys", "column_types", "data", "parse_type"});
String[] all_col_names = ps.getColumnNames();
String[][] data = ps.getData();
ArrayList<Integer> keep_indexes = new ArrayList<>();
if (null != p.column_name_filter && ! "".equals(p.column_name_filter)) {
// filter and then paginate columns
Pattern pattern = Pattern.compile(p.column_name_filter);
Matcher m = pattern.matcher("dummy");
for (int column = 0; column < all_col_names.length; column++) {
m.reset(all_col_names[column]);
if (m.matches()) keep_indexes.add(column);
}
} else {
// paginate all columns
// note: we do a little extra work below by treating this like the filter case, but the code is simpler
for (int column = 0; column < all_col_names.length; column++) {
keep_indexes.add(column);
}
}
int width_to_return = Math.max(0, keep_indexes.size() - p.column_offset);
if (p.column_count > 0) width_to_return = Math.min(width_to_return, p.column_count);
String[][] filtered_data = new String[data.length][width_to_return];
for (int row = 0; row < data.length; row++) {
int output_column = 0;
for (int input_column_index = p.column_offset; input_column_index < p.column_offset + width_to_return; input_column_index++) {
// indirect through keep_indexes
filtered_data[row][output_column++] = data[row][keep_indexes.get(input_column_index)];
}
}
p.data = filtered_data;
p.total_filtered_column_count = keep_indexes.size();
}
p.destination_frame = ParseSetup.createHexName(p.source_frames[0].toString());
if( p.check_header==ParseSetup.HAS_HEADER && p.data != null && Arrays.equals(p.column_names, p.data[0])) p.data = Arrays.copyOfRange(p.data,1,p.data.length);
// Fill in data type names for each column.
p.column_types = ps.getColumnTypeStrings();
p.parse_type = ps.getParseType() != null ? ps.getParseType().name() : GUESS_INFO.name();
return p;
}
}