package water.parser; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.io.orc.OrcFile; import org.apache.hadoop.hive.ql.io.orc.Reader; import org.apache.hadoop.hive.ql.io.orc.RecordReader; import org.apache.hadoop.hive.ql.io.orc.StripeInformation; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.joda.time.DateTime; import org.junit.Ignore; import java.io.File; import java.io.IOException; import java.util.List; import java.util.Set; import water.fvec.Frame; import water.fvec.Vec; import water.util.Log; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static water.parser.orc.OrcUtil.isSupportedSchema; /** * ORC testing support methods. * * Note: Separate ORC-specific logic from tests. * This is necessary to avoid classloading of ORC-classes during loading ORC tests. */ @Ignore("Support for ORC tests, but no actual tests here") public class OrcTestUtils { static private double EPSILON = 1e-9; static private long ERRORMARGIN = 1000L; // error margin when compare timestamp. static final int DAY_TO_MS = 24*3600*1000; static final int ADD_OFFSET = 8*3600*1000; static final int HOUR_OFFSET = 3600000; // in ms to offset for leap seconds, years static int compareOrcAndH2OFrame(String fileName, File f, Set<String> failedFiles) throws IOException { Frame h2oFrame = null; try { Configuration conf = new Configuration(); Path p = new Path(f.toString()); Reader orcFileReader = OrcFile.createReader(p, OrcFile.readerOptions(conf)); h2oFrame = water.TestUtil.parse_test_file(f.toString()); return compareH2OFrame(fileName, failedFiles, h2oFrame, orcFileReader); } finally { if (h2oFrame != null) h2oFrame.delete(); } } /** * This method will take one H2O frame generated by the Orc parser and the fileName of the Orc file * and attempt to compare the content of the Orc file to the H2O frame. In particular, the following * are compared: * - column names; * - number of columns and rows; * - content of each row. * * If all comparison pass, the test will pass. Otherwise, the test will fail. * * @param h2oFrame * @param orcReader */ static int compareH2OFrame(String fileName, Set<String> failedFiles, Frame h2oFrame, Reader orcReader) { // grab column names, column and row numbers StructObjectInspector insp = (StructObjectInspector) orcReader.getObjectInspector(); List<StructField> allColInfo = (List<StructField>) insp.getAllStructFieldRefs(); // get info of all cols // compare number of columns and rows int allColNumber = allColInfo.size(); // get and check column number boolean[] toInclude = new boolean[allColNumber+1]; int colNumber = 0 ; int index1 = 0; for (StructField oneField:allColInfo) { String colType = oneField.getFieldObjectInspector().getTypeName(); if (colType.toLowerCase().contains("decimal")) colType = "decimal"; if (isSupportedSchema(colType)) { toInclude[index1 + 1] = true; colNumber++; } index1++; } assertEquals("Number of columns need to be the same: ", colNumber, h2oFrame.numCols()); // compare column names String[] colNames = new String[colNumber]; String[] colTypes = new String[colNumber]; int colIndex = 0; for (int index = 0; index < allColNumber; index++) { // get and check column names String typeName = allColInfo.get(index).getFieldObjectInspector().getTypeName(); if (typeName.toLowerCase().contains("decimal")) typeName = "decimal"; if (isSupportedSchema(typeName)) { colNames[colIndex] = allColInfo.get(index).getFieldName(); colTypes[colIndex] = typeName; colIndex++; } } assertArrayEquals("Column names need to be the same: ", colNames, h2oFrame._names); // compare one column at a time of the whole row? int failed = compareFrameContents(fileName, failedFiles, h2oFrame, orcReader, colTypes, colNames, toInclude); Long totalRowNumber = orcReader.getNumberOfRows(); // get and check row number assertEquals("Number of rows need to be the same: ", totalRowNumber, (Long) h2oFrame.numRows()); return failed; } static int compareFrameContents(String fileName, Set<String> failedFiles, Frame h2oFrame, Reader orcReader, String[] colTypes, String[] colNames, boolean[] toInclude) { List<StripeInformation> stripesInfo = orcReader.getStripes(); // get all stripe info int wrongTests = 0; if (stripesInfo.size() == 0) { // Orc file contains no data assertEquals("Orc file is empty. H2O frame row number should be zero: ", 0, h2oFrame.numRows()); } else { Long startRowIndex = 0L; // row index into H2O frame for (StripeInformation oneStripe : stripesInfo) { try { RecordReader perStripe = orcReader.rows(oneStripe.getOffset(), oneStripe.getDataLength(), toInclude, null, colNames); VectorizedRowBatch batch = perStripe.nextBatch(null); // read orc file stripes in vectorizedRowBatch boolean done = false; Long rowCounts = 0L; Long rowNumber = oneStripe.getNumberOfRows(); // row number of current stripe while (!done) { long currentBatchRow = batch.count(); // row number of current batch ColumnVector[] dataVectors = batch.cols; int colIndex = 0; for (int cIdx = 0; cIdx < batch.numCols; cIdx++) { // read one column at a time; if (toInclude[cIdx+1]) { compare1Cloumn(dataVectors[cIdx], colTypes[colIndex].toLowerCase(), colIndex, currentBatchRow, h2oFrame.vec(colNames[colIndex]), startRowIndex); colIndex++; } } rowCounts = rowCounts + currentBatchRow; // record number of rows of data actually read startRowIndex = startRowIndex + currentBatchRow; if (rowCounts >= rowNumber) // read all rows of the stripe already. done = true; if (!done) // not done yet, get next batch batch = perStripe.nextBatch(batch); } perStripe.close(); } catch (Throwable e) { failedFiles.add(fileName); e.printStackTrace(); wrongTests += 1; } } } return wrongTests; } static void compare1Cloumn(ColumnVector oneColumn, String columnType, int cIdx, long currentBatchRow, Vec h2oColumn, Long startRowIndex) { // if (columnType.contains("bigint")) // cannot handle big integer right now // return; if (columnType.contains("binary")) // binary retrieval problem. Tomas return; switch (columnType) { case "boolean": case "bigint": // FIXME: not working right now case "int": case "smallint": case "tinyint": CompareLongcolumn(oneColumn, oneColumn.isNull, currentBatchRow, h2oColumn, startRowIndex); break; case "float": case "double": compareDoublecolumn(oneColumn, oneColumn.isNull, currentBatchRow, h2oColumn, startRowIndex); break; case "string": //FIXME: not working right now case "varchar": case "char": case "binary": //FIXME: only reading it as string right now. compareStringcolumn(oneColumn, oneColumn.isNull, currentBatchRow, h2oColumn, startRowIndex, columnType); break; case "timestamp": case "date": compareTimecolumn(oneColumn, columnType, oneColumn.isNull, currentBatchRow, h2oColumn, startRowIndex); break; case "decimal": compareDecimalcolumn(oneColumn, oneColumn.isNull, currentBatchRow, h2oColumn, startRowIndex); break; default: Log.warn("String, bigint are not tested. H2O frame is built for them but cannot be verified."); } } static void compareDecimalcolumn(ColumnVector oneDecimalColumn, boolean[] isNull, long currentBatchRow, Vec h2oFrame, Long startRowIndex) { HiveDecimalWritable[] oneColumn= ((DecimalColumnVector) oneDecimalColumn).vector; long frameRowIndex = startRowIndex; for (int rowIndex = 0; rowIndex < currentBatchRow; rowIndex++) { if (isNull[rowIndex]) assertEquals("Na is found: ", true, h2oFrame.isNA(frameRowIndex)); else assertEquals("Decimal elements should equal: ", Double.parseDouble(oneColumn[rowIndex].toString()), h2oFrame.at(frameRowIndex), EPSILON); frameRowIndex++; } } static void compareTimecolumn(ColumnVector oneTSColumn, String columnType, boolean[] isNull, long currentBatchRow, Vec h2oFrame, Long startRowIndex) { long[] oneColumn = ((LongColumnVector) oneTSColumn).vector; long frameRowIndex = startRowIndex; for (int rowIndex = 0; rowIndex < currentBatchRow; rowIndex++) { if (isNull[rowIndex]) assertEquals("Na is found: ", true, h2oFrame.isNA(frameRowIndex)); else { if (columnType.contains("timestamp")) assertEquals("Numerical elements should equal: ", oneColumn[rowIndex]/1000000, h2oFrame.at8(frameRowIndex), ERRORMARGIN); else assertEquals("Numerical elements should equal: ", correctTimeStamp(oneColumn[rowIndex]), h2oFrame.at8(frameRowIndex), ERRORMARGIN); } frameRowIndex++; } } static void compareStringcolumn(ColumnVector oneStringColumn, boolean[] isNull, long currentBatchRow, Vec h2oFrame, Long startRowIndex, String columnType) { byte[][] oneColumn = ((BytesColumnVector) oneStringColumn).vector; int[] stringLength = ((BytesColumnVector) oneStringColumn).length; int[] stringStart = ((BytesColumnVector) oneStringColumn).start; long frameRowIndex = startRowIndex; BufferedString tempH2o = new BufferedString(); BufferedString tempOrc = new BufferedString(); for (int rowIndex = 0; rowIndex < currentBatchRow; rowIndex++) { if (isNull[rowIndex]) assertEquals("Na is found: ", true, h2oFrame.isNA(frameRowIndex)); else { if (!oneStringColumn.isRepeating || rowIndex == 0) tempOrc.set(oneColumn[rowIndex], stringStart[rowIndex], stringLength[rowIndex]); h2oFrame.atStr(tempH2o, frameRowIndex); assertEquals("isRepeating = " + oneStringColumn.isRepeating + " String/char elements should equal: ", true, tempOrc.equals(tempH2o)); } frameRowIndex++; } } static void compareDoublecolumn(ColumnVector oneDoubleColumn, boolean[] isNull, long currentBatchRow, Vec h2oFrame, Long startRowIndex) { double[] oneColumn= ((DoubleColumnVector) oneDoubleColumn).vector; long frameRowIndex = startRowIndex; for (int rowIndex = 0; rowIndex < currentBatchRow; rowIndex++) { if (isNull[rowIndex]) assertEquals("Na is found: ", true, h2oFrame.isNA(frameRowIndex)); else assertEquals("Numerical elements should equal: ", oneColumn[rowIndex], h2oFrame.at(frameRowIndex), EPSILON); frameRowIndex++; } } static void CompareLongcolumn(ColumnVector oneLongColumn, boolean[] isNull, long currentBatchRow, Vec h2oFrame, Long startRowIndex) { long[] oneColumn= ((LongColumnVector) oneLongColumn).vector; long frameRowIndex = startRowIndex; for (int rowIndex = 0; rowIndex < currentBatchRow; rowIndex++) { if (isNull[rowIndex]) assertEquals("Na is found: ", true, h2oFrame.isNA(frameRowIndex)); else { if (h2oFrame.isNA(frameRowIndex)) continue; else assertEquals("Numerical elements should equal: ", oneColumn[rowIndex], h2oFrame.at8(frameRowIndex)); } frameRowIndex++; } } static long correctTimeStamp(long daysSinceEpoch) { long timestamp = (daysSinceEpoch*DAY_TO_MS+ADD_OFFSET); DateTime date = new DateTime(timestamp); int hour = date.hourOfDay().get(); if (hour == 0) return timestamp; else return (timestamp-hour*HOUR_OFFSET); } }