package com.thinkbiganalytics.discovery.parsers.csv;

/*-
 * #%L
 * thinkbig-schema-discovery-default
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.thinkbiganalytics.discovery.schema.Field;
import com.thinkbiganalytics.discovery.schema.HiveTableSchema;
import com.thinkbiganalytics.discovery.schema.Schema;
import com.thinkbiganalytics.discovery.util.TableSchemaType;

import org.junit.Test;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.List;
import java.util.stream.IntStream;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

/**
 * Tests for {@link CSVFileSchemaParser}: delimiter auto-detection, quoting,
 * header handling, and character encodings.
 */
public class CSVFileSchemaParserTest {

    private CSVFileSchemaParser parser = new CSVFileSchemaParser();

    private InputStream toInputStream(String text) {
        return new ByteArrayInputStream(text.getBytes());
    }

    @Test
    public void testEmbeddedCommas() throws Exception {
        validateSchema1("col1,col2,col3\n\"Edoceo, Inc.\",Seattle,WA\nfoo,bar,fee");
    }

    @Test
    public void testEmbeddedCommasNoAutodetect() throws Exception {
        parser.setAutoDetect(false);
        validateSchema1("col1,col2,col3\n\"Edoceo, Inc.\",Seattle,WA\nfoo,bar,fee");
    }

    @Test
    public void testDefaultCSVParse() throws Exception {
        validateSchema1("col1,col2,col3\nr1v1,r1v2,r1v3\nr2v1,r2v2,r2v3\n");
    }

    @Test
    public void testSemiColonCSVWithEscapeParse() throws Exception {
        parser.setSeparatorChar(";");
        validateSchema1("col1;col2;col3\nr1v1;r1v2;r1v3\nr2v1;r2v2;r2v3\n");
    }

    @Test
    public void testTabTSVWithEscapeParse() throws Exception {
        parser.setSeparatorChar("\t");
        validateSchema1("col1\tcol2\tcol3\nr1v1\tr1v2\tr1v3\nr2v1\tr2v2\tr2v3\n");
    }

    @Test
    public void testNoHeader() throws Exception {
        parser.setHeaderRow(false);
        try (InputStream is = toInputStream("r1v1,r1v2,r1v3\nr2v1,r2v2,r2v3\n")) {
            HiveTableSchema schema = toHiveTableSchema(is);
            List<? extends Field> fields = schema.getFields();
            assertEquals(3, fields.size());
            IntStream.range(0, fields.size()).forEach(idx -> {
                assertEquals("Col_" + (idx + 1), fields.get(idx).getName());
                // Note: only loads one sample value when the first row is data
                assertEquals(1, fields.get(idx).getSampleValues().size());
            });
        }
    }

    @Test
    public void testNoHeaderFirstRowDuplicateValues() throws Exception {
        parser.setHeaderRow(false);
        firstRowDuplicateValues();
    }

    @Test(expected = IllegalArgumentException.class)
    public void testHeaderFirstRowDuplicateValues() throws Exception {
        parser.setHeaderRow(true);
        firstRowDuplicateValues();
    }

    private void firstRowDuplicateValues() throws Exception {
        try (InputStream is = toInputStream("r1v1,r1v1,r1v3\nr2v1,r2v2,r2v3\n")) {
            HiveTableSchema schema = toHiveTableSchema(is);
            List<? extends Field> fields = schema.getFields();
            assertEquals(3, fields.size());
            IntStream.range(0, fields.size()).forEach(idx -> {
                assertEquals("Col_" + (idx + 1), fields.get(idx).getName());
                assertEquals(1, fields.get(idx).getSampleValues().size());
            });
        }
    }
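
    // When the first row is a header (headerRow=true), duplicate column names
    // are rejected with an IllegalArgumentException; with headerRow=false the
    // duplicate values are treated as plain data and synthetic Col_N names are
    // generated instead, as the two tests above verify.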

    @Test
    public void testNoHeaderFirstRowNoDuplicateValues() throws Exception {
        parser.setHeaderRow(false);
        try (InputStream is = toInputStream("HEAD_1,HEAD_2,HEAD_3\nr2v1,r2v2,r2v3\n")) {
            HiveTableSchema schema = toHiveTableSchema(is);
            List<? extends Field> fields = schema.getFields();
            assertEquals(3, fields.size());
            IntStream.range(0, fields.size()).forEach(idx -> {
                assertEquals("Col_" + (idx + 1), fields.get(idx).getName());
                assertEquals(1, fields.get(idx).getSampleValues().size());
            });
        }
    }

    @Test
    public void testHeaderFirstRowNoDuplicateValues() throws Exception {
        parser.setHeaderRow(true);
        try (InputStream is = toInputStream("HEAD_1,HEAD_2,HEAD_3\nr2v1,r2v2,r2v3\n")) {
            HiveTableSchema schema = toHiveTableSchema(is);
            List<? extends Field> fields = schema.getFields();
            assertEquals(3, fields.size());
            IntStream.range(0, fields.size()).forEach(idx -> {
                assertEquals("HEAD_" + (idx + 1), fields.get(idx).getName());
                assertEquals(1, fields.get(idx).getSampleValues().size());
            });
        }
    }

    @Test
    public void testSparse() throws Exception {
        // Test extra columns: the second data row has four values instead of three
        parser.setSeparatorChar("\t");
        try {
            validateSchema1("col1\tcol2\tcol3\nr1v1\tr1v2\tr1v3\tr1v4\nr2v1\tr2v2\tr2v3\n");
            fail("Expecting unrecognized format");
        } catch (IOException e) {
            checkInvalidFormatException(e);
        }
    }

    @Test
    public void testCSVUnixFile() throws Exception {
        parser.setAutoDetect(true);
        validateSchema2("MOCK_DATA.csv_unix.txt");
        assertTrue("Expecting csv delim", ",".equals(parser.getSeparatorChar()));
    }

    @Test
    public void testCSVWinFile() throws Exception {
        parser.setAutoDetect(true);
        validateSchema2("MOCK_DATA.csv_win.txt");
    }

    @Test
    public void testTABUnixFile() throws Exception {
        parser.setAutoDetect(true);
        validateSchema2("MOCK_DATA.tab_unix.txt");
        assertTrue("Expecting tab delim", "\t".equals(parser.getSeparatorChar()));
    }

    @Test
    public void testExcel() throws Exception {
        parser.setAutoDetect(true);
        validateSchema2("MOCK_DATA.csv_excel.txt");
    }

    @Test
    public void testCustom() throws Exception {
        parser.setAutoDetect(false);
        parser.setSeparatorChar("*");
        validateSchema2("MOCK_DATA.custom.txt");
    }

    @Test
    public void testPipeDelim() throws Exception {
        parser.setAutoDetect(true);
        validateSchema2("MOCK_DATA.pipe.txt");
        assertTrue("Expecting pipe delim", "|".equals(parser.getSeparatorChar()));
    }

    @Test
    public void testSingleQuotedDelim() throws Exception {
        parser.setAutoDetect(true);
        validateSchema2("MOCK_DATA_csv_singlequote.txt");
        assertTrue("Expecting comma delim", ",".equals(parser.getSeparatorChar()));
        assertTrue("Expecting single quote char", "'".equals(parser.getQuoteChar()));
    }

    @Test
    public void testSingleQuoted() throws Exception {
        // Test a single-quoted string with an embedded double quote (")
        parser.setAutoDetect(true);
        validateSchema1("col1,col2,col3\n'\"Edoceo, Inc.',Seattle,WA\nfoo,bar,fee");
        assertTrue("Expecting comma delim", ",".equals(parser.getSeparatorChar()));
        assertTrue("Expecting single quote char", "'".equals(parser.getQuoteChar()));
    }

    @Test
    public void testEmptyStream() throws Exception {
        parser.setAutoDetect(true);
        try {
            validateSchema2("missingfile.txt");
            fail("Expecting error for missing file or empty stream");
        } catch (NullPointerException e) {
            // ok: getResourceAsStream returns null for a missing resource
        }
    }
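
    // The MOCK_DATA.* tests above read fixture files from the test classpath;
    // every variant is expected to carry the same nine columns and nine sample
    // rows, which validateSchema2 below asserts field by field.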

    @Test
    public void testNoDelimFound() throws Exception {
        // A file with no recognizable delimiter should be rejected as an
        // unrecognized format rather than parsed with defaults
        parser.setAutoDetect(true);
        try (InputStream is = CSVFileSchemaParserTest.class.getClassLoader().getResourceAsStream("junk.txt")) {
            try {
                toHiveTableSchema(is);
                fail("Expecting unrecognized format");
            } catch (IOException e) {
                checkInvalidFormatException(e);
            }
        }
    }

    @Test
    public void testUTF16() throws Exception {
        try (InputStream is = CSVFileSchemaParserTest.class.getClassLoader().getResourceAsStream("MOCK_DATA_utf16_encoded.txt")) {
            parser.setAutoDetect(false);
            parser.setSeparatorChar(",");
            parser.setQuoteChar("\t");
            HiveTableSchema schema = toHiveTableSchema(is, Charset.forName("UTF-16LE"));
            assertEquals("UTF-16LE", schema.getCharset());
            List<? extends Field> fields = schema.getFields();
            assertEquals(4, fields.size());
            IntStream.range(0, fields.size()).forEach(idx -> {
                assertEquals("col" + (idx + 1), fields.get(idx).getName());
                assertEquals(4, fields.get(idx).getSampleValues().size());
            });
        }
    }

    private void checkInvalidFormatException(IOException e) {
        assertTrue("Expecting unrecognized format exception",
                   e.getLocalizedMessage().contains("Unrecognized format"));
    }

    private HiveTableSchema toHiveTableSchema(InputStream is) throws IOException {
        return toHiveTableSchema(is, Charset.defaultCharset());
    }

    private HiveTableSchema toHiveTableSchema(InputStream is, Charset cs) throws IOException {
        Schema schema = parser.parse(is, cs, TableSchemaType.HIVE);
        assertNotNull(schema);
        assertTrue(schema instanceof HiveTableSchema);
        return (HiveTableSchema) schema;
    }

    /**
     * Parses the given text and asserts the three-column, two-sample-row schema
     * shared by the inline test strings.
     */
    private HiveTableSchema validateSchema1(String text) throws IOException {
        try (InputStream is = toInputStream(text)) {
            HiveTableSchema schema = toHiveTableSchema(is);
            List<? extends Field> fields = schema.getFields();
            assertEquals(3, fields.size());
            IntStream.range(0, fields.size()).forEach(idx -> {
                assertEquals("col" + (idx + 1), fields.get(idx).getName());
                assertEquals(2, fields.get(idx).getSampleValues().size());
            });
            return schema;
        }
    }

    /**
     * Parses the named classpath resource and asserts the nine-column,
     * nine-sample-row schema shared by the MOCK_DATA.* fixtures.
     */
    private HiveTableSchema validateSchema2(String filename) throws IOException {
        String[] expectedNames = {"id", "first_name", "last name", "url", "gender",
                                  "ip_address", "timezone", "desc", "comment"};
        try (InputStream is = CSVFileSchemaParserTest.class.getClassLoader().getResourceAsStream(filename)) {
            HiveTableSchema schema = toHiveTableSchema(is);
            List<? extends Field> fields = schema.getFields();
            assertEquals("Expecting 9 fields", 9, fields.size());
            IntStream.range(0, fields.size()).forEach(idx -> {
                assertEquals("Expecting 9 sample values", 9, fields.get(idx).getSampleValues().size());
                assertEquals(expectedNames[idx], fields.get(idx).getName());
            });
            return schema;
        }
    }
}