/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nifi.repository.schema; import static org.apache.nifi.repository.schema.SchemaRecordWriter.MAX_ALLOWED_UTF_LENGTH; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang3.StringUtils; import org.junit.Test; public class TestSchemaRecordReaderWriter { private static Character utfCharOneByte = '$'; private static Character utfCharTwoByte = '¢'; private static Character utfCharThreeByte = '€'; private static String utfStringOneByte = utfCharOneByte.toString(); private static String utfStringTwoByte = utfCharTwoByte.toString(); private static String utfStringThreeByte = utfCharThreeByte.toString(); @Test @SuppressWarnings("unchecked") public void testRoundTrip() throws IOException { // Create a 'complex' record that contains two different types of fields - a string and an int. final List<RecordField> complexFieldList1 = new ArrayList<>(); complexFieldList1.add(createField("string field", FieldType.STRING)); complexFieldList1.add(createField("int field", FieldType.INT)); final ComplexRecordField complexField1 = new ComplexRecordField("complex1", Repetition.EXACTLY_ONE, complexFieldList1); final Map<RecordField, Object> complexMap1 = new LinkedHashMap<>(); final RecordField stringField = createField("string field", FieldType.STRING); final RecordField intField = createField("int field", FieldType.INT); complexMap1.put(stringField, "apples"); complexMap1.put(intField, 100); final FieldMapRecord complexRecord1 = new FieldMapRecord(complexMap1, new RecordSchema(stringField, intField)); // Create another 'complex' record that contains two other types of fields - a long string and a long. final List<RecordField> complexFieldList2 = new ArrayList<>(); complexFieldList2.add(createField("long string field", FieldType.LONG_STRING)); complexFieldList2.add(createField("long field", FieldType.LONG)); final ComplexRecordField complexField2 = new ComplexRecordField("complex2", Repetition.EXACTLY_ONE, complexFieldList2); final Map<RecordField, Object> complexMap2 = new LinkedHashMap<>(); final RecordField longStringField = createField("long string field", FieldType.LONG_STRING); final RecordField longField = createField("long field", FieldType.LONG); complexMap2.put(longStringField, "oranges"); complexMap2.put(longField, Long.MAX_VALUE); final FieldMapRecord complexRecord2 = new FieldMapRecord(complexMap2, new RecordSchema(longStringField, longField)); // Create a Union Field that indicates that the type could be either 'complex 1' or 'complex 2' final UnionRecordField unionRecordField = new UnionRecordField("union", Repetition.ZERO_OR_MORE, Arrays.asList(new RecordField[] {complexField1, complexField2})); // Create a Record Schema final List<RecordField> fields = new ArrayList<>(); fields.add(new SimpleRecordField("int", FieldType.INT, Repetition.ZERO_OR_ONE)); fields.add(new SimpleRecordField("int present", FieldType.INT, Repetition.ZERO_OR_ONE)); fields.add(new SimpleRecordField("boolean", FieldType.BOOLEAN, Repetition.ZERO_OR_ONE)); fields.add(new SimpleRecordField("boolean present", FieldType.BOOLEAN, Repetition.ZERO_OR_ONE)); fields.add(new SimpleRecordField("byte array", FieldType.BYTE_ARRAY, Repetition.ZERO_OR_ONE)); fields.add(new SimpleRecordField("byte array present", FieldType.BYTE_ARRAY, Repetition.ZERO_OR_ONE)); fields.add(new SimpleRecordField("long", FieldType.LONG, Repetition.ZERO_OR_ONE)); fields.add(new SimpleRecordField("long present", FieldType.LONG, Repetition.ZERO_OR_ONE)); fields.add(new SimpleRecordField("string", FieldType.STRING, Repetition.ZERO_OR_ONE)); fields.add(new SimpleRecordField("string present", FieldType.STRING, Repetition.ZERO_OR_ONE)); fields.add(new SimpleRecordField("long string", FieldType.LONG_STRING, Repetition.ZERO_OR_ONE)); fields.add(new SimpleRecordField("long string present", FieldType.LONG_STRING, Repetition.ZERO_OR_ONE)); fields.add(new ComplexRecordField("complex present", Repetition.EXACTLY_ONE, new SimpleRecordField("color", FieldType.STRING, Repetition.ZERO_OR_ONE), new SimpleRecordField("fruit", FieldType.STRING, Repetition.ZERO_OR_ONE))); fields.add(new MapRecordField("map present", new SimpleRecordField("key", FieldType.STRING, Repetition.EXACTLY_ONE), new SimpleRecordField("value", FieldType.INT, Repetition.EXACTLY_ONE), Repetition.ZERO_OR_ONE)); fields.add(unionRecordField); final RecordSchema schema = new RecordSchema(fields); // Create a 'complex' record that contains two different elements. final RecordField colorField = createField("color", FieldType.STRING); final RecordField fruitField = createField("fruit", FieldType.STRING); final Map<RecordField, Object> complexFieldMap = new LinkedHashMap<>(); complexFieldMap.put(colorField, "red"); complexFieldMap.put(fruitField, "apple"); // Create a simple map that can be used for a Map Field final Map<String, Integer> simpleMap = new HashMap<>(); simpleMap.put("apples", 100); // Create a Map of record fields to values, so that we can create a Record to write out final Map<RecordField, Object> values = new LinkedHashMap<>(); values.put(createField("int", FieldType.INT), 42); values.put(createField("int present", FieldType.INT), 42); values.put(createField("boolean present", FieldType.BOOLEAN), true); values.put(createField("byte array present", FieldType.BYTE_ARRAY), "Hello".getBytes()); values.put(createField("long present", FieldType.LONG), 42L); values.put(createField("string present", FieldType.STRING), "Hello"); values.put(createField("long string present", FieldType.LONG_STRING), "Long Hello"); values.put(createField("complex present", FieldType.COMPLEX), new FieldMapRecord(complexFieldMap, new RecordSchema(colorField, fruitField))); values.put(new MapRecordField("map present", createField("key", FieldType.STRING), createField("value", FieldType.INT), Repetition.EXACTLY_ONE), simpleMap); values.put(unionRecordField, Arrays.asList(new NamedValue[] { new NamedValue("complex1", complexRecord1), new NamedValue("complex2", complexRecord2)})); final FieldMapRecord originalRecord = new FieldMapRecord(values, schema); // Write out a record and read it back in. try (final ByteArrayOutputStream baos = new ByteArrayOutputStream()) { // Write the schema to the stream schema.writeTo(baos); // Write the record twice, to make sure that we're able to read/write multiple sequential records final SchemaRecordWriter writer = new SchemaRecordWriter(); writer.writeRecord(originalRecord, baos); writer.writeRecord(originalRecord, baos); try (final InputStream in = new ByteArrayInputStream(baos.toByteArray())) { // Read the Schema from the stream and create a Record Reader for reading records, based on this schema final RecordSchema readSchema = RecordSchema.readFrom(in); final SchemaRecordReader reader = SchemaRecordReader.fromSchema(readSchema); // Read two records and verify the values. for (int i=0; i < 2; i++) { final Record record = reader.readRecord(in); assertNotNull(record); assertEquals(42, record.getFieldValue("int")); assertEquals(42, record.getFieldValue("int present")); assertEquals(true, record.getFieldValue("boolean present")); assertTrue(Arrays.equals("Hello".getBytes(), (byte[]) record.getFieldValue("byte array present"))); assertEquals(42L, record.getFieldValue("long present")); assertEquals("Hello", record.getFieldValue("string present")); assertEquals("Long Hello", record.getFieldValue("long string present")); final Record complexRecord = (Record) record.getFieldValue("complex present"); assertEquals("red", complexRecord.getFieldValue("color")); assertEquals("apple", complexRecord.getFieldValue("fruit")); assertEquals(simpleMap, record.getFieldValue("map present")); final List<Record> unionRecords = (List<Record>) record.getFieldValue("union"); assertNotNull(unionRecords); assertEquals(2, unionRecords.size()); final Record unionRecord1 = unionRecords.get(0); assertEquals("apples", unionRecord1.getFieldValue("string field")); assertEquals(100, unionRecord1.getFieldValue("int field")); final Record unionRecord2 = unionRecords.get(1); assertEquals("oranges", unionRecord2.getFieldValue("long string field")); assertEquals(Long.MAX_VALUE, unionRecord2.getFieldValue("long field")); } // Ensure that there is no more data. assertNull(reader.readRecord(in)); } } } @Test @SuppressWarnings("unchecked") public void testUTFLargerThan64k() throws IOException { // Create a Record Schema final List<RecordField> fields = new ArrayList<>(); fields.add(new SimpleRecordField("int present", FieldType.INT, Repetition.ZERO_OR_ONE)); fields.add(new SimpleRecordField("string present", FieldType.STRING, Repetition.ZERO_OR_ONE)); final RecordSchema schema = new RecordSchema(fields); // Create a Map of record fields to values, so that we can create a Record to write out final Map<RecordField, Object> values = new LinkedHashMap<>(); values.put(createField("int present", FieldType.INT), 42); final String utfString = utfStringOneByte + utfStringTwoByte + utfStringThreeByte; // 3 chars and 6 utf8 bytes final String seventyK = StringUtils.repeat(utfString, 21845); // 65,535 chars and 131070 utf8 bytes assertTrue(seventyK.length() == 65535); assertTrue(seventyK.getBytes("UTF-8").length == 131070); values.put(createField("string present", FieldType.STRING), seventyK); final FieldMapRecord originalRecord = new FieldMapRecord(values, schema); // Write out a record and read it back in. try (final ByteArrayOutputStream baos = new ByteArrayOutputStream()) { // Write the schema to the stream schema.writeTo(baos); // Write the record twice, to make sure that we're able to read/write multiple sequential records final SchemaRecordWriter writer = new SchemaRecordWriter(); writer.writeRecord(originalRecord, baos); writer.writeRecord(originalRecord, baos); try (final InputStream in = new ByteArrayInputStream(baos.toByteArray())) { // Read the Schema from the stream and create a Record Reader for reading records, based on this schema final RecordSchema readSchema = RecordSchema.readFrom(in); final SchemaRecordReader reader = SchemaRecordReader.fromSchema(readSchema); // Read the records and verify the values. for (int i=0; i < 2; i++) { final Record record = reader.readRecord(in); assertNotNull(record); assertEquals(42, record.getFieldValue("int present")); assertTrue(MAX_ALLOWED_UTF_LENGTH - ((String)record.getFieldValue("string present")).getBytes("utf-8").length <= 3); assertEquals(32768, ((String)record.getFieldValue("string present")).length()); } // Ensure that there is no more data. assertNull(reader.readRecord(in)); } } } @Test public void testSingleCharUTF8Lengths() { // verify handling of single characters mapping to utf8 byte strings assertEquals("test 1 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 0)); assertEquals("test 2 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 0)); assertEquals("test 3 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 0)); assertEquals("test 1 char string truncated to 1 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 1)); assertEquals("test 2 char string truncated to 1 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 1)); assertEquals("test 3 char string truncated to 1 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 1)); assertEquals("test 1 char string truncated to 2 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 2)); assertEquals("test 2 char string truncated to 2 utf bytes should be 2", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 2)); assertEquals("test 3 char string truncated to 2 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 2)); assertEquals("test 1 char string truncated to 3 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 3)); assertEquals("test 2 char string truncated to 3 utf bytes should be 2", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 3)); assertEquals("test 3 char string truncated to 3 utf bytes should be 3", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 3)); } @Test public void testMultiCharUTFLengths() { // test boundary conditions as 1, 2, and 3 UTF byte chars are included into utf limit positions used by strings final String testString1 = utfStringOneByte + utfStringTwoByte + utfStringThreeByte; // char 'abc' utf 'abbccc' assertEquals("test 6 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 0)); // utf '' assertEquals("test 6 char string truncated to 1 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 1)); // utf 'a' assertEquals("test 6 char string truncated to 2 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 2)); // utf 'a' assertEquals("test 6 char string truncated to 3 utf bytes should be 2", 2, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 3)); // utf 'abb' assertEquals("test 6 char string truncated to 4 utf bytes should be 2", 2, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 4)); // utf 'abb' assertEquals("test 6 char string truncated to 5 utf bytes should be 2", 2, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 5)); // utf 'abb' assertEquals("test 6 char string truncated to 6 utf bytes should be 3", 3, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 6)); // utf 'abbccc' } @Test public void testSmallCharUTFLengths() throws UnsupportedEncodingException { final String string12b = StringUtils.repeat(utfStringOneByte + utfStringTwoByte + utfStringThreeByte, 2); assertEquals("test multi-char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 0)); assertEquals("test multi-char string truncated to 1 utf bytes should be 0", 1, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 1)); assertEquals("test multi-char string truncated to 2 utf bytes should be 0", 1, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 2)); assertEquals("test multi-char string truncated to 3 utf bytes should be 0", 2, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 3)); assertEquals("test multi-char string truncated to 4 utf bytes should be 0", 2, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 4)); assertEquals("test multi-char string truncated to 5 utf bytes should be 0", 2, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 5)); assertEquals("test multi-char string truncated to 6 utf bytes should be 0", 3, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 6)); assertEquals("test multi-char string truncated to 7 utf bytes should be 0", 4, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 7)); assertEquals("test multi-char string truncated to 8 utf bytes should be 0", 4, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 8)); assertEquals("test multi-char string truncated to 9 utf bytes should be 0", 5, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 9)); assertEquals("test multi-char string truncated to 10 utf bytes should be 0", 5, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 10)); assertEquals("test multi-char string truncated to 11 utf bytes should be 0", 5, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 11)); assertEquals("test multi-char string truncated to 12 utf bytes should be 0", 6, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 12)); } @Test public void testLargeCharUTFLengths() { final String string64k = StringUtils.repeat(utfStringOneByte + utfStringTwoByte + utfStringThreeByte, 21845); assertEquals("test 64k char string should be 64k chars long", 65535, string64k.length()); // drop half the chars going to utf of 64k bytes -- (1+1+1) * 21845 = 65535 chars which converts to (1+2+3) * 21845 = 131070 utf bytes so 1/2 is truncated assertEquals("test 64k char string truncated to 65,535 utf bytes should be 32768", 32768, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65535)); // dropping bytes off the end of utf length assertEquals("test 64k char string truncated to 65,534 utf bytes should be 32767", 32767, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65534)); // lost 2 byte char assertEquals("test 64k char string truncated to 65,533 utf bytes should be 32767", 32767, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65533)); assertEquals("test 64k char string truncated to 65,532 utf bytes should be 32766", 32766, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65532)); // lost 1 byte char assertEquals("test 64k char string truncated to 65,531 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65531)); // lost 3 byte char assertEquals("test 64k char string truncated to 65,530 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65530)); assertEquals("test 64k char string truncated to 65,529 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65529)); assertEquals("test 64k char string truncated to 65,528 utf bytes should be 32764", 32764, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65528)); // lost 2 byte char (again) } private SimpleRecordField createField(final String fieldName, final FieldType type) { return new SimpleRecordField(fieldName, type, Repetition.ZERO_OR_ONE); } }