/*
 * Copyright (C) 2016 Lorand Bendig All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */
package gobblin.source.extractor;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.Path;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormat;
// Use TestNG's Assert (argument order: actual, expected) — this is a TestNG test
// class; the previous org.junit.Assert import mixed frameworks and the JUnit
// (expected, actual) convention, which the call sites applied inconsistently.
import org.testng.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.SourceState;
import gobblin.configuration.State;
import gobblin.configuration.WorkUnitState;
import gobblin.source.DatePartitionedAvroFileSource;
import gobblin.source.workunit.Extract.TableType;
import gobblin.source.workunit.MultiWorkUnit;
import gobblin.source.workunit.WorkUnit;
import gobblin.writer.AvroDataWriterBuilder;
import gobblin.writer.DataWriter;
import gobblin.writer.DataWriterBuilder;
import gobblin.writer.Destination;
import gobblin.writer.PartitionedDataWriter;
import gobblin.writer.WriterOutputFormat;
import gobblin.writer.partitioner.TimeBasedAvroWriterPartitioner;
import gobblin.writer.partitioner.TimeBasedWriterPartitioner;

/**
 * Unit tests for {@link DatePartitionedAvroFileExtractor}.
 *
 * <p>The fixture writes {@link #RECORD_SIZE} Avro records, partitioned by minute
 * under {@code yyyy/MM/dd/HH_mm} directories, and then verifies that
 * {@link DatePartitionedAvroFileSource} produces one work unit per partition and
 * that the extractor reads back exactly the records that were written.
 *
 * @author Lorand Bendig
 */
// NOTE(review): the trailing dot in the group name looks like a typo
// ("gobblin.source.extractor."); kept as-is because group names may be
// referenced by external testng.xml suite configurations — confirm before renaming.
@Test(groups = { "gobblin.source.extractor." })
public class DatePartitionedAvroFileExtractorTest {

  private static final String SIMPLE_CLASS_NAME = DatePartitionedAvroFileExtractorTest.class.getSimpleName();

  private static final String TEST_ROOT_DIR = "/tmp/" + SIMPLE_CLASS_NAME + "-test";
  private static final String STAGING_DIR = TEST_ROOT_DIR + Path.SEPARATOR + "staging";
  private static final String OUTPUT_DIR = TEST_ROOT_DIR + Path.SEPARATOR + "job-output";
  private static final String FILE_NAME = SIMPLE_CLASS_NAME + "-name.avro";
  private static final String PARTITION_COLUMN_NAME = "timestamp";
  private static final String PREFIX = "minutes";
  private static final String SUFFIX = "test";
  private static final String SOURCE_ENTITY = "testsource";
  private static final String DATE_PATTERN = "yyyy/MM/dd/HH_mm";

  // Number of records written by the fixture; also the expected work-unit count.
  private static final int RECORD_SIZE = 4;

  // Minimal Avro schema with a single long field holding the partition timestamp.
  private static final String AVRO_SCHEMA =
      "{" + "\"type\" : \"record\"," + "\"name\" : \"User\"," + "\"namespace\" : \"example.avro\"," + "\"fields\" : ["
          + "{" + "\"name\" : \"" + PARTITION_COLUMN_NAME + "\"," + "\"type\" : \"long\"" + "}" + "]" + "}";

  private Schema schema;
  private DataWriter<GenericRecord> writer;
  private DateTime startDateTime;
  // Millisecond timestamps of the written records, one minute apart.
  private long[] recordTimestamps = new long[RECORD_SIZE];

  private static final DateTimeZone TZ = DateTimeZone.forID(ConfigurationKeys.PST_TIMEZONE_NAME);

  /**
   * Writes {@link #RECORD_SIZE} records, each time-stamped one minute apart,
   * through a {@link PartitionedDataWriter} so that every record lands in its own
   * minute-level partition directory under {@link #OUTPUT_DIR}.
   */
  @BeforeClass
  public void setUp() throws IOException {

    this.schema = new Schema.Parser().parse(AVRO_SCHEMA);

    // Anchor the fixture two hours in the past (at minute 30) so all partitions
    // are strictly earlier than "now" when the source scans for them.
    DateTime now = new DateTime(TZ).minusHours(2);
    this.startDateTime =
        new DateTime(now.getYear(), now.getMonthOfYear(), now.getDayOfMonth(), now.getHourOfDay(), 30, 0, TZ);

    // Create record timestamps, shifted by 1 minute each.
    DateTime recordDt = startDateTime;
    for (int i = 0; i < RECORD_SIZE; i++) {
      recordDt = recordDt.plusMinutes(1);
      recordTimestamps[i] = recordDt.getMillis();
    }

    // Create dummy data partitioned by minutes.
    State state = new State();

    state.setProp(TimeBasedAvroWriterPartitioner.WRITER_PARTITION_COLUMNS, PARTITION_COLUMN_NAME);
    state.setProp(ConfigurationKeys.WRITER_BUFFER_SIZE, ConfigurationKeys.DEFAULT_BUFFER_SIZE);
    state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, ConfigurationKeys.LOCAL_FS_URI);
    state.setProp(ConfigurationKeys.WRITER_STAGING_DIR, STAGING_DIR);
    state.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, OUTPUT_DIR);
    state.setProp(ConfigurationKeys.WRITER_FILE_PATH, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.WRITER_FILE_NAME, FILE_NAME);
    state.setProp(TimeBasedWriterPartitioner.WRITER_PARTITION_PATTERN, DATE_PATTERN);
    state.setProp(TimeBasedWriterPartitioner.WRITER_PARTITION_PREFIX, PREFIX);
    state.setProp(TimeBasedWriterPartitioner.WRITER_PARTITION_SUFFIX, SUFFIX);
    state.setProp(ConfigurationKeys.WRITER_PARTITIONER_CLASS, TimeBasedAvroWriterPartitioner.class.getName());

    DataWriterBuilder<Schema, GenericRecord> builder = new AvroDataWriterBuilder()
        .writeTo(Destination.of(Destination.DestinationType.HDFS, state))
        .writeInFormat(WriterOutputFormat.AVRO)
        .withWriterId("writer-1")
        .withSchema(this.schema)
        .withBranches(1)
        .forBranch(0);

    this.writer = new PartitionedDataWriter<Schema, GenericRecord>(builder, state);

    GenericRecordBuilder genericRecordBuilder = new GenericRecordBuilder(this.schema);
    for (int i = 0; i < RECORD_SIZE; i++) {
      genericRecordBuilder.set(PARTITION_COLUMN_NAME, recordTimestamps[i]);
      this.writer.write(genericRecordBuilder.build());
    }
    this.writer.close();
    this.writer.commit();
  }

  /**
   * Reads data partitioned by minutes with an explicit partition prefix configured;
   * each work unit is assigned the records under one {@code yyyy/MM/dd/HH_mm} directory.
   */
  @Test
  public void testReadPartitionsByMinute() throws IOException, DataRecordException {

    DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource();

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);

    state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    // Watermark one minute before the first record so every partition is picked up.
    state.setProp("date.partitioned.source.min.watermark.value",
        DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1)));
    state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);

    state.setProp("date.partitioned.source.partition.prefix", PREFIX);
    state.setProp("date.partitioned.source.partition.suffix", SUFFIX);

    List<WorkUnit> workunits = source.getWorkunits(state);

    // One work unit per minute-level partition.
    Assert.assertEquals(workunits.size(), 4);
    verifyWorkUnits(workunits);
  }

  /**
   * Same as {@link #testReadPartitionsByMinute()}, but points the data directory
   * directly at the prefix folder and configures no partition prefix.
   */
  @Test
  public void testWorksNoPrefix() throws IOException, DataRecordException {

    DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource();

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY,
        OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY + Path.SEPARATOR + PREFIX);
    state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);

    state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    state.setProp("date.partitioned.source.min.watermark.value",
        DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1)));
    state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);

    state.setProp("date.partitioned.source.partition.suffix", SUFFIX);

    List<WorkUnit> workunits = source.getWorkunits(state);

    Assert.assertEquals(workunits.size(), 4);
    verifyWorkUnits(workunits);
  }

  /**
   * Extracts one record from each work unit and checks that both the record's
   * timestamp field and the work unit's date-partition property match the
   * timestamp written by {@link #setUp()}.
   */
  private void verifyWorkUnits(List<WorkUnit> workunits) throws IOException, DataRecordException {
    for (int i = 0; i < RECORD_SIZE; i++) {
      WorkUnit workUnit = ((MultiWorkUnit) workunits.get(i)).getWorkUnits().get(0);
      WorkUnitState wuState = new WorkUnitState(workunits.get(i), new State());
      wuState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
      wuState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL,
          workUnit.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
      try (DatePartitionedAvroFileExtractor extractor = new DatePartitionedAvroFileExtractor(wuState)) {
        GenericRecord record = extractor.readRecord(null);
        // TestNG argument order: (actual, expected).
        Assert.assertEquals(record.get(PARTITION_COLUMN_NAME), recordTimestamps[i]);
        Assert.assertEquals(workUnit.getPropAsLong(ConfigurationKeys.WORK_UNIT_DATE_PARTITION_KEY),
            recordTimestamps[i]);
      }
    }
  }

  /** Cleans up the temporary fixture directory. */
  @AfterClass
  public void tearDown() throws IOException {
    // NOTE(review): the writer was already closed in setUp(); this second close is
    // a no-op only if PartitionedDataWriter.close() is idempotent — confirm.
    this.writer.close();
    FileUtils.deleteDirectory(new File(TEST_ROOT_DIR));
  }
}