package gobblin.compaction.mapreduce; import com.google.common.io.Files; import gobblin.compaction.dataset.TimeBasedSubDirDatasetsFinder; import gobblin.compaction.source.CompactionSource; import gobblin.compaction.verify.InputRecordCountHelper; import gobblin.configuration.ConfigurationKeys; import gobblin.data.management.retention.profile.ConfigurableGlobDatasetFinder; import gobblin.runtime.api.JobExecutionResult; import gobblin.runtime.embedded.EmbeddedGobblin; import org.apache.avro.Schema; import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericRecordBuilder; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.testng.Assert; import org.testng.annotations.Test; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.net.URI; import lombok.extern.slf4j.Slf4j; @Slf4j public class MRCompactionTaskTest { protected FileSystem getFileSystem() throws IOException { String uri = ConfigurationKeys.LOCAL_FS_URI; FileSystem fs = FileSystem.get(URI.create(uri), new Configuration()); return fs; } @Test public void testDedup() throws Exception { File basePath = Files.createTempDir(); basePath.deleteOnExit(); File jobDir = new File(basePath, "Identity/MemberAccount/minutely/2017/04/03/10/20_30/run_2017-04-03-10-20"); Assert.assertTrue(jobDir.mkdirs()); GenericRecord r1 = createRandomRecord(); GenericRecord r2 = createRandomRecord(); writeFileWithContent(jobDir, "file1", r1, 20); writeFileWithContent(jobDir, "file2", r2, 18); EmbeddedGobblin embeddedGobblin = createEmbeddedGobblin("dedup", basePath.getAbsolutePath().toString()); JobExecutionResult result = embeddedGobblin.run(); Assert.assertTrue(result.isSuccessful()); } @Test public void testNonDedup() throws Exception { File basePath = Files.createTempDir(); basePath.deleteOnExit(); File jobDir = new File(basePath, "Identity/MemberAccount/minutely/2017/04/03/10/20_30/run_2017-04-03-10-20"); Assert.assertTrue(jobDir.mkdirs()); GenericRecord r1 = createRandomRecord(); GenericRecord r2 = createRandomRecord(); writeFileWithContent(jobDir, "file1", r1, 20); writeFileWithContent(jobDir, "file2", r2, 18); EmbeddedGobblin embeddedGobblin = createEmbeddedGobblin("non-dedup", basePath.getAbsolutePath().toString()); JobExecutionResult result = embeddedGobblin.run(); Assert.assertTrue(result.isSuccessful()); } @Test public void testRecompaction () throws Exception { FileSystem fs = getFileSystem(); String basePath = "/tmp/testRecompaction"; fs.delete(new Path(basePath), true); File jobDir = new File(basePath, "Identity/MemberAccount/minutely/2017/04/03/10/20_30/run_2017-04-03-10-20"); Assert.assertTrue(jobDir.mkdirs()); GenericRecord r1 = createRandomRecord(); writeFileWithContent(jobDir, "file1", r1, 20); EmbeddedGobblin embeddedGobblin = createEmbeddedGobblin ("Recompaction-First", basePath); JobExecutionResult result = embeddedGobblin.run(); long recordCount = InputRecordCountHelper.readRecordCount(fs, (new Path (basePath, new Path("Identity/MemberAccount/hourly/2017/04/03/10")))); Assert.assertTrue(result.isSuccessful()); Assert.assertEquals(recordCount, 20); // Now write more avro files to input dir writeFileWithContent(jobDir, "file2", r1, 22); EmbeddedGobblin embeddedGobblin_2 = createEmbeddedGobblin ("Recompaction-Second", basePath); embeddedGobblin_2.run(); Assert.assertTrue(result.isSuccessful()); // If recompaction is succeeded, a new record count should be written. recordCount = InputRecordCountHelper.readRecordCount(fs, (new Path (basePath, new Path("Identity/MemberAccount/hourly/2017/04/03/10")))); Assert.assertEquals(recordCount, 42); Assert.assertTrue(fs.exists(new Path (basePath, "Identity/MemberAccount/hourly/2017/04/03/10"))); } private void writeFileWithContent(File dir, String fileName, GenericRecord r, int count) throws IOException { File file = new File(dir, fileName + "." + count + ".avro"); Assert.assertTrue(file.createNewFile()); this.createAvroFileWithRepeatingRecords(file, r, count); } public Schema getSchema() { final String KEY_SCHEMA = "{ \"type\" : \"record\", \"name\" : \"etl\",\"namespace\" : \"reducerTest\", \"fields\" : [ { \"name\" : " + "\"key\", \"type\" : {\"type\" : \"record\", \"name\" : \"key_name\", \"namespace\" : \"key_namespace\", " + "\"fields\" : [ {\"name\" : \"partitionKey\", \"type\" : \"long\", \"doc\" : \"\"}, { \"name\" : \"environment" + "\", \"type\" : \"string\",\"doc\" : \"\"}, {\"name\" : \"subKey\",\"type\" : \"string\", \"doc\" : \"\"} ]}, " + "\"doc\" : \"\", \"attributes_json\" : \"{\\\"delta\\\":false,\\\"pk\\\":true}\" }]}"; Schema keySchema = new Schema.Parser().parse(KEY_SCHEMA); return keySchema.getField("key").schema(); } public GenericRecord createRandomRecord () { GenericRecordBuilder keyRecordBuilder = new GenericRecordBuilder(getSchema()); keyRecordBuilder.set("partitionKey", new Long(1)); keyRecordBuilder.set("environment", "test"); keyRecordBuilder.set("subKey", "2"); GenericRecord record = keyRecordBuilder.build(); return record; } public void createAvroFileWithRepeatingRecords(File file, GenericRecord r, int count) throws IOException { DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>()); writer.create(getSchema(), new FileOutputStream(file)); for (int i = 0; i < count; ++i) { writer.append(r); } writer.close(); } private EmbeddedGobblin createEmbeddedGobblin (String name, String basePath) { String pattern = new Path(basePath, "*/*/minutely/*/*/*/*").toString(); return new EmbeddedGobblin(name) .setConfiguration(ConfigurationKeys.SOURCE_CLASS_KEY, CompactionSource.class.getName()) .setConfiguration(ConfigurableGlobDatasetFinder.DATASET_FINDER_PATTERN_KEY, pattern) .setConfiguration(MRCompactor.COMPACTION_INPUT_DIR, basePath.toString()) .setConfiguration(MRCompactor.COMPACTION_INPUT_SUBDIR, "minutely") .setConfiguration(MRCompactor.COMPACTION_DEST_DIR, basePath.toString()) .setConfiguration(MRCompactor.COMPACTION_DEST_SUBDIR, "hourly") .setConfiguration(MRCompactor.COMPACTION_TMP_DEST_DIR, "/tmp/compaction/" + name) .setConfiguration(TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MAX_TIME_AGO, "3000d") .setConfiguration(TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MIN_TIME_AGO, "1d"); } }