/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.jena.hadoop.rdf.io.input; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.TaskAttemptID; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat; import org.apache.hadoop.mapreduce.task.JobContextImpl; import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; import org.apache.jena.hadoop.rdf.io.HadoopIOConstants; import org.apache.jena.hadoop.rdf.io.RdfIOConstants; import org.apache.jena.hadoop.rdf.types.AbstractNodeTupleWritable; import org.junit.* ; import org.junit.rules.TemporaryFolder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Abstract node tuple input format tests * * * * @param <TValue> * @param <T> */ public abstract class AbstractNodeTupleInputFormatTests<TValue, T extends AbstractNodeTupleWritable<TValue>> { private static final Logger LOG = LoggerFactory.getLogger(AbstractNodeTupleInputFormatTests.class); protected static final int EMPTY_SIZE = 0, SMALL_SIZE = 100, LARGE_SIZE = 10000, BAD_SIZE = 100, MIXED_SIZE = 100; protected static final String EMPTY = "empty"; protected static final String SMALL = "small"; protected static final String LARGE = "large"; protected static final String BAD = "bad"; protected static final String MIXED = "mixed"; /** * Temporary folder for the tests */ @Rule public TemporaryFolder folder = new TemporaryFolder(); protected File empty, small, large, bad, mixed; /** * Prepares the inputs for the tests * * @throws IOException */ @Before public void beforeTest() throws IOException { this.prepareInputs(); } /** * Cleans up the inputs after each test */ @After public void afterTest() { // Should be unnecessary since JUnit will clean up the temporary folder // anyway but best to do this regardless if (empty != null) empty.delete(); if (small != null) small.delete(); if (large != null) large.delete(); if (bad != null) bad.delete(); if (mixed != null) mixed.delete(); } /** * Prepares a fresh configuration * * @return Configuration */ protected Configuration prepareConfiguration() { Configuration config = new Configuration(true); // Nothing else to do return config; } /** * Prepares the inputs * * @throws IOException */ protected void prepareInputs() throws IOException { String ext = this.getFileExtension(); empty = folder.newFile(EMPTY + ext); this.generateTuples(empty, EMPTY_SIZE); small = folder.newFile(SMALL + ext); this.generateTuples(small, SMALL_SIZE); large = folder.newFile(LARGE + ext); this.generateTuples(large, LARGE_SIZE); bad = folder.newFile(BAD + ext); this.generateBadTuples(bad, BAD_SIZE); mixed = folder.newFile(MIXED + ext); this.generateMixedTuples(mixed, MIXED_SIZE); } /** * Gets the extra file extension to add to the filenames * * @return File extension */ protected abstract String getFileExtension(); /** * Generates tuples used for tests * * @param f * File * @param num * Number of tuples to generate * @throws IOException */ protected final void generateTuples(File f, int num) throws IOException { this.generateTuples(this.getOutputStream(f), num); } /** * Gets the output stream to use for generating tuples * * @param f * File * @return Output Stream * @throws IOException */ protected OutputStream getOutputStream(File f) throws IOException { return new FileOutputStream(f, false); } /** * Generates tuples used for tests * * @param output * Output Stream to write to * @param num * Number of tuples to generate * @throws IOException */ protected abstract void generateTuples(OutputStream output, int num) throws IOException; /** * Generates bad tuples used for tests * * @param f * File * @param num * Number of bad tuples to generate * @throws IOException */ protected final void generateBadTuples(File f, int num) throws IOException { this.generateBadTuples(this.getOutputStream(f), num); } /** * Generates bad tuples used for tests * * @param output * Output Stream to write to * @param num * Number of bad tuples to generate * @throws IOException */ protected abstract void generateBadTuples(OutputStream output, int num) throws IOException; /** * Generates a mixture of good and bad tuples used for tests * * @param f * File * @param num * Number of tuples to generate, they should be a 50/50 mix of * good and bad tuples * @throws IOException */ protected final void generateMixedTuples(File f, int num) throws IOException { this.generateMixedTuples(this.getOutputStream(f), num); } /** * Generates a mixture of good and bad tuples used for tests * * @param output * Output Stream to write to * @param num * Number of tuples to generate, they should be a 50/50 mix of * good and bad tuples * @throws IOException */ protected abstract void generateMixedTuples(OutputStream output, int num) throws IOException; /** * Adds an input path to the job configuration * * @param f * File * @param config * Configuration * @param job * Job * @throws IOException */ protected void addInputPath(File f, Configuration config, Job job) throws IOException { FileSystem fs = FileSystem.getLocal(config); Path inputPath = fs.makeQualified(new Path(f.getAbsolutePath())); FileInputFormat.addInputPath(job, inputPath); } protected final int countTuples(RecordReader<LongWritable, T> reader) throws IOException, InterruptedException { int count = 0; // Check initial progress LOG.info(String.format("Initial Reported Progress %f", reader.getProgress())); float progress = reader.getProgress(); if (Float.compare(0.0f, progress) == 0) { Assert.assertEquals(0.0d, reader.getProgress(), 0.0d); } else if (Float.compare(1.0f, progress) == 0) { // If reader is reported 1.0 straight away then we expect there to // be no key values Assert.assertEquals(1.0d, reader.getProgress(), 0.0d); Assert.assertFalse(reader.nextKeyValue()); } else { Assert.fail(String.format( "Expected progress of 0.0 or 1.0 before reader has been accessed for first time but got %f", progress)); } // Count tuples boolean debug = LOG.isDebugEnabled(); while (reader.nextKeyValue()) { count++; progress = reader.getProgress(); if (debug) LOG.debug(String.format("Current Reported Progress %f", progress)); Assert.assertTrue(String.format("Progress should be in the range 0.0 < p <= 1.0 but got %f", progress), progress > 0.0f && progress <= 1.0f); } reader.close(); LOG.info(String.format("Got %d tuples from this record reader", count)); // Check final progress LOG.info(String.format("Final Reported Progress %f", reader.getProgress())); Assert.assertEquals(1.0d, reader.getProgress(), 0.0d); return count; } protected final void checkTuples(RecordReader<LongWritable, T> reader, int expected) throws IOException, InterruptedException { Assert.assertEquals(expected, this.countTuples(reader)); } /** * Runs a test with a single input * * @param input * Input * @param expectedTuples * Expected tuples * @throws IOException * @throws InterruptedException */ protected final void testSingleInput(File input, int expectedSplits, int expectedTuples) throws IOException, InterruptedException { // Prepare configuration Configuration config = this.prepareConfiguration(); this.testSingleInput(config, input, expectedSplits, expectedTuples); } /** * Runs a test with a single input * * @param config * Configuration * @param input * Input * @param expectedTuples * Expected tuples * @throws IOException * @throws InterruptedException */ protected final void testSingleInput(Configuration config, File input, int expectedSplits, int expectedTuples) throws IOException, InterruptedException { // Set up fake job InputFormat<LongWritable, T> inputFormat = this.getInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); this.addInputPath(input, job.getConfiguration(), job); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); Assert.assertEquals(1, FileInputFormat.getInputPaths(context).length); NLineInputFormat.setNumLinesPerSplit(job, LARGE_SIZE); // Check splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(expectedSplits, splits.size()); // Check tuples for (InputSplit split : splits) { TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext); reader.initialize(split, taskContext); this.checkTuples(reader, expectedTuples); } } protected abstract InputFormat<LongWritable, T> getInputFormat(); /** * Basic tuples input test * * @throws IOException * @throws InterruptedException */ @Test public final void single_input_01() throws IOException, InterruptedException { testSingleInput(empty, this.canSplitInputs() ? 0 : 1, EMPTY_SIZE); } /** * Basic tuples input test * * @throws IOException * @throws InterruptedException */ @Test public final void single_input_02() throws IOException, InterruptedException { testSingleInput(small, 1, SMALL_SIZE); } /** * Basic tuples input test * * @throws IOException * @throws InterruptedException */ @Test public final void single_input_03() throws IOException, InterruptedException { testSingleInput(large, 1, LARGE_SIZE); } /** * Basic tuples input test * * @throws IOException * @throws InterruptedException */ @Test public final void single_input_04() throws IOException, InterruptedException { testSingleInput(bad, 1, 0); } /** * Basic tuples input test * * @throws IOException * @throws InterruptedException */ @Test public final void single_input_05() throws IOException, InterruptedException { // JSON-LD overrides this because in JSON-LD parsing a bad document gives no triples. int x = single_input_05_expected() ; testSingleInput(mixed, 1, x); } /** Results exected for test single_input_05 */ protected int single_input_05_expected() { return MIXED_SIZE / 2 ; } /** * Tests behaviour when ignoring bad tuples is disabled * * @throws InterruptedException * @throws IOException */ @Test(expected = IOException.class) public final void fail_on_bad_input_01() throws IOException, InterruptedException { Configuration config = this.prepareConfiguration(); config.setBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, false); Assert.assertFalse(config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true)); testSingleInput(config, bad, 1, 0); } /** * Tests behaviour when ignoring bad tuples is disabled * * @throws InterruptedException * @throws IOException */ @Test(expected = IOException.class) public final void fail_on_bad_input_02() throws IOException, InterruptedException { Configuration config = this.prepareConfiguration(); config.setBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, false); Assert.assertFalse(config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true)); testSingleInput(config, mixed, 1, MIXED_SIZE / 2); } /** * Runs a multiple input test * * @param inputs * Inputs * @param expectedSplits * Number of splits expected * @param expectedTuples * Number of tuples expected * @throws IOException * @throws InterruptedException */ protected final void testMultipleInputs(File[] inputs, int expectedSplits, int expectedTuples) throws IOException, InterruptedException { // Prepare configuration and inputs Configuration config = this.prepareConfiguration(); // Set up fake job InputFormat<LongWritable, T> inputFormat = this.getInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); for (File input : inputs) { this.addInputPath(input, job.getConfiguration(), job); } JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); Assert.assertEquals(inputs.length, FileInputFormat.getInputPaths(context).length); NLineInputFormat.setNumLinesPerSplit(job, expectedTuples); // Check splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(expectedSplits, splits.size()); // Check tuples int count = 0; for (InputSplit split : splits) { TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext); reader.initialize(split, taskContext); count += this.countTuples(reader); } Assert.assertEquals(expectedTuples, count); } /** * tuples test with multiple inputs * * @throws IOException * @throws InterruptedException */ @Test public final void multiple_inputs_01() throws IOException, InterruptedException { testMultipleInputs(new File[] { empty, small, large }, this.canSplitInputs() ? 2 : 3, EMPTY_SIZE + SMALL_SIZE + LARGE_SIZE); } /** * tuples test with multiple inputs * * @throws IOException * @throws InterruptedException */ @Test public final void multiple_inputs_02() throws IOException, InterruptedException { int expectedTriples = multiple_inputs_02_expected() ; testMultipleInputs(new File[] { folder.getRoot() }, this.canSplitInputs() ? 4 : 5, expectedTriples); } /** Results exected for test multiple_inputs_02. * JSON_LD has different characteristics on bad documents. * See {@link #single_input_05}. */ protected int multiple_inputs_02_expected() { return EMPTY_SIZE + SMALL_SIZE + LARGE_SIZE + (MIXED_SIZE / 2) ; } protected final void testSplitInputs(Configuration config, File[] inputs, int expectedSplits, int expectedTuples) throws IOException, InterruptedException { // Set up fake job InputFormat<LongWritable, T> inputFormat = this.getInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); for (File input : inputs) { this.addInputPath(input, job.getConfiguration(), job); } JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); Assert.assertEquals(inputs.length, FileInputFormat.getInputPaths(context).length); // Check splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(expectedSplits, splits.size()); // Check tuples int count = 0; for (InputSplit split : splits) { // Validate split Assert.assertTrue(this.isValidSplit(split, config)); // Read split TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext); reader.initialize(split, taskContext); count += this.countTuples(reader); } Assert.assertEquals(expectedTuples, count); } /** * Determines whether an input split is valid * * @param split * Input split * @return True if a valid split, false otherwise */ protected boolean isValidSplit(InputSplit split, Configuration config) { return split instanceof FileSplit; } /** * Indicates whether inputs can be split, defaults to true * * @return Whether inputs can be split */ protected boolean canSplitInputs() { return true; } /** * Tests for input splitting * * @throws IOException * @throws InterruptedException */ @Test public final void split_input_01() throws IOException, InterruptedException { Assume.assumeTrue(this.canSplitInputs()); Configuration config = this.prepareConfiguration(); config.setBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, false); Assert.assertEquals(Integer.MAX_VALUE, config.getInt(HadoopIOConstants.MAX_LINE_LENGTH, Integer.MAX_VALUE)); this.testSplitInputs(config, new File[] { small }, 100, SMALL_SIZE); } /** * Tests for input splitting * * @throws IOException * @throws InterruptedException */ @Test public final void split_input_02() throws IOException, InterruptedException { Assume.assumeTrue(this.canSplitInputs()); Configuration config = this.prepareConfiguration(); config.setBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, false); config.setLong(NLineInputFormat.LINES_PER_MAP, 10); Assert.assertEquals(Integer.MAX_VALUE, config.getInt(HadoopIOConstants.MAX_LINE_LENGTH, Integer.MAX_VALUE)); this.testSplitInputs(config, new File[] { small }, 10, SMALL_SIZE); } /** * Tests for input splitting * * @throws IOException * @throws InterruptedException */ @Test public final void split_input_03() throws IOException, InterruptedException { Assume.assumeTrue(this.canSplitInputs()); Configuration config = this.prepareConfiguration(); config.setBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, false); config.setLong(NLineInputFormat.LINES_PER_MAP, 100); Assert.assertEquals(Integer.MAX_VALUE, config.getInt(HadoopIOConstants.MAX_LINE_LENGTH, Integer.MAX_VALUE)); this.testSplitInputs(config, new File[] { large }, 100, LARGE_SIZE); } }