AbstractNodeTupleInputFormatTests.java example

Explorer
jena-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 *     
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.hadoop.rdf.io.input;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.task.JobContextImpl;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.jena.hadoop.rdf.io.HadoopIOConstants;
import org.apache.jena.hadoop.rdf.io.RdfIOConstants;
import org.apache.jena.hadoop.rdf.types.AbstractNodeTupleWritable;
import org.junit.* ;
import org.junit.rules.TemporaryFolder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Abstract tests for input formats that produce node tuples.
 * <p>
 * Before each test a set of input files (empty, small, large, bad and mixed)
 * is generated in a JUnit managed temporary folder using the generator
 * methods supplied by the concrete test class. The tests then obtain the
 * splits and record readers of the input format under test and check the
 * number of splits and tuples that are produced.
 * </p>
 * 
 * @param <TValue>
 *            Tuple type
 * @param <T>
 *            Writable tuple type
 */
public abstract class AbstractNodeTupleInputFormatTests<TValue, T extends AbstractNodeTupleWritable<TValue>> {

    private static final Logger LOG = LoggerFactory.getLogger(AbstractNodeTupleInputFormatTests.class);

    /** Number of tuples requested for the empty input */
    protected static final int EMPTY_SIZE = 0;
    /** Number of tuples requested for the small input */
    protected static final int SMALL_SIZE = 100;
    /** Number of tuples requested for the large input */
    protected static final int LARGE_SIZE = 10000;
    /** Number of bad tuples requested for the bad input */
    protected static final int BAD_SIZE = 100;
    /** Number of tuples (good and bad) requested for the mixed input */
    protected static final int MIXED_SIZE = 100;

    /** Base file name of the empty input */
    protected static final String EMPTY = "empty";
    /** Base file name of the small input */
    protected static final String SMALL = "small";
    /** Base file name of the large input */
    protected static final String LARGE = "large";
    /** Base file name of the bad input */
    protected static final String BAD = "bad";
    /** Base file name of the mixed input */
    protected static final String MIXED = "mixed";

    /**
     * Temporary folder for the tests
     */
    @Rule
    public TemporaryFolder folder = new TemporaryFolder();

    /** Input files, created afresh by {@link #prepareInputs()} before each test */
    protected File empty;
    protected File small;
    protected File large;
    protected File bad;
    protected File mixed;

    /**
     * Prepares the inputs for the tests
     * 
     * @throws IOException
     */
    @Before
    public void beforeTest() throws IOException {
        this.prepareInputs();
    }

    /**
     * Cleans up the inputs after each test
     */
    @After
    public void afterTest() {
        // Should be unnecessary since JUnit will clean up the temporary folder
        // anyway but best to do this regardless. Deletion is best effort so
        // the result of delete() is deliberately ignored.
        for (File input : new File[] { empty, small, large, bad, mixed }) {
            if (input != null) {
                input.delete();
            }
        }
    }

    /**
     * Prepares a fresh configuration
     * 
     * @return Configuration
     */
    protected Configuration prepareConfiguration() {
        // Nothing to customise beyond the defaults
        return new Configuration(true);
    }

    /**
     * Prepares the inputs, creating each file in the temporary folder and
     * filling it via the relevant generator method
     * 
     * @throws IOException
     */
    protected void prepareInputs() throws IOException {
        String ext = this.getFileExtension();
        empty = folder.newFile(EMPTY + ext);
        this.generateTuples(empty, EMPTY_SIZE);
        small = folder.newFile(SMALL + ext);
        this.generateTuples(small, SMALL_SIZE);
        large = folder.newFile(LARGE + ext);
        this.generateTuples(large, LARGE_SIZE);
        bad = folder.newFile(BAD + ext);
        this.generateBadTuples(bad, BAD_SIZE);
        mixed = folder.newFile(MIXED + ext);
        this.generateMixedTuples(mixed, MIXED_SIZE);
    }

    /**
     * Gets the extra file extension to add to the filenames
     * 
     * @return File extension
     */
    protected abstract String getFileExtension();

    /**
     * Generates tuples used for tests
     * <p>
     * NOTE(review): the stream obtained from {@link #getOutputStream(File)} is
     * not closed here; presumably the generator implementation is expected to
     * flush and close it - confirm against the concrete test classes.
     * </p>
     * 
     * @param f
     *            File
     * @param num
     *            Number of tuples to generate
     * @throws IOException
     */
    protected final void generateTuples(File f, int num) throws IOException {
        this.generateTuples(this.getOutputStream(f), num);
    }

    /**
     * Gets the output stream to use for generating tuples, by default a plain
     * file stream that overwrites any existing content
     * 
     * @param f
     *            File
     * @return Output Stream
     * @throws IOException
     */
    protected OutputStream getOutputStream(File f) throws IOException {
        return new FileOutputStream(f, false);
    }

    /**
     * Generates tuples used for tests
     * 
     * @param output
     *            Output Stream to write to
     * @param num
     *            Number of tuples to generate
     * @throws IOException
     */
    protected abstract void generateTuples(OutputStream output, int num) throws IOException;

    /**
     * Generates bad tuples used for tests
     * <p>
     * NOTE(review): the stream obtained from {@link #getOutputStream(File)} is
     * not closed here; presumably the generator implementation is expected to
     * flush and close it - confirm against the concrete test classes.
     * </p>
     * 
     * @param f
     *            File
     * @param num
     *            Number of bad tuples to generate
     * @throws IOException
     */
    protected final void generateBadTuples(File f, int num) throws IOException {
        this.generateBadTuples(this.getOutputStream(f), num);
    }

    /**
     * Generates bad tuples used for tests
     * 
     * @param output
     *            Output Stream to write to
     * @param num
     *            Number of bad tuples to generate
     * @throws IOException
     */
    protected abstract void generateBadTuples(OutputStream output, int num) throws IOException;

    /**
     * Generates a mixture of good and bad tuples used for tests
     * <p>
     * NOTE(review): the stream obtained from {@link #getOutputStream(File)} is
     * not closed here; presumably the generator implementation is expected to
     * flush and close it - confirm against the concrete test classes.
     * </p>
     * 
     * @param f
     *            File
     * @param num
     *            Number of tuples to generate, they should be a 50/50 mix of
     *            good and bad tuples
     * @throws IOException
     */
    protected final void generateMixedTuples(File f, int num) throws IOException {
        this.generateMixedTuples(this.getOutputStream(f), num);
    }

    /**
     * Generates a mixture of good and bad tuples used for tests
     * 
     * @param output
     *            Output Stream to write to
     * @param num
     *            Number of tuples to generate, they should be a 50/50 mix of
     *            good and bad tuples
     * @throws IOException
     */
    protected abstract void generateMixedTuples(OutputStream output, int num) throws IOException;

    /**
     * Adds an input path to the job configuration
     * 
     * @param f
     *            File
     * @param config
     *            Configuration
     * @param job
     *            Job
     * @throws IOException
     */
    protected void addInputPath(File f, Configuration config, Job job) throws IOException {
        FileSystem fs = FileSystem.getLocal(config);
        Path inputPath = fs.makeQualified(new Path(f.getAbsolutePath()));
        FileInputFormat.addInputPath(job, inputPath);
    }

    /**
     * Reads a record reader to exhaustion, counting the tuples and checking
     * the progress it reports along the way
     * <p>
     * The reader must report a progress of exactly 0.0 or 1.0 before the first
     * read (and in the latter case must have nothing to read), a progress in
     * the range 0.0 &lt; p &lt;= 1.0 after each successful read, and exactly
     * 1.0 once it has been closed. The reader is closed by this method when it
     * is read to the end without a failure.
     * </p>
     * 
     * @param reader
     *            Initialized record reader
     * @return Number of tuples read
     * @throws IOException
     * @throws InterruptedException
     */
    protected final int countTuples(RecordReader<LongWritable, T> reader) throws IOException, InterruptedException {
        int count = 0;

        // Check initial progress
        LOG.info(String.format("Initial Reported Progress %f", reader.getProgress()));
        float progress = reader.getProgress();
        if (Float.compare(0.0f, progress) == 0) {
            // Re-reading the progress also checks that it is stable
            Assert.assertEquals(0.0d, reader.getProgress(), 0.0d);
        } else if (Float.compare(1.0f, progress) == 0) {
            // If reader is reported 1.0 straight away then we expect there to
            // be no key values
            Assert.assertEquals(1.0d, reader.getProgress(), 0.0d);
            Assert.assertFalse(reader.nextKeyValue());
        } else {
            Assert.fail(String.format(
                    "Expected progress of 0.0 or 1.0 before reader has been accessed for first time but got %f",
                    progress));
        }

        // Count tuples
        boolean debug = LOG.isDebugEnabled();
        while (reader.nextKeyValue()) {
            count++;
            progress = reader.getProgress();
            if (debug) {
                LOG.debug(String.format("Current Reported Progress %f", progress));
            }
            Assert.assertTrue(String.format("Progress should be in the range 0.0 < p <= 1.0 but got %f", progress),
                    progress > 0.0f && progress <= 1.0f);
        }
        reader.close();
        LOG.info(String.format("Got %d tuples from this record reader", count));

        // Check final progress, this is deliberately queried after the reader
        // has been closed
        LOG.info(String.format("Final Reported Progress %f", reader.getProgress()));
        Assert.assertEquals(1.0d, reader.getProgress(), 0.0d);

        return count;
    }

    /**
     * Checks that a record reader yields the expected number of tuples
     * 
     * @param reader
     *            Initialized record reader
     * @param expected
     *            Expected number of tuples
     * @throws IOException
     * @throws InterruptedException
     */
    protected final void checkTuples(RecordReader<LongWritable, T> reader, int expected) throws IOException,
            InterruptedException {
        Assert.assertEquals(expected, this.countTuples(reader));
    }

    /**
     * Creates a fake job that uses the given input format and has each of the
     * given inputs added as an input path
     * 
     * @param config
     *            Configuration to create the job from
     * @param inputFormat
     *            Input format under test
     * @param inputs
     *            Inputs
     * @return Job
     * @throws IOException
     */
    private Job prepareJob(Configuration config, InputFormat<LongWritable, T> inputFormat, File... inputs)
            throws IOException {
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        for (File input : inputs) {
            this.addInputPath(input, job.getConfiguration(), job);
        }
        return job;
    }

    /**
     * Creates a job context for the job and checks that it has the expected
     * number of input paths registered
     * 
     * @param job
     *            Job
     * @param expectedInputPaths
     *            Expected number of input paths
     * @return Job context
     */
    private JobContext prepareContext(Job job, int expectedInputPaths) {
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
        Assert.assertEquals(expectedInputPaths, FileInputFormat.getInputPaths(context).length);
        return context;
    }

    /**
     * Obtains the splits from the input format and checks how many there are
     * 
     * @param inputFormat
     *            Input format under test
     * @param context
     *            Job context
     * @param expectedSplits
     *            Expected number of splits
     * @return Splits
     * @throws IOException
     * @throws InterruptedException
     */
    private List<InputSplit> checkSplits(InputFormat<LongWritable, T> inputFormat, JobContext context,
            int expectedSplits) throws IOException, InterruptedException {
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(expectedSplits, splits.size());
        return splits;
    }

    /**
     * Creates and initializes a record reader for a split using a fresh task
     * attempt context
     * 
     * @param inputFormat
     *            Input format under test
     * @param split
     *            Split to read
     * @param job
     *            Job whose configuration is used for the task attempt
     * @return Initialized record reader
     * @throws IOException
     * @throws InterruptedException
     */
    private RecordReader<LongWritable, T> openReader(InputFormat<LongWritable, T> inputFormat, InputSplit split,
            Job job) throws IOException, InterruptedException {
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
        RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext);
        reader.initialize(split, taskContext);
        return reader;
    }

    /**
     * Runs a test with a single input using a fresh configuration
     * 
     * @param input
     *            Input
     * @param expectedSplits
     *            Expected number of splits
     * @param expectedTuples
     *            Expected tuples, checked against each split individually
     * @throws IOException
     * @throws InterruptedException
     */
    protected final void testSingleInput(File input, int expectedSplits, int expectedTuples) throws IOException,
            InterruptedException {
        // Prepare configuration
        Configuration config = this.prepareConfiguration();
        this.testSingleInput(config, input, expectedSplits, expectedTuples);
    }

    /**
     * Runs a test with a single input
     * 
     * @param config
     *            Configuration
     * @param input
     *            Input
     * @param expectedSplits
     *            Expected number of splits
     * @param expectedTuples
     *            Expected tuples, checked against each split individually
     * @throws IOException
     * @throws InterruptedException
     */
    protected final void testSingleInput(Configuration config, File input, int expectedSplits, int expectedTuples)
            throws IOException, InterruptedException {
        // Set up fake job
        InputFormat<LongWritable, T> inputFormat = this.getInputFormat();
        Job job = this.prepareJob(config, inputFormat, input);
        JobContext context = this.prepareContext(job, 1);
        NLineInputFormat.setNumLinesPerSplit(job, LARGE_SIZE);

        // Check splits
        List<InputSplit> splits = this.checkSplits(inputFormat, context, expectedSplits);

        // Check tuples, note the expectation is applied to every split rather
        // than to the total across splits
        for (InputSplit split : splits) {
            this.checkTuples(this.openReader(inputFormat, split, job), expectedTuples);
        }
    }

    /**
     * Gets the input format to test
     * 
     * @return Input format
     */
    protected abstract InputFormat<LongWritable, T> getInputFormat();

    /**
     * Basic tuples input test using the empty input, no splits are expected
     * when inputs can be split and a single split otherwise
     * 
     * @throws IOException
     * @throws InterruptedException
     */
    @Test
    public final void single_input_01() throws IOException, InterruptedException {
        testSingleInput(empty, this.canSplitInputs() ? 0 : 1, EMPTY_SIZE);
    }

    /**
     * Basic tuples input test using the small input
     * 
     * @throws IOException
     * @throws InterruptedException
     */
    @Test
    public final void single_input_02() throws IOException, InterruptedException {
        testSingleInput(small, 1, SMALL_SIZE);
    }

    /**
     * Basic tuples input test using the large input
     * 
     * @throws IOException
     * @throws InterruptedException
     */
    @Test
    public final void single_input_03() throws IOException, InterruptedException {
        testSingleInput(large, 1, LARGE_SIZE);
    }

    /**
     * Basic tuples input test using the bad input, no tuples are expected
     * 
     * @throws IOException
     * @throws InterruptedException
     */
    @Test
    public final void single_input_04() throws IOException, InterruptedException {
        testSingleInput(bad, 1, 0);
    }

    /**
     * Basic tuples input test using the mixed input, the expected number of
     * tuples is given by {@link #single_input_05_expected()}
     * 
     * @throws IOException
     * @throws InterruptedException
     */
    @Test
    public final void single_input_05() throws IOException, InterruptedException {
        // JSON-LD overrides this because in JSON-LD parsing a bad document gives no triples.
        int expectedTuples = single_input_05_expected();
        testSingleInput(mixed, 1, expectedTuples);
    }

    /**
     * Results expected for test {@link #single_input_05}, by default half of
     * the mixed input
     * 
     * @return Expected number of tuples
     */
    protected int single_input_05_expected() {
        return MIXED_SIZE / 2;
    }

    /**
     * Tests behaviour when ignoring bad tuples is disabled, reading the bad
     * input must fail with an {@link IOException}
     * 
     * @throws InterruptedException
     * @throws IOException
     */
    @Test(expected = IOException.class)
    public final void fail_on_bad_input_01() throws IOException, InterruptedException {
        Configuration config = this.prepareConfiguration();
        config.setBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, false);
        Assert.assertFalse(config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true));
        testSingleInput(config, bad, 1, 0);
    }

    /**
     * Tests behaviour when ignoring bad tuples is disabled, reading the mixed
     * input must fail with an {@link IOException}
     * 
     * @throws InterruptedException
     * @throws IOException
     */
    @Test(expected = IOException.class)
    public final void fail_on_bad_input_02() throws IOException, InterruptedException {
        Configuration config = this.prepareConfiguration();
        config.setBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, false);
        Assert.assertFalse(config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true));
        // The test only passes if an IOException is thrown, so the tuple
        // count given here is nominal
        testSingleInput(config, mixed, 1, MIXED_SIZE / 2);
    }

    /**
     * Runs a multiple input test
     * 
     * @param inputs
     *            Inputs
     * @param expectedSplits
     *            Number of splits expected
     * @param expectedTuples
     *            Number of tuples expected in total across all splits
     * @throws IOException
     * @throws InterruptedException
     */
    protected final void testMultipleInputs(File[] inputs, int expectedSplits, int expectedTuples) throws IOException,
            InterruptedException {
        // Prepare configuration and inputs
        Configuration config = this.prepareConfiguration();

        // Set up fake job
        InputFormat<LongWritable, T> inputFormat = this.getInputFormat();
        Job job = this.prepareJob(config, inputFormat, inputs);
        JobContext context = this.prepareContext(job, inputs.length);
        NLineInputFormat.setNumLinesPerSplit(job, expectedTuples);

        // Check splits
        List<InputSplit> splits = this.checkSplits(inputFormat, context, expectedSplits);

        // Check tuples
        int count = 0;
        for (InputSplit split : splits) {
            count += this.countTuples(this.openReader(inputFormat, split, job));
        }
        Assert.assertEquals(expectedTuples, count);
    }

    /**
     * Tuples test with multiple inputs, using the empty, small and large
     * inputs
     * 
     * @throws IOException
     * @throws InterruptedException
     */
    @Test
    public final void multiple_inputs_01() throws IOException, InterruptedException {
        testMultipleInputs(new File[] { empty, small, large }, this.canSplitInputs() ? 2 : 3, EMPTY_SIZE + SMALL_SIZE
                + LARGE_SIZE);
    }

    /**
     * Tuples test with multiple inputs, using the temporary folder itself as
     * the single input path so that every generated file is read
     * 
     * @throws IOException
     * @throws InterruptedException
     */
    @Test
    public final void multiple_inputs_02() throws IOException, InterruptedException {
        int expectedTriples = multiple_inputs_02_expected();
        testMultipleInputs(new File[] { folder.getRoot() }, this.canSplitInputs() ? 4 : 5, expectedTriples);
    }

    /**
     * Results expected for test multiple_inputs_02. JSON_LD has different
     * characteristics on bad documents. See {@link #single_input_05}.
     * 
     * @return Expected number of tuples
     */
    protected int multiple_inputs_02_expected() {
        return EMPTY_SIZE + SMALL_SIZE + LARGE_SIZE + (MIXED_SIZE / 2);
    }

    /**
     * Runs a split inputs test, every split produced must be accepted by
     * {@link #isValidSplit(InputSplit, Configuration)}
     * 
     * @param config
     *            Configuration
     * @param inputs
     *            Inputs
     * @param expectedSplits
     *            Number of splits expected
     * @param expectedTuples
     *            Number of tuples expected in total across all splits
     * @throws IOException
     * @throws InterruptedException
     */
    protected final void testSplitInputs(Configuration config, File[] inputs, int expectedSplits, int expectedTuples)
            throws IOException, InterruptedException {
        // Set up fake job
        InputFormat<LongWritable, T> inputFormat = this.getInputFormat();
        Job job = this.prepareJob(config, inputFormat, inputs);
        JobContext context = this.prepareContext(job, inputs.length);

        // Check splits
        List<InputSplit> splits = this.checkSplits(inputFormat, context, expectedSplits);

        // Check tuples
        int count = 0;
        for (InputSplit split : splits) {
            // Validate split, this is given the configuration passed to this
            // method rather than the job's configuration
            Assert.assertTrue(this.isValidSplit(split, config));

            // Read split
            count += this.countTuples(this.openReader(inputFormat, split, job));
        }
        Assert.assertEquals(expectedTuples, count);
    }

    /**
     * Determines whether an input split is valid, by default any
     * {@link FileSplit} is considered valid
     * 
     * @param split
     *            Input split
     * @param config
     *            Configuration, not used by the default implementation
     * @return True if a valid split, false otherwise
     */
    protected boolean isValidSplit(InputSplit split, Configuration config) {
        return split instanceof FileSplit;
    }

    /**
     * Indicates whether inputs can be split, defaults to true
     * 
     * @return Whether inputs can be split
     */
    protected boolean canSplitInputs() {
        return true;
    }

    /**
     * Tests for input splitting, without configuring a lines per split value
     * the small input is expected to yield 100 splits
     * 
     * @throws IOException
     * @throws InterruptedException
     */
    @Test
    public final void split_input_01() throws IOException, InterruptedException {
        Assume.assumeTrue(this.canSplitInputs());

        Configuration config = this.prepareConfiguration();
        config.setBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, false);
        Assert.assertEquals(Integer.MAX_VALUE, config.getInt(HadoopIOConstants.MAX_LINE_LENGTH, Integer.MAX_VALUE));
        this.testSplitInputs(config, new File[] { small }, 100, SMALL_SIZE);
    }

    /**
     * Tests for input splitting, with 10 lines per split the small input is
     * expected to yield 10 splits
     * 
     * @throws IOException
     * @throws InterruptedException
     */
    @Test
    public final void split_input_02() throws IOException, InterruptedException {
        Assume.assumeTrue(this.canSplitInputs());

        Configuration config = this.prepareConfiguration();
        config.setBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, false);
        config.setLong(NLineInputFormat.LINES_PER_MAP, 10);
        Assert.assertEquals(Integer.MAX_VALUE, config.getInt(HadoopIOConstants.MAX_LINE_LENGTH, Integer.MAX_VALUE));
        this.testSplitInputs(config, new File[] { small }, 10, SMALL_SIZE);
    }

    /**
     * Tests for input splitting, with 100 lines per split the large input is
     * expected to yield 100 splits
     * 
     * @throws IOException
     * @throws InterruptedException
     */
    @Test
    public final void split_input_03() throws IOException, InterruptedException {
        Assume.assumeTrue(this.canSplitInputs());

        Configuration config = this.prepareConfiguration();
        config.setBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, false);
        config.setLong(NLineInputFormat.LINES_PER_MAP, 100);
        Assert.assertEquals(Integer.MAX_VALUE, config.getInt(HadoopIOConstants.MAX_LINE_LENGTH, Integer.MAX_VALUE));
        this.testSplitInputs(config, new File[] { large }, 100, LARGE_SIZE);
    }
}