/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.hadoop.rdf.stats.jobs;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.jena.hadoop.rdf.io.input.QuadsInputFormat;
import org.apache.jena.hadoop.rdf.io.input.TriplesInputFormat;
import org.apache.jena.hadoop.rdf.io.input.TriplesOrQuadsInputFormat;
import org.apache.jena.hadoop.rdf.io.input.nquads.NQuadsInputFormat;
import org.apache.jena.hadoop.rdf.io.input.ntriples.NTriplesInputFormat;
import org.apache.jena.hadoop.rdf.io.output.nquads.NQuadsOutputFormat;
import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesNodeOutputFormat;
import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesOutputFormat;
import org.apache.jena.hadoop.rdf.mapreduce.KeyMapper;
import org.apache.jena.hadoop.rdf.mapreduce.RdfMapReduceConstants;
import org.apache.jena.hadoop.rdf.mapreduce.TextCountReducer;
import org.apache.jena.hadoop.rdf.mapreduce.characteristics.CharacteristicSetReducer;
import org.apache.jena.hadoop.rdf.mapreduce.characteristics.QuadCharacteristicSetGeneratingReducer;
import org.apache.jena.hadoop.rdf.mapreduce.characteristics.TripleCharacteristicSetGeneratingReducer;
import org.apache.jena.hadoop.rdf.mapreduce.count.NodeCountReducer;
import org.apache.jena.hadoop.rdf.mapreduce.count.QuadNodeCountMapper;
import org.apache.jena.hadoop.rdf.mapreduce.count.TripleNodeCountMapper;
import org.apache.jena.hadoop.rdf.mapreduce.count.datatypes.QuadDataTypeCountMapper;
import org.apache.jena.hadoop.rdf.mapreduce.count.datatypes.TripleDataTypeCountMapper;
import org.apache.jena.hadoop.rdf.mapreduce.count.namespaces.QuadNamespaceCountMapper;
import org.apache.jena.hadoop.rdf.mapreduce.count.namespaces.TripleNamespaceCountMapper;
import org.apache.jena.hadoop.rdf.mapreduce.count.positional.QuadGraphCountMapper;
import org.apache.jena.hadoop.rdf.mapreduce.count.positional.QuadObjectCountMapper;
import org.apache.jena.hadoop.rdf.mapreduce.count.positional.TripleObjectCountMapper;
import org.apache.jena.hadoop.rdf.mapreduce.filter.positional.QuadFilterByPredicateMapper;
import org.apache.jena.hadoop.rdf.mapreduce.filter.positional.TripleFilterByPredicateUriMapper;
import org.apache.jena.hadoop.rdf.mapreduce.group.QuadGroupBySubjectMapper;
import org.apache.jena.hadoop.rdf.mapreduce.group.TripleGroupBySubjectMapper;
import org.apache.jena.hadoop.rdf.mapreduce.transform.TriplesToQuadsConstantGraphMapper;
import org.apache.jena.hadoop.rdf.types.CharacteristicSetWritable;
import org.apache.jena.hadoop.rdf.types.NodeWritable;
import org.apache.jena.hadoop.rdf.types.QuadWritable;
import org.apache.jena.hadoop.rdf.types.TripleWritable;
import org.apache.jena.vocabulary.RDF;

/**
 * Factory that can produce {@link Job} instances for computing various RDF
 * statistics
 */
public class JobFactory {

    /**
     * Private constructor prevents instantiation
     */
    private JobFactory() {
    }

    /**
     * Gets a job for computing node counts on RDF triple inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param outputPath
     *            Output path
     * @return Job
     * @throws IOException
     */
    public static Job getTripleNodeCountJob(Configuration config, String[] inputPaths, String outputPath)
            throws IOException {
        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Triples Node Usage Count");

        // Map/Reduce classes
        job.setMapperClass(TripleNodeCountMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(NodeCountReducer.class);

        // Input and Output
        job.setInputFormatClass(TriplesInputFormat.class);
        job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job;
    }

    /**
     * Gets a job for computing node counts on RDF quad inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param outputPath
     *            Output path
     * @return Job
     * @throws IOException
     */
    public static Job getQuadNodeCountJob(Configuration config, String[] inputPaths, String outputPath)
            throws IOException {
        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Quads Node Usage Count");

        // Map/Reduce classes
        job.setMapperClass(QuadNodeCountMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(NodeCountReducer.class);

        // Input and Output
        job.setInputFormatClass(QuadsInputFormat.class);
        job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job;
    }

    /**
     * Gets a job for computing node counts on RDF triple and/or quad inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param outputPath
     *            Output path
     * @return Job
     * @throws IOException
     */
    public static Job getNodeCountJob(Configuration config, String[] inputPaths, String outputPath)
            throws IOException {
        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Node Usage Count");

        // Map/Reduce classes
        job.setMapperClass(QuadNodeCountMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(NodeCountReducer.class);

        // Input and Output
        job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
        job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
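
    /*
     * Illustrative usage sketch (not part of the original class): a driver
     * might obtain one of the above jobs and run it like so; the paths used
     * here are hypothetical placeholders.
     *
     * Configuration config = new Configuration();
     * Job job = JobFactory.getNodeCountJob(config, new String[] { "/data/rdf" }, "/data/node-counts");
     * boolean succeeded = job.waitForCompletion(true);
     */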
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job;
    }

    /**
     * Gets a job for computing graph sizes on RDF triple inputs. Triples carry
     * no graph information so they are first mapped into quads in a constant
     * graph and graph node usage is then counted.
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param outputPath
     *            Output path
     * @return Job
     * @throws IOException
     */
    public static Job getTripleGraphSizesJob(Configuration config, String[] inputPaths, String outputPath)
            throws IOException {
        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Triples Graph Sizes");

        // Map/Reduce classes
        // First place each triple into a constant graph, then count usages of
        // the graph nodes on the resulting quads
        ChainMapper.addMapper(job, TriplesToQuadsConstantGraphMapper.class, LongWritable.class, TripleWritable.class,
                LongWritable.class, QuadWritable.class, config);
        ChainMapper.addMapper(job, QuadGraphCountMapper.class, LongWritable.class, QuadWritable.class,
                NodeWritable.class, LongWritable.class, config);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(NodeCountReducer.class);

        // Input and Output
        job.setInputFormatClass(TriplesInputFormat.class);
        job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job;
    }

    /**
     * Gets a job for computing graph sizes on RDF quad inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param outputPath
     *            Output path
     * @return Job
     * @throws IOException
     */
    public static Job getQuadGraphSizesJob(Configuration config, String[] inputPaths, String outputPath)
            throws IOException {
        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Quads Graph Sizes");

        // Map/Reduce classes
        job.setMapperClass(QuadGraphCountMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(NodeCountReducer.class);

        // Input and Output
        job.setInputFormatClass(QuadsInputFormat.class);
        job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job;
    }

    /**
     * Gets a job for computing graph sizes on RDF triple and/or quad inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param outputPath
     *            Output path
     * @return Job
     * @throws IOException
     */
    public static Job getGraphSizesJob(Configuration config, String[] inputPaths, String outputPath)
            throws IOException {
        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Graph Sizes");

        // Map/Reduce classes
        job.setMapperClass(QuadGraphCountMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(NodeCountReducer.class);

        // Input and Output
        job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
        job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job;
    }
    /**
     * Gets a sequence of jobs that can be used to compute characteristic sets
     * for RDF triples
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param intermediateOutputPath
     *            Intermediate output path
     * @param outputPath
     *            Final output path
     * @return Sequence of jobs
     * @throws IOException
     */
    public static Job[] getTripleCharacteristicSetJobs(Configuration config, String[] inputPaths,
            String intermediateOutputPath, String outputPath) throws IOException {
        Job[] jobs = new Job[2];

        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Triples Characteristic Set (Generation)");

        // Map/Reduce classes
        job.setMapperClass(TripleGroupBySubjectMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(TripleWritable.class);
        job.setReducerClass(TripleCharacteristicSetGeneratingReducer.class);
        job.setOutputKeyClass(CharacteristicSetWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Input and Output
        job.setInputFormatClass(TriplesInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
        SequenceFileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        jobs[0] = job;

        job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Triples Characteristic Set (Reduction)");

        // Map/Reduce classes
        job.setMapperClass(KeyMapper.class);
        job.setMapOutputKeyClass(CharacteristicSetWritable.class);
        job.setMapOutputValueClass(CharacteristicSetWritable.class);
        job.setReducerClass(CharacteristicSetReducer.class);
        job.setOutputKeyClass(CharacteristicSetWritable.class);
        job.setOutputValueClass(CharacteristicSetWritable.class);

        // Input and Output
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, intermediateOutputPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        jobs[1] = job;

        return jobs;
    }

    /**
     * Gets a sequence of jobs that can be used to compute characteristic sets
     * for RDF quads
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param intermediateOutputPath
     *            Intermediate output path
     * @param outputPath
     *            Final output path
     * @return Sequence of jobs
     * @throws IOException
     */
    public static Job[] getQuadCharacteristicSetJobs(Configuration config, String[] inputPaths,
            String intermediateOutputPath, String outputPath) throws IOException {
        Job[] jobs = new Job[2];

        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Quads Characteristic Set (Generation)");

        // Map/Reduce classes
        job.setMapperClass(QuadGroupBySubjectMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(QuadWritable.class);
        job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
        job.setOutputKeyClass(CharacteristicSetWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Input and Output
        job.setInputFormatClass(QuadsInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
        SequenceFileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        jobs[0] = job;

        job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Quads Characteristic Set (Reduction)");

        // Map/Reduce classes
        job.setMapperClass(KeyMapper.class);
        job.setMapOutputKeyClass(CharacteristicSetWritable.class);
        job.setMapOutputValueClass(CharacteristicSetWritable.class);
        job.setReducerClass(CharacteristicSetReducer.class);
        job.setOutputKeyClass(CharacteristicSetWritable.class);
        job.setOutputValueClass(CharacteristicSetWritable.class);

        // Input and Output
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, intermediateOutputPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        jobs[1] = job;

        return jobs;
    }
    /**
     * Gets a sequence of jobs that can be used to compute characteristic sets
     * for RDF triple and/or quad inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param intermediateOutputPath
     *            Intermediate output path
     * @param outputPath
     *            Final output path
     * @return Sequence of jobs
     * @throws IOException
     */
    public static Job[] getCharacteristicSetJobs(Configuration config, String[] inputPaths,
            String intermediateOutputPath, String outputPath) throws IOException {
        Job[] jobs = new Job[2];

        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Characteristic Set (Generation)");

        // Map/Reduce classes
        job.setMapperClass(QuadGroupBySubjectMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(QuadWritable.class);
        job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class);
        job.setOutputKeyClass(CharacteristicSetWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Input and Output
        job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
        SequenceFileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        jobs[0] = job;

        job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Characteristic Set (Reduction)");

        // Map/Reduce classes
        job.setMapperClass(KeyMapper.class);
        job.setMapOutputKeyClass(CharacteristicSetWritable.class);
        job.setMapOutputValueClass(CharacteristicSetWritable.class);
        job.setReducerClass(CharacteristicSetReducer.class);
        job.setOutputKeyClass(CharacteristicSetWritable.class);
        job.setOutputValueClass(CharacteristicSetWritable.class);

        // Input and Output
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, intermediateOutputPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        jobs[1] = job;

        return jobs;
    }
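
    /*
     * Illustrative sketch (assumed driver code): the methods that return Job[]
     * build a two-stage pipeline in which the second job consumes the first
     * job's intermediate output, so the jobs must be run sequentially and in
     * order. The paths used here are hypothetical placeholders.
     *
     * Job[] jobs = JobFactory.getCharacteristicSetJobs(config, inputs, "/tmp/intermediate", "/data/char-sets");
     * for (Job job : jobs) {
     *     if (!job.waitForCompletion(true)) {
     *         throw new IOException("Job " + job.getJobName() + " failed");
     *     }
     * }
     */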
    /**
     * Gets a sequence of jobs that can be used to compute type counts on RDF
     * triple inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param intermediateOutputPath
     *            Path for intermediate output which will be all the type
     *            declaration triples present in the inputs
     * @param outputPath
     *            Output path
     * @return Sequence of jobs
     * @throws IOException
     */
    public static Job[] getTripleTypeCountJobs(Configuration config, String[] inputPaths,
            String intermediateOutputPath, String outputPath) throws IOException {
        Job[] jobs = new Job[2];

        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Type Triples Extraction");

        // Map/Reduce classes
        job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI());
        job.setMapperClass(TripleFilterByPredicateUriMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(TripleWritable.class);

        // Input and Output Format
        job.setInputFormatClass(TriplesInputFormat.class);
        job.setOutputFormatClass(NTriplesOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
        jobs[0] = job;

        // Object Node Usage count job
        job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Triples Type Usage Count");

        // Map/Reduce classes
        job.setMapperClass(TripleObjectCountMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(NodeCountReducer.class);

        // Input and Output
        job.setInputFormatClass(NTriplesInputFormat.class);
        // TODO Would be better if this was intelligently configured
        NLineInputFormat.setNumLinesPerSplit(job, 10000);
        job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
        FileInputFormat.setInputPaths(job, intermediateOutputPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        jobs[1] = job;

        return jobs;
    }

    /**
     * Gets a sequence of jobs that can be used to compute type counts on RDF
     * quad inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param intermediateOutputPath
     *            Path for intermediate output which will be all the type
     *            declaration quads present in the inputs
     * @param outputPath
     *            Output path
     * @return Sequence of jobs
     * @throws IOException
     */
    public static Job[] getQuadTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
            String outputPath) throws IOException {
        Job[] jobs = new Job[2];

        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Type Quads Extraction");

        // Map/Reduce classes
        job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI());
        job.setMapperClass(QuadFilterByPredicateMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(QuadWritable.class);

        // Input and Output Format
        job.setInputFormatClass(QuadsInputFormat.class);
        job.setOutputFormatClass(NQuadsOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
        jobs[0] = job;

        // Object Node Usage count job
        job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Quads Type Usage Count");

        // Map/Reduce classes
        job.setMapperClass(QuadObjectCountMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(NodeCountReducer.class);

        // Input and Output
        job.setInputFormatClass(NQuadsInputFormat.class);
        // TODO Would be better if this was intelligently configured
        NLineInputFormat.setNumLinesPerSplit(job, 10000);
        job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
        FileInputFormat.setInputPaths(job, intermediateOutputPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        jobs[1] = job;

        return jobs;
    }
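
    /*
     * Note on the hard-coded 10000 lines-per-split value used above: as the
     * TODO comments say, this would ideally be configured intelligently. A
     * driver that knows its data can override it on the counting job after
     * obtaining the job array (an assumed tuning step, not something this
     * class performs):
     *
     * Job[] jobs = JobFactory.getQuadTypeCountJobs(config, inputs, "/tmp/types", "/data/type-counts");
     * NLineInputFormat.setNumLinesPerSplit(jobs[1], 50000);
     */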
    /**
     * Gets a sequence of jobs that can be used to compute type counts on RDF
     * triple and/or quad inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param intermediateOutputPath
     *            Path for intermediate output which will be all the type
     *            declaration quads present in the inputs
     * @param outputPath
     *            Output path
     * @return Sequence of jobs
     * @throws IOException
     */
    public static Job[] getTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath,
            String outputPath) throws IOException {
        Job[] jobs = new Job[2];

        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Type Extraction");

        // Map/Reduce classes
        job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI());
        job.setMapperClass(QuadFilterByPredicateMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(QuadWritable.class);

        // Input and Output Format
        job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
        job.setOutputFormatClass(NQuadsOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
        jobs[0] = job;

        // Object Node Usage count job
        job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Type Usage Count");

        // Map/Reduce classes
        job.setMapperClass(QuadObjectCountMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(NodeCountReducer.class);

        // Input and Output
        job.setInputFormatClass(NQuadsInputFormat.class);
        // TODO Would be better if this was intelligently configured
        NLineInputFormat.setNumLinesPerSplit(job, 10000);
        job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
        FileInputFormat.setInputPaths(job, intermediateOutputPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        jobs[1] = job;

        return jobs;
    }

    /**
     * Gets a job for computing literal data type counts on RDF triple inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param outputPath
     *            Output path
     * @return Job
     * @throws IOException
     */
    public static Job getTripleDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath)
            throws IOException {
        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Triples Literal Data Type Usage Count");

        // Map/Reduce classes
        job.setMapperClass(TripleDataTypeCountMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(NodeCountReducer.class);

        // Input and Output
        job.setInputFormatClass(TriplesInputFormat.class);
        job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job;
    }

    /**
     * Gets a job for computing literal data type counts on RDF quad inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param outputPath
     *            Output path
     * @return Job
     * @throws IOException
     */
    public static Job getQuadDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath)
            throws IOException {
        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Quads Literal Data Type Usage Count");

        // Map/Reduce classes
        job.setMapperClass(QuadDataTypeCountMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(NodeCountReducer.class);

        // Input and Output
        job.setInputFormatClass(QuadsInputFormat.class);
        job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job;
    }
    /**
     * Gets a job for computing literal data type counts on RDF triple and/or
     * quad inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param outputPath
     *            Output path
     * @return Job
     * @throws IOException
     */
    public static Job getDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath)
            throws IOException {
        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Literal Data Type Usage Count");

        // Map/Reduce classes
        job.setMapperClass(QuadDataTypeCountMapper.class);
        job.setMapOutputKeyClass(NodeWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(NodeCountReducer.class);

        // Input and Output
        job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
        job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job;
    }

    /**
     * Gets a job for computing namespace counts on RDF triple inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param outputPath
     *            Output path
     * @return Job
     * @throws IOException
     */
    public static Job getTripleNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath)
            throws IOException {
        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Triples Namespace Usage Count");

        // Map/Reduce classes
        job.setMapperClass(TripleNamespaceCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(TextCountReducer.class);

        // Input and Output
        job.setInputFormatClass(TriplesInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job;
    }

    /**
     * Gets a job for computing namespace counts on RDF quad inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param outputPath
     *            Output path
     * @return Job
     * @throws IOException
     */
    public static Job getQuadNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath)
            throws IOException {
        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Quads Namespace Usage Count");

        // Map/Reduce classes
        job.setMapperClass(QuadNamespaceCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(TextCountReducer.class);

        // Input and Output
        job.setInputFormatClass(QuadsInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job;
    }

    /**
     * Gets a job for computing namespace counts on RDF triple and/or quad
     * inputs
     * 
     * @param config
     *            Configuration
     * @param inputPaths
     *            Input paths
     * @param outputPath
     *            Output path
     * @return Job
     * @throws IOException
     */
    public static Job getNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath)
            throws IOException {
        Job job = Job.getInstance(config);
        job.setJarByClass(JobFactory.class);
        job.setJobName("RDF Namespace Usage Count");

        // Map/Reduce classes
        job.setMapperClass(QuadNamespaceCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(TextCountReducer.class);

        // Input and Output
        job.setInputFormatClass(TriplesOrQuadsInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        return job;
    }
}