package com.yahoo.glimmer.indexing.preprocessor;
/*
* Copyright (c) 2012 Yahoo! Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
* See accompanying LICENSE file.
*/

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.semanticweb.owlapi.model.OWLClass;
import org.semanticweb.owlapi.model.OWLOntology;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import com.yahoo.glimmer.indexing.OntologyLoader;
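
/**
 * Hadoop {@link Tool} that pre-processes RDF tuples into per-resource records for the
 * rest of the Glimmer indexing pipeline.
 *
 * A typical invocation might look like the following (the jar name and HDFS paths are
 * only illustrative):
 *
 * <pre>
 * hadoop jar glimmer-indexing.jar com.yahoo.glimmer.indexing.preprocessor.PrepTool \
 *     -C -O ontology.owl -r 1 /user/me/tuples.nq /user/me/prep-out
 * </pre>
 *
 * -C excludes contexts, -O names an optional OWL ontology file, -r sets the number of
 * reducers, and the two unflagged arguments are the HDFS input and output locations.
 */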
public class PrepTool extends Configured implements Tool {
private static final int DEFAULT_REDUCER_COUNT = 1;
public static final String NO_CONTEXTS_ARG = "excludeContexts";
private static final String ONTOLOGY_ARG = "ontology";
private static final String REDUCER_COUNT_ARG = "reducers";
private static final String OUTPUT_ARG = "output";
private static final String INPUT_ARG = "input";
public static void main(String[] args) throws Exception {
int ret = ToolRunner.run(new PrepTool(), args);
System.exit(ret);
}

@Override
public int run(String[] args) throws Exception {
SimpleJSAP jsap = new SimpleJSAP(PrepTool.class.getName(), "RDF tuples pre-processor for Glimmer", new Parameter[] {
new Switch(NO_CONTEXTS_ARG, 'C', NO_CONTEXTS_ARG, "Don't process the contexts for each tuple."),
new FlaggedOption(ONTOLOGY_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'O', ONTOLOGY_ARG),
new FlaggedOption(REDUCER_COUNT_ARG, JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'r', REDUCER_COUNT_ARG),
new UnflaggedOption(INPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED, "HDFS location for the input data."),
new UnflaggedOption(OUTPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED, "HDFS location for the output data."), });
JSAPResult jsapResult = jsap.parse(args);
if (!jsapResult.success()) {
System.err.print(jsap.getUsage());
System.exit(1);
}
Configuration config = getConf();
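// The -C / excludeContexts switch tells the mapper not to process the context of each
// tuple (see TuplesToResourcesMapper.INCLUDE_CONTEXTS_KEY).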
boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
config.setBoolean(TuplesToResourcesMapper.INCLUDE_CONTEXTS_KEY, withContexts);
// The ontology if any...
String ontologyFilename = jsapResult.getString(ONTOLOGY_ARG);
if (ontologyFilename != null) {
// Load the ontology
InputStream ontologyInputStream = new FileInputStream(ontologyFilename);
OWLOntology ontology = OntologyLoader.load(ontologyInputStream);
ontologyInputStream.close();
System.out.println("Loaded ontology from " + ontologyFilename + " with " + ontology.getAxiomCount() + " axioms.");
ArrayList<String> ontologyClasses = new ArrayList<String>();
for (OWLClass owlClass : ontology.getClassesInSignature()) {
ontologyClasses.add(owlClass.getIRI().toString());
}
System.out.println("Adding " + ontologyClasses.size() + " classes from ontology.");
config.setStrings(TuplesToResourcesMapper.EXTRA_RESOURCES, ontologyClasses.toArray(new String[0]));
} else {
System.out.println("No ontology filename set in conf. No ontology has been loaded.");
}
Job job = Job.getInstance(config);
job.setJarByClass(PrepTool.class);
job.setJobName(PrepTool.class.getName() + "-part1-" + System.currentTimeMillis());
job.setInputFormatClass(TextInputFormat.class);
job.setMapperClass(TuplesToResourcesMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
int reducerCount = jsapResult.getInt(REDUCER_COUNT_ARG, DEFAULT_REDUCER_COUNT);
job.setNumReduceTasks(reducerCount);
if (reducerCount == 1) {
// 'Global' ids are assigned in the reducer, which only works if there is
// exactly one reducer. The downside is that a single reducer can run out of
// local disk space during the pre-reduce merge with big data sets like WCC.
job.setReducerClass(ResourcesReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Object.class);
job.setOutputFormatClass(ResourceRecordWriter.OutputFormat.class);
} else {
/*
* TODO: Take the functionality of the reducer and move it to run on
* the gateway. We then use n identity reducers, the output of which
* will be read and merged as streams on the gateway.
*/
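// A possible sketch of that setup (untested): keep Hadoop's default identity Reducer
// and plain text output, then sort-merge the part files on the gateway, e.g.:
//   job.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class);
//   job.setOutputKeyClass(Text.class);
//   job.setOutputValueClass(Text.class);
//   job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);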
}
FileInputFormat.setInputPaths(job, new Path(jsapResult.getString(INPUT_ARG)));
Path outputDir = new Path(jsapResult.getString(OUTPUT_ARG));
FileOutputFormat.setOutputPath(job, outputDir);
if (!job.waitForCompletion(true)) {
System.err.println("Failed to process tuples from " + jsapResult.getString(INPUT_ARG));
return 1;
}
// IF THERE WAS ONLY ONE REDUCER WE NOW HAVE:
// - One file per reducer containing lists of URLs (resources) for subjects,
//   predicates, objects and contexts.
// - One file per reducer that contains all resources: subjects + predicates
//   + objects + contexts.
// - One file per reducer that contains the subjects + all <predicate>
//   <object>|"Literal" <context> on that subject.
// IF THERE WAS MORE THAN ONE REDUCER WE NOW HAVE N FILES THAT NEED TO BE
// MERGED ON THE GATEWAY. TODO.
return 0;
}
}