/* Copyright 2013-2014 Fabian Steeg, hbz. Licensed under the Eclipse Public License 1.0 */
package org.lobid.lodmill.hadoop;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Properties;
import java.util.Scanner;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.lobid.lodmill.JsonLdConverter.Format;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Joiner;
import com.hp.hpl.jena.graph.Triple;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
/**
* Collect subjects required for details on lobid triples.
*
* Maps lobid subject URIs to the subjects required to acquire details on the
* lobid triples, i.e. URIs and blank nodes used in the object position of
* triples with lobid URIs in the subject position.
*
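* For example (hypothetical data), given a triple such as
* {@code <http://lobid.org/resource/A> <p> <http://d-nb.info/gnd/1> .}, where
* {@code <p>} stands for one of the predicates configured under 'resolve' in
* resolve.properties, the object URI is emitted as key with the lobid subject
* URI as value, so the reducer can join all subjects that reference the same
* object.
*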
* @author Fabian Steeg (fsteeg)
*/
public class CollectSubjects implements Tool {
private static final int REDUCERS = 1;
private static final Logger LOG = LoggerFactory
.getLogger(CollectSubjects.class);
static final Configuration MAP_FILE_CONFIG = new Configuration();
static final String PREFIX_KEY = "target.subject.prefix";
private static final Properties PROPERTIES = load();
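// resolve.properties is expected to provide semicolon-separated values for the
// keys read below ('resolve', 'predicates', 'parents'), e.g. a hypothetical
// entry: resolve=http://purl.org/dc/terms/creator;http://purl.org/dc/terms/contributor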
static final Set<String> TO_RESOLVE = props("resolve");
static final Set<String> PREDICATES = props("predicates");
static final Set<String> PARENTS = props("parents");
private static Properties load() {
final Properties props = new Properties();
try {
props.load(Thread.currentThread().getContextClassLoader()
.getResourceAsStream("resolve.properties"));
} catch (IOException e) {
LOG.error(e.getMessage(), e);
}
return props;
}
private static SortedSet<String> props(final String key) {
return new TreeSet<>(Arrays.asList(PROPERTIES.getProperty(key).split(";")));
}
/**
* @param args Generic command-line arguments passed to {@link ToolRunner}.
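*          A hypothetical invocation (jar name, paths, prefix and index name
*          are placeholders): {@code hadoop jar lodmill.jar
*          org.lobid.lodmill.hadoop.CollectSubjects hdfs:///input
*          hdfs:///output http://lobid.org/resource lobid-resources}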
*/
public static void main(final String[] args) {
try {
final int res = ToolRunner.run(new CollectSubjects(), args);
System.exit(res);
} catch (Exception e) {
LOG.error(e.getMessage(), e);
System.exit(1);
}
}
private Configuration conf;
@Override
public int run(String[] args) throws Exception {
if (args.length != 4) {
System.err.println(
"Usage: CollectSubjects <input path> <output path> <target subject prefix> <index name>");
return -1;
}
final String mapFileName = mapFileName(args[3]);
conf.setStrings("mapred.textoutputformat.separator", " ");
conf.setStrings("target.subject.prefix", args[2]);
conf.setStrings("map.file.name", mapFileName);
final Job job = Job.getInstance(conf);
job.setNumReduceTasks(REDUCERS);
job.setJarByClass(CollectSubjects.class);
job.setJobName("CollectSubjects");
FileInputFormat.addInputPaths(job, args[0]);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(CollectSubjectsMapper.class);
job.setReducerClass(CollectSubjectsReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
boolean success = job.waitForCompletion(true);
if (success) {
asMapFile(FileSystem.get(conf), //
new Path(args[1] + "/part-r-00000"),
new Path(conf.get("map.file.name")));
}
return success ? 0 : 1;
}
static void asMapFile(final FileSystem fs, final Path subjectMappingsPath,
final Path mapFilePath) throws IOException {
final Path mapData = fs.makeQualified(subjectMappingsPath);
final Path mapFile = fs.makeQualified(mapFilePath);
writeToMapFile(fs, mapData, mapFile);
LOG.info("Wrote map data to: " + mapFile);
}
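// MapFile.Writer requires keys to be appended in sorted order; the single
// reducer's part-r-00000 output is already sorted by key, so its lines can be
// appended as they are read.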
private static void writeToMapFile(final FileSystem fs,
final Path subjectMappingsPath, final Path mapFilePath)
throws IOException {
try (final MapFile.Writer writer =
new MapFile.Writer(fs.getConf(), mapFilePath,
MapFile.Writer.keyClass(Text.class),
MapFile.Writer.valueClass(Text.class),
MapFile.Writer.compression(CompressionType.NONE));
final InputStream inputStream = fs.open(subjectMappingsPath);
final Scanner scanner = new Scanner(inputStream)) {
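// each line of the reduce output has the form "<key> <comma-joined values>",
// separated by the single space configured as the output separator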
while (scanner.hasNextLine()) {
final String[] subjectAndValues = scanner.nextLine().split(" ");
writer.append(//
new Text(subjectAndValues[0].trim()),//
new Text(subjectAndValues[1].trim()));
}
}
}
/**
* Collect the object IDs required to acquire details on a subject.
*
* @author Fabian Steeg (fsteeg)
*/
static final class CollectSubjectsMapper extends
Mapper<LongWritable, Text, Text, Text> {
private String prefix;
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
prefix = context.getConfiguration().get(PREFIX_KEY);
}
@Override
public void map(final LongWritable key, final Text value,
final Context context) throws IOException, InterruptedException {
final String val = value.toString().trim();
if (val.isEmpty())
return;
final Triple triple = asTriple(val);
if (shouldProcess(triple)) {
final String subject = getSubject(val, triple, context.getInputSplit());
final String object = getObject(val, triple, context.getInputSplit());
LOG.debug(String.format(
"Collecting ID found in object position (%s) of subject (%s)",
object, subject));
context.write(new Text(object), new Text(subject));
}
}
private boolean shouldProcess(final Triple triple) {
return triple != null
&& triple.getSubject().isURI()
&& triple.getSubject().toString()
.startsWith(prefix == null ? "" : prefix)
&& !triple.getSubject().toString().endsWith("/about")
&& TO_RESOLVE.contains(triple.getPredicate().toString())
&& (triple.getObject().isBlank() || triple.getObject().isURI());
}
private static String getSubject(final String val, final Triple triple,
final InputSplit inputSplit) {
return triple.getSubject().isBlank() ? blankSubjectLabel(val, inputSplit)
: triple.getSubject().toString();
}
private static String getObject(final String val, final Triple triple,
final InputSplit inputSplit) {
return triple.getObject().isBlank() ? blankObjectLabel(val, inputSplit)
: triple.getObject().toString();
}
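/** Parse a single N-Triples line into a Jena triple, or null if unparsable. */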
static Triple asTriple(final String val) {
try {
final Model model = ModelFactory.createDefaultModel();
model.read(new StringReader(val), null, Format.N_TRIPLE.getName());
return model.getGraph().find(Triple.ANY).next();
} catch (com.hp.hpl.jena.shared.SyntaxError e) {
LOG.warn(String.format("Could not parse triple '%s': %s, skipping",
val, e.getMessage()));
} catch (java.util.NoSuchElementException e1) {
LOG.warn(String.format("No triple '%s': %s, skipping", val,
e1.getMessage()));
}
return null;
}
}
/**
* Join the subjects required for details under the main subject.
*
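* For example (with hypothetical URIs), a key like {@code http://d-nb.info/gnd/1}
* with the values {@code http://lobid.org/resource/A} and
* {@code http://lobid.org/resource/B} results in a single output line with the
* values joined by commas.
*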
* @author Fabian Steeg (fsteeg)
*/
static final class CollectSubjectsReducer extends
Reducer<Text, Text, Text, Text> {
@Override
public void reduce(final Text key, final Iterable<Text> values,
final Context context) throws IOException, InterruptedException {
context.write(key, new Text(Joiner.on(",").join(values)));
}
}
@Override
public Configuration getConf() {
return conf;
}
@Override
public void setConf(Configuration conf) {
this.conf = conf;
}
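// Blank node labels are only unique within a single input file, so they are
// suffixed with the file's path (with the leading "/user/<name>/" stripped),
// e.g. a hypothetical "_:b0" in "/user/hduser/input/part.nt" becomes
// "_:b0:input/part.nt".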
static String blankSubjectLabel(final String val, final InputSplit inputSplit) {
return val.substring(val.indexOf("_:"), val.indexOf(" ")).trim()
+ createBlankNodeSuffix(inputSplit);
}
static String blankObjectLabel(final String val, final InputSplit inputSplit) {
return val.substring(val.lastIndexOf("_:"), val.lastIndexOf(".")).trim()
+ createBlankNodeSuffix(inputSplit);
}
private static String createBlankNodeSuffix(final InputSplit inputSplit) {
return ":"
+ ((FileSplit) inputSplit).getPath().toUri().getPath()
.replaceAll("/user/[^/]+/", "");
}
/**
* @param prefix The prefix to use for making the map file name unique
* @return A file name for the map file, with the given prefix
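*         (for example, a hypothetical call {@code mapFileName("lobid-org")}
*         returns {@code "lobid-org-subjects.map"})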
*/
public static String mapFileName(String prefix) {
return prefix + "-subjects.map";
}
}