package com.yahoo.glimmer.indexing.preprocessor;
/*
* Copyright (c) 2012 Yahoo! Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
* See accompanying LICENSE file.
*/
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.semanticweb.yars.nx.Node;
import org.semanticweb.yars.nx.parser.NxParser;
import org.semanticweb.yars.nx.parser.ParseException;
/**
* Maps each input line containing a tuple of 3 or more elements to Key/Value
* pairs of the following form KEY VALUE "subject"
* "<predicate> <object> <context> ." "predicate "PREDICATE"
* "object" "OBJECT" "context" "CONTEXT"
*
* If the object is a literal no key/value with a value of "OBJECT" is written.
*
* Eg. for the tuple
* "<http://subject/> <http://predicate/> <http://object/> <http://context/> ."
*
* KEY VALUE http://subject/ <http://predicate/> <http://object/>
* <http://context/> . http://predicate/ PREDICATE http://object/ OBJECT
* http://context/ CONTEXT
*
*/
public class TuplesToResourcesMapper extends Mapper<LongWritable, Text, Text, Object> {
private static final Log LOG = LogFactory.getLog(TuplesToResourcesMapper.class);
private static final int MAX_NODES = 5; // Our Any23 extractions include a 5
// Literal which is the extractor
// used.
public static final String INCLUDE_CONTEXTS_KEY = "includeContexts";
public static final String EXTRA_RESOURCES = "extraResources";
enum Counters {
NX_PARSER_EXCEPTION, NX_PARSER_RETRY_EXCEPTION, LONG_TUPLE, LONG_TUPLES, SHORT_TUPLE, LONG_TUPLE_ELEMENT, INVALID_RESOURCE, UNEXPECTED_SUBJECT_TYPE, UNEXPECTED_PREDICATE_TYPE, UNEXPECTED_CONTEXT_TYPE, WRITTEN_RESOURCES_CACHE_HIT
}
public static enum TupleElementName {
SUBJECT, PREDICATE, OBJECT, CONTEXT;
}
private boolean includeContexts = true;
private StringBuilder predicateObjectContextDot = new StringBuilder();
private Tuple tuple = new Tuple();
private TupleFilter filter;
private String[] extraResources;
private InputSplit lastInputSplit;
public void setFilter(TupleFilter filter) {
this.filter = filter;
}
protected void setup(Mapper<LongWritable, Text, Text, Object>.Context context) throws java.io.IOException, InterruptedException {
Configuration conf = context.getConfiguration();
boolean includeContexts = conf.getBoolean(INCLUDE_CONTEXTS_KEY, true);
setIncludeContexts(includeContexts);
TupleFilter filter = TupleFilterSerializer.deserialize(conf);
if (filter != null) {
LOG.info("Using TupleFilter:\n" + filter.toString());
setFilter(filter);
} else {
LOG.info("No TupleFilter given. Processing all tuples.");
}
extraResources = conf.getStrings(EXTRA_RESOURCES);
};
public void setIncludeContexts(boolean includeContexts) {
this.includeContexts = includeContexts;
}
@Override
protected void map(LongWritable key, Text valueText, Mapper<LongWritable, Text, Text, Object>.Context context) throws java.io.IOException,
InterruptedException {
if (extraResources != null && context.getTaskAttemptID().getTaskID().getId() == 0) {
// Add extra resources.
// These end up in the 'all' resources file so get given a Doc ID
// even if they don't occur in the data.
for (String extraResource : extraResources) {
context.write(new Text(extraResource), new Text(""));
}
extraResources = null;
}
if (!context.getInputSplit().equals(lastInputSplit)) {
lastInputSplit = context.getInputSplit();
if (lastInputSplit instanceof FileSplit) {
FileSplit fileSplit = (FileSplit) lastInputSplit;
LOG.info("Current FileSplit " + fileSplit.getPath().toString() + " start(length) bytes " + fileSplit.getStart() + "(" + fileSplit.getLength()
+ ")");
} else {
LOG.info("Current InputSplit " + lastInputSplit.toString());
}
}
String value = valueText.toString().trim();
if (value.isEmpty()) {
return;
}
Node[] nodes;
try {
nodes = NxParser.parseNodes(value);
} catch (ParseException e) {
// NxParser 1.2.2 has problems with typed literals like:
// "27"^^<int uri>. This is fixed in 1.2.3
context.getCounter(Counters.NX_PARSER_EXCEPTION).increment(1l);
String s = value.replaceAll("\\^\\^<[^>]+>", "");
try {
nodes = NxParser.parseNodes(s);
LOG.info("Only parsed after remove of literal types:" + value);
} catch (ParseException e1) {
context.getCounter(Counters.NX_PARSER_RETRY_EXCEPTION).increment(1l);
LOG.info("Failed parsing even after remove of literal types:" + value);
return;
}
}
if (nodes.length < 3) {
context.getCounter(Counters.SHORT_TUPLE).increment(1l);
LOG.info("Line parsed with less than 3 nodes at position" + key.toString());
return;
}
if (nodes.length > MAX_NODES) {
context.getCounter(Counters.LONG_TUPLE).increment(1l);
LOG.info("Line parsed with more than " + MAX_NODES + " nodes at position" + key.toString());
return;
}
for (TupleElementName name : TupleElementName.values()) {
TupleElement element = tuple.getElement(name);
if (nodes.length > name.ordinal()) {
Node node = nodes[name.ordinal()];
String text = node.toString();
if (text.length() > 5000) {
System.out.println("Long tuple element " + name.name() + ". Length:" + text.length() + " starting with " + text.substring(0, 100));
context.getCounter(Counters.LONG_TUPLE_ELEMENT).increment(1);
return;
}
element.type = TupleElement.Type.valueOf(node.getClass().getSimpleName().toUpperCase());
if (element.type == TupleElement.Type.RESOURCE) {
try {
new URI(text);
} catch (URISyntaxException e) {
context.getCounter(Counters.INVALID_RESOURCE).increment(1l);
LOG.info("Bad resource near position " + key.toString());
return;
}
}
element.text = text;
element.n3 = node.toN3();
} else {
element.type = null;
element.text = null;
element.n3 = null;
}
}
if (filter != null) {
if (!filter.filter(tuple)) {
// Skip tuple.
return;
}
}
predicateObjectContextDot.setLength(0);
if (!tuple.subject.isOfType(TupleElement.Type.RESOURCE, TupleElement.Type.BNODE)) {
context.getCounter(Counters.UNEXPECTED_SUBJECT_TYPE).increment(1l);
return;
}
Text subject = new Text(tuple.subject.text);
if (!tuple.predicate.isOfType(TupleElement.Type.RESOURCE)) {
context.getCounter(Counters.UNEXPECTED_PREDICATE_TYPE).increment(1l);
return;
}
context.write(new Text(tuple.predicate.text), new Text(TupleElementName.PREDICATE.name()));
predicateObjectContextDot.append(tuple.predicate.n3);
if (tuple.object.isOfType(TupleElement.Type.RESOURCE, TupleElement.Type.BNODE)) {
context.write(new Text(tuple.object.text), new Text(TupleElementName.OBJECT.name()));
}
predicateObjectContextDot.append(' ');
predicateObjectContextDot.append(tuple.object.n3);
if (includeContexts && tuple.context.text != null) {
if (tuple.context.isOfType(TupleElement.Type.RESOURCE)) {
context.write(new Text(tuple.context.text), new Text(TupleElementName.CONTEXT.name()));
predicateObjectContextDot.append(' ');
predicateObjectContextDot.append(tuple.context.n3);
} else {
context.getCounter(Counters.UNEXPECTED_CONTEXT_TYPE).increment(1l);
}
}
predicateObjectContextDot.append(" .");
if (predicateObjectContextDot.length() > 10000) {
System.out.println("Long tuple. Length:" + predicateObjectContextDot.length() + " starting with " + predicateObjectContextDot.substring(0, 100));
context.getCounter(Counters.LONG_TUPLES).increment(1);
} else {
// Write subject with predicate, object, context as value
context.write(subject, new Text(predicateObjectContextDot.toString()));
}
}
}