package com.ontology2.bakemono.freebasePrefilter;
import com.google.common.base.*;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.ontology2.bakemono.abstractions.Codec;
import com.ontology2.bakemono.primitiveTriples.*;
import com.ontology2.rdf.InvalidNodeException;
import com.ontology2.rdf.InvalidPrefixException;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.List;
import static com.google.common.base.Functions.*;
/**
 * Hadoop mapper that pre-filters the lines of an RDF dump, one triple per input line.
 *
 * <p>For each input line the mapper:
 * <ol>
 *   <li>counts and skips lines that start with {@code @prefix};</li>
 *   <li>splits the line into subject, predicate and object, expanding prefixed names
 *       with the prefix table filled in by {@link #setup};</li>
 *   <li>rewrites IRIs under {@code http://rdf.freebase.com/} to
 *       {@code http://rdf.basekb.com/};</li>
 *   <li>drops triples rejected by {@link #acceptTheseTriples()};</li>
 *   <li>applies {@link #tripleRewritingFunction()} to the survivors and writes them out
 *       keyed by subject.</li>
 * </ol>
 *
 * <p>Lines that cannot be parsed are logged, counted as {@code IGNORED} and skipped.
 */
public class FreebaseRDFMapper extends Mapper<LongWritable,Text,Text,Text> {
    private static final org.apache.commons.logging.Log logger = LogFactory.getLog(FreebaseRDFMapper.class);

    /** Opening of every IRI that {@link #rewriteNode} relocates (leading {@code <} included). */
    private static final String FREEBASE_IRI_START = "<http://rdf.freebase.com/";

    /** Replacement for {@link #FREEBASE_IRI_START} (leading {@code <} included). */
    private static final String BASEKB_IRI_START = "<http://rdf.basekb.com/";

    /** Splits a triple line on whitespace into at most three parts, skipping empty parts. */
    final static Splitter lineSplitter = Splitter.on(CharMatcher.WHITESPACE).omitEmptyStrings().limit(3);

    /** Splits a prefixed name at its first colon into prefix and local part. */
    final static Splitter iriSplitter = Splitter.on(":").limit(2);

    // The builder accumulates every declared prefix; prefixMap is the snapshot used for lookups.
    ImmutableMap.Builder<String,String> prefixBuilder=new ImmutableMap.Builder<String,String>();
    ImmutableMap<String,String> prefixMap = ImmutableMap.of();
    Codec<PrimitiveTriple> ptCodec=new PrimitiveTripleCodec();
    private Predicate<PrimitiveTriple> tripleFilter;
    private Function<PrimitiveTriple, PrimitiveTriple> rewritingFunction;

    /**
     * Registers a namespace prefix from a declaration of the form
     * {@code @prefix name: <namespace>.}.
     *
     * <p>Input that does not start with {@code @prefix} is ignored. A prefix that is already
     * registered keeps its first mapping. A malformed declaration is logged at WARN level and
     * otherwise ignored.
     *
     * @param obj the candidate prefix declaration
     */
    public void declarePrefix(String obj) {
        if (!obj.startsWith("@prefix")) {
            return;
        }
        try {
            List<String> parts = splitPrefixDeclaration(obj);
            String prefix = parts.get(1);
            // Guard against duplicates: ImmutableMap.Builder rejects repeated keys at build time.
            if (!prefixMap.containsKey(prefix)) {
                prefixBuilder.put(prefix, parts.get(2));
                prefixMap = prefixBuilder.build();
            }
        } catch (InvalidPrefixException ex) {
            logger.warn("Invalid prefix declaration: "+obj);
        }
    }

    /**
     * Fills the prefix table with a fixed set of namespaces and creates the triple filter and
     * the triple rewriting function used by {@link #map}.
     *
     * @param job the task context (not read by this method)
     */
    @Override
    public void setup(Context job) {
        declarePrefix("@prefix ns: <http://rdf.freebase.com/ns/>.");
        declarePrefix("@prefix key: <http://rdf.freebase.com/key/>.");
        declarePrefix("@prefix owl: <http://www.w3.org/2002/07/owl#>.");
        declarePrefix("@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.");
        declarePrefix("@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>.");
        declarePrefix("@prefix xsd: <http://www.w3.org/2001/XMLSchema#>.");
        tripleFilter = acceptTheseTriples();
        rewritingFunction = tripleRewritingFunction();
    }

    /**
     * Processes one input line.
     *
     * <p>{@code @prefix} lines are only counted ({@code PREFIX_DECL}); they are not added to the
     * prefix table. Any other line is parsed into a triple; if the filter accepts it, the
     * rewritten triple is written out and {@code ACCEPTED} is incremented, otherwise
     * {@code IGNORED} is incremented. Unparseable lines are logged and counted as
     * {@code IGNORED}.
     *
     * @param k the input key (unused)
     * @param v the text of the input line
     * @param c the task context used for output and counters
     * @throws IOException if writing the output fails
     * @throws InterruptedException if writing the output is interrupted
     */
    @Override
    public void map(LongWritable k, Text v,Context c) throws IOException, InterruptedException {
        String line = v.toString();
        if (line.startsWith("@prefix")) {
            incrementCounter(c, FreebasePrefilterCounter.PREFIX_DECL, 1L);
            return;
        }

        try {
            List<String> parts = expandTripleParts(line);
            PrimitiveTriple triple = new PrimitiveTriple(parts.get(0), parts.get(1), parts.get(2));
            // The filter sees the triple before the rewriting function is applied.
            if (tripleFilter.apply(triple)) {
                accept(c, rewritingFunction.apply(triple));
                incrementCounter(c, FreebasePrefilterCounter.ACCEPTED, 1L);
            } else {
                incrementCounter(c, FreebasePrefilterCounter.IGNORED, 1L);
            }
        } catch (InvalidNodeException ex) {
            incrementCounter(c, FreebasePrefilterCounter.IGNORED, 1L);
            logger.warn("Invalid triple: "+line);
        }
    }

    /**
     * Adds {@code amount} to the given counter; does nothing if the context returns no counter.
     */
    private void incrementCounter(Context context,Enum <?> counterId,long amount) {
        Counter counter = context.getCounter(counterId);
        if (counter != null) {
            counter.increment(amount);
        }
    }

    /**
     * Writes a triple to the output: the key is the subject, the value is the string returned by
     * {@code poPairAsString()}.
     */
    private void accept(Context out,
            PrimitiveTriple primitiveTriple) throws IOException, InterruptedException {
        Text key = new Text(primitiveTriple.getSubject());
        Text value = new Text(primitiveTriple.poPairAsString());
        out.write(key, value);
    }

    /**
     * Splits a triple line and normalizes each of its three nodes.
     *
     * <p>Subject and predicate must be IRIs (bracketed, or prefixed with a known prefix); the
     * object may be anything. Every node then goes through {@link #rewriteNode}.
     *
     * @param line a triple line ending in {@code .}
     * @return a mutable three-element list: subject, predicate, object
     * @throws InvalidNodeException if the line is not a three-part triple, or the subject or
     *         predicate is not an expandable IRI
     */
    List<String> expandTripleParts(String line) throws InvalidNodeException {
        List<String> parts = splitTriple(line);
        parts.set(0, rewriteNode(expandIRINode(parts.get(0))));
        parts.set(1, rewriteNode(expandIRINode(parts.get(1))));
        // The third part keeps whatever trailing whitespace preceded the final '.', so trim it.
        parts.set(2, rewriteNode(expandAnyNode(parts.get(2).trim())));
        return parts;
    }

    /**
     * Splits a triple line into its three raw parts.
     *
     * <p>The line must end with {@code .}; that character is dropped and the remainder is split
     * on whitespace into at most three parts, so the third part holds the rest of the line.
     *
     * @param obj the triple line
     * @return a mutable list of exactly three parts
     * @throws InvalidNodeException if the line does not end with {@code .} or yields fewer than
     *         three parts
     */
    static List<String> splitTriple(String obj) throws InvalidNodeException {
        if (!obj.endsWith(".")) {
            throw new InvalidNodeException();
        }

        String withoutDot = obj.substring(0, obj.length() - 1);
        List<String> parts = Lists.newArrayList(lineSplitter.split(withoutDot));
        if (parts.size() != 3) {
            throw new InvalidNodeException();
        }
        return parts;
    }

    /**
     * Returns the bracketed form of a node that must be an IRI.
     *
     * @param string a node that is either already of the form {@code <...>} or a prefixed name
     *        {@code prefix:local} whose prefix has been declared
     * @return the node unchanged if already bracketed, otherwise the expanded {@code <...>} IRI
     * @throws InvalidNodeException if the node is neither bracketed nor a prefixed name with a
     *         declared prefix (including a bare prefix with no colon)
     */
    public String expandIRINode(String string) throws InvalidNodeException {
        if (string.startsWith("<") && string.endsWith(">")) {
            return string;
        }
        String expanded = expandPrefixedName(string);
        if (expanded == null) {
            throw new InvalidNodeException();
        }
        return expanded;
    }

    /**
     * Expands a node if it is a prefixed name with a declared prefix, and otherwise returns it
     * unchanged.
     *
     * @param string any node text
     * @return the expanded {@code <...>} IRI for a prefixed name with a declared prefix;
     *         otherwise {@code string} itself
     */
    public String expandAnyNode(String string) {
        if (string.startsWith("<") && string.endsWith(">")) {
            return string;
        }
        String expanded = expandPrefixedName(string);
        return expanded != null ? expanded : string;
    }

    /**
     * Expands {@code prefix:local} into {@code <namespace + local>}.
     *
     * @return the bracketed IRI, or {@code null} if the node contains no colon or its prefix has
     *         not been declared
     */
    private String expandPrefixedName(String node) {
        List<String> parts = Lists.newArrayList(iriSplitter.split(node));
        // Without a colon there is no local part; reading index 1 would throw.
        if (parts.size() != 2) {
            return null;
        }
        String namespace = prefixMap.get(parts.get(0));
        if (namespace == null) {
            return null;
        }
        return "<" + namespace + parts.get(1) + ">";
    }

    /**
     * Relocates a bracketed IRI from {@code http://rdf.freebase.com/} to
     * {@code http://rdf.basekb.com/}.
     *
     * @param uri a node
     * @return the relocated IRI if {@code uri} is of the form {@code <...>} and starts with
     *         {@code <http://rdf.freebase.com/}; otherwise {@code uri} unchanged
     */
    public String rewriteNode(String uri) {
        // Anything that is not a complete <...> IRI is passed through untouched.
        boolean bracketed = uri.startsWith("<") && uri.endsWith(">");
        if (!bracketed) {
            return uri;
        }

        if (uri.startsWith(FREEBASE_IRI_START)) {
            return BASEKB_IRI_START + uri.substring(FREEBASE_IRI_START.length());
        }
        return uri;
    }

    /**
     * Parses a prefix declaration of the form {@code @prefix name: <namespace>.} with its three
     * tokens separated by exactly one space each.
     *
     * @param obj the declaration text
     * @return a mutable three-element list: the first token as given, the prefix name without its
     *         trailing colon, and the namespace without the enclosing {@code <} and {@code >.}
     * @throws InvalidPrefixException if the text does not split into exactly three tokens, the
     *         second token does not end with {@code :}, or the third token is not of the form
     *         {@code <...>.}
     */
    public static List<String> splitPrefixDeclaration(String obj) throws InvalidPrefixException {
        List<String> parts = Lists.newArrayList(Splitter.on(" ").split(obj));
        if (parts.size() != 3) {
            throw new InvalidPrefixException();
        }

        String prefix = parts.get(1);
        if (!prefix.endsWith(":")) {
            throw new InvalidPrefixException();
        }

        String mapsTo = parts.get(2);
        if (!mapsTo.startsWith("<") || !mapsTo.endsWith(">.")) {
            throw new InvalidPrefixException();
        }

        parts.set(1, prefix.substring(0, prefix.length() - 1));
        parts.set(2, mapsTo.substring(1, mapsTo.length() - 2));
        return parts;
    }

    /**
     * Builds the predicate that decides which triples are kept.
     *
     * <p>The result is the negation of an OR over four conditions built from
     * {@code PrimitiveTriple.hasPredicate} and {@code PrimitiveTriple.objectMatchesPrefix}:
     * three single-predicate conditions, and one conjunction of the rdf:type predicate with an
     * object prefix of {@code <http://rdf.basekb.com}. Presumably those helpers test the triple's
     * predicate for equality and its object for a leading string -- confirm against
     * {@code PrimitiveTriple}.
     *
     * @return a predicate that is true for triples to keep
     */
    public static Predicate <PrimitiveTriple> acceptTheseTriples() {
        Predicate<PrimitiveTriple> basekbTypedRdfType = Predicates.and(
                PrimitiveTriple.hasPredicate("<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"),
                PrimitiveTriple.objectMatchesPrefix("<http://rdf.basekb.com"));

        return Predicates.not(Predicates.or(
                PrimitiveTriple.hasPredicate("<http://rdf.basekb.com/ns/type.type.instance>"),
                PrimitiveTriple.hasPredicate("<http://rdf.basekb.com/ns/type.type.expected_by>"),
                PrimitiveTriple.hasPredicate("<http://rdf.basekb.com/ns/common.notable_for.display_name>"),
                basekbTypedRdfType));
    }

    /**
     * Builds the function applied to every accepted triple.
     *
     * <p>Three functions are chained with Guava's {@code compose(g, f)}, which applies {@code f}
     * first. The resulting order of application is: the predicate rewriter
     * ({@code type.object.type} to rdf:type), then the {@code replaced_by} reverser, then the
     * {@code type.permission.controls} reverser. What a reverser does with its two arguments is
     * defined in {@code PrimitiveTripleReverser} and is not visible here.
     *
     * @return the composed rewriting function
     */
    public static Function<PrimitiveTriple, PrimitiveTriple> tripleRewritingFunction() {
        Function<PrimitiveTriple, PrimitiveTriple> reversePermissionControls =
                new PrimitiveTripleReverser(
                        "<http://rdf.basekb.com/ns/type.permission.controls>",
                        "<http://rdf.basekb.com/ns/m.0j2r9sk>");
        Function<PrimitiveTriple, PrimitiveTriple> reverseReplacedBy =
                new PrimitiveTripleReverser(
                        "<http://rdf.basekb.com/ns/dataworld.gardening_hint.replaced_by>",
                        "<http://rdf.basekb.com/ns/m.0j2r8t8>");
        Function<PrimitiveTriple, PrimitiveTriple> typeToRdfType =
                new PrimitiveTriplePredicateRewriter(
                        "<http://rdf.basekb.com/ns/type.object.type>",
                        "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>");

        return compose(compose(reversePermissionControls, reverseReplacedBy), typeToRdfType);
    }
}