package edu.isi.karma.mapreduce.tripleparser;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper that converts one whitespace-separated triple statement per record
 * into a single output key of the form
 * {@code subject|||predicate|||object}, paired with a {@link NullWritable}
 * value.
 *
 * <p>The statement text is the input key immediately followed by the input
 * value (no separator is inserted between them). A record is silently skipped
 * when it does not split into four or five tokens, when the object token is
 * blank, or when the subject or predicate is not wrapped in angle brackets.
 */
public class TripleMapper extends Mapper<Text, Text, Text, NullWritable> {

    /**
     * Splits a statement into tokens: either a run starting with a non-quote
     * character and continuing up to the next whitespace, or a double-quoted
     * span (shortest match). Compiled once because {@code map} runs for every
     * input record; {@link Pattern} instances are safe to share.
     */
    private static final Pattern TOKEN_PATTERN =
            Pattern.compile("([^\"]\\S*|\".+?\")\\s*");

    /** Statement ending whose {@code @en} tag is dropped before tokenizing. */
    private static final String ENGLISH_TAG_ENDING = "@en .";

    /** Separator placed between subject, predicate and object in the output key. */
    private static final String FIELD_SEPARATOR = "|||";

    /** Prefix marking an object that is not an angle-bracketed URI. */
    private static final String LITERAL_PREFIX = "Literal:";

    /**
     * Emits the {@code subject|||predicate|||object} key for the statement
     * formed by concatenating {@code key} and {@code value}; emits nothing if
     * the statement is rejected.
     *
     * @param key     first part of the statement text
     * @param value   remainder of the statement text
     * @param context Hadoop context the output key is written to
     * @throws IOException          propagated from {@code context.write}
     * @throws InterruptedException propagated from {@code context.write}
     */
    @Override
    public void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        String outputKey = toOutputKey(key.toString() + value.toString());
        if (outputKey != null) {
            context.write(new Text(outputKey), NullWritable.get());
        }
    }

    /**
     * Builds the output key for one statement.
     *
     * @param triple raw statement text
     * @return the joined key, or {@code null} when the statement must be skipped
     */
    private static String toOutputKey(String triple) {
        List<String> tokens = tokenize(dropEnglishTag(triple));

        // Only 4- or 5-token statements are accepted. Tokens after the object
        // (index 3 and up) are never inspected, so a fourth token such as a
        // language tag or datatype suffix is simply ignored.
        if (tokens.size() != 4 && tokens.size() != 5) {
            return null;
        }

        String subject = tokens.get(0);
        String predicate = tokens.get(1);
        String object = tokens.get(2);

        // A blank subject or predicate cannot pass the angle-bracket check
        // below, so only the object needs an explicit blank check.
        if (object.trim().isEmpty()) {
            return null;
        }
        if (!isAngleBracketed(subject) || !isAngleBracketed(predicate)) {
            return null;
        }

        subject = stripAngleBrackets(subject).replace("\t", "");
        predicate = stripAngleBrackets(predicate).replace("\t", "");

        if (isAngleBracketed(object)) {
            // Object is a URI: only the surrounding brackets are removed.
            object = stripAngleBrackets(object);
        } else {
            // Object is a literal: tabs and backslashes are removed, and any
            // surrounding double quotes are kept as part of the text.
            object = LITERAL_PREFIX + object.replace("\t", "").replace("\\", "");
        }

        return subject + FIELD_SEPARATOR + predicate + FIELD_SEPARATOR + object;
    }

    /**
     * Rewrites a statement ending in {@code "@en ."} so that it ends in
     * {@code " ."}; any other statement is returned unchanged.
     */
    private static String dropEnglishTag(String triple) {
        if (!triple.endsWith(ENGLISH_TAG_ENDING)) {
            return triple;
        }
        return triple.substring(0, triple.length() - ENGLISH_TAG_ENDING.length()) + " .";
    }

    /** Returns every token matched by {@link #TOKEN_PATTERN}, in order. */
    private static List<String> tokenize(String triple) {
        List<String> tokens = new ArrayList<>();
        Matcher matcher = TOKEN_PATTERN.matcher(triple);
        while (matcher.find()) {
            tokens.add(matcher.group(1));
        }
        return tokens;
    }

    /** Returns true when {@code token} starts with {@code <} and ends with {@code >}. */
    private static boolean isAngleBracketed(String token) {
        return token.startsWith("<") && token.endsWith(">");
    }

    /** Removes the first and last character; callers check {@link #isAngleBracketed} first. */
    private static String stripAngleBrackets(String token) {
        return token.substring(1, token.length() - 1);
    }
}