package io.lumify.dbpedia.mapreduce.model;
import io.lumify.core.exception.LumifyException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Immutable holder for the pieces of a single input line of the form
 * {@code <pageUrl> <propertyIri> value .}, plus the page title derived from the page URL.
 *
 * <p>Instances are normally obtained through {@link #parse(String)}.
 */
public class LineData {
    /**
     * Whole-line pattern: group 1 is the text inside the first angle brackets, group 2 the text
     * inside the second, and group 3 everything between them and the trailing {@code " ."}.
     */
    private static final Pattern LINE_PATTERN = Pattern.compile("^<(.*?)> <(.*?)> (.*) \\.$");

    private final String pageUrl;
    private final String pageTitle;
    private final String propertyIri;
    private final String valueRaw;
    private final Value value;

    /**
     * Creates a line record from already-extracted parts; no validation is performed here.
     *
     * @param pageUrl     text of the first bracketed term of the line
     * @param pageTitle   title associated with {@code pageUrl}
     * @param propertyIri text of the second bracketed term of the line
     * @param valueRaw    unparsed text of the value portion of the line
     * @param value       parsed form of {@code valueRaw}
     */
    public LineData(String pageUrl, String pageTitle, String propertyIri, String valueRaw, Value value) {
        this.value = value;
        this.valueRaw = valueRaw;
        this.propertyIri = propertyIri;
        this.pageTitle = pageTitle;
        this.pageUrl = pageUrl;
    }

    /**
     * @return the page title supplied at construction
     */
    public String getPageTitle() {
        return this.pageTitle;
    }

    /**
     * @return the property IRI supplied at construction
     */
    public String getPropertyIri() {
        return this.propertyIri;
    }

    /**
     * @return the parsed value supplied at construction
     */
    public Value getValue() {
        return this.value;
    }

    /**
     * Splits one input line into page URL, property IRI and value, and builds a {@link LineData}.
     *
     * <p>Example input:
     * {@code <http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/diseasesdb> "1142"@en .}
     *
     * <p>The value portion is handed to {@code Value.parse}, and the page title is computed with
     * {@link #parsePageTitleFromPageUrl(String)}.
     *
     * @param line the full line to parse; it must match the expected shape in its entirety
     * @return the populated line record
     * @throws LumifyException if the line does not match the expected shape, or if no page title
     *                         can be derived from the page URL
     */
    public static LineData parse(String line) {
        Matcher lineMatcher = LINE_PATTERN.matcher(line);
        if (lineMatcher.matches()) {
            String url = lineMatcher.group(1);
            String iri = lineMatcher.group(2);
            String rawValue = lineMatcher.group(3);
            // The value is parsed before the title is derived, so a failure in value parsing
            // surfaces first.
            Value parsedValue = Value.parse(rawValue);
            String title = parsePageTitleFromPageUrl(url);
            return new LineData(url, title, iri, rawValue, parsedValue);
        }
        throw new LumifyException("Could not find match for line: " + line);
    }

    /**
     * Derives a page title from a page URL: takes everything after the last {@code '/'} and
     * replaces each underscore with a space.
     *
     * @param pageUrl the URL to take the title from
     * @return the text after the last slash, with underscores turned into spaces
     * @throws LumifyException if {@code pageUrl} contains no {@code '/'}
     */
    public static String parsePageTitleFromPageUrl(String pageUrl) {
        final int slashIndex = pageUrl.lastIndexOf('/');
        if (slashIndex == -1) {
            throw new LumifyException("Could not parse page title from page url: " + pageUrl);
        }
        return pageUrl.substring(slashIndex + 1).replace('_', ' ');
    }
}