package edu.emory.clir.clearnlp.lexicon.dbpedia;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.gson.Gson;
import edu.emory.clir.clearnlp.util.CharUtils;
import edu.emory.clir.clearnlp.util.IOUtils;
import edu.emory.clir.clearnlp.util.PatternUtils;
import edu.emory.clir.clearnlp.util.StringUtils;
import edu.emory.clir.clearnlp.util.constant.PatternConst;
import edu.emory.clir.clearnlp.util.constant.StringConst;
/**
 * Extracts DBPedia instance information (ontology types and human-readable aliases)
 * from DBPedia N-Triples dumps, and can augment it with redirect aliases.
 *
 * <p>Input lines are expected to contain a {@code <http://dbpedia.org/resource/...>}
 * subject and, for type triples, a {@code <http://dbpedia.org/ontology/...>} object.
 */
public class DBPediaInfoExtractor implements DBPediaXML
{
	/** Captures the resource title from a {@code dbpedia.org/resource} URI. */
	static final Pattern RESOURCE = Pattern.compile("<http://dbpedia.org/resource/(.+?)>");
	/** Captures the type name from a {@code dbpedia.org/ontology} URI. */
	static final Pattern ONTOLOGY = Pattern.compile("<http://dbpedia.org/ontology/(.+?)>");

	/**
	 * Builds a map from resource title to {@link DBPediaInfo} by reading instance-type
	 * triples (e.g., {@code instance_types_en.nt}). Lines without both a resource and an
	 * ontology URI are skipped, as are auto-generated {@code Wikidata*} types. After
	 * reading, redundant super-types are trimmed and a display alias is derived from
	 * each title.
	 *
	 * @param typeMap ontology type hierarchy used to trim redundant super-types
	 * @param in the triple stream; closed by this method
	 * @return the populated info map
	 * @throws Exception if reading the stream fails
	 */
	public DBPediaInfoMap getInfoMap(DBPediaTypeMap typeMap, InputStream in) throws Exception
	{
		DBPediaInfoMap map = new DBPediaInfoMap();
		String line, title, type;
		Matcher m;

		// try-with-resources: the reader (and underlying stream) was previously leaked.
		try (BufferedReader reader = IOUtils.createBufferedReader(in))
		{
			while ((line = reader.readLine()) != null)
			{
				m = RESOURCE.matcher(line);
				if (!m.find()) continue;
				title = m.group(1);

				m = ONTOLOGY.matcher(line);
				if (!m.find()) continue;
				type = m.group(1);

				// Wikidata-derived types are noise for this extraction; skip them.
				if (!type.startsWith("Wikidata"))
					map.computeIfAbsent(title, k -> new DBPediaInfo()).addType(DBPediaType.getType(type));
			}
		}

		DBPediaInfo info;

		for (Entry<String,DBPediaInfo> e : map.entrySet())
		{
			info = e.getValue();
			trimInstanceTypes(typeMap, info.getTypes());	// keep only the most specific types
			info.addAlias(getAlias(e.getKey()));			// human-readable form of the title
		}

		return map;
	}

	/**
	 * Removes every type from {@code set} that is a super-type of another type already
	 * in the set, leaving only the most specific types.
	 *
	 * @param typeMap provides the super-type relation
	 * @param set the type set to trim in place
	 */
	private void trimInstanceTypes(DBPediaTypeMap typeMap, Set<DBPediaType> set)
	{
		List<DBPediaType> list = new ArrayList<>(set);
		Set<DBPediaType> remove = new HashSet<>();
		int i, j, size = list.size();
		DBPediaType ti, tj;

		// Pairwise comparison; collect super-types into `remove` so we never
		// mutate `set` while deciding what to drop.
		for (i=1; i<size; i++)
		{
			ti = list.get(i);

			for (j=0; j<i; j++)
			{
				tj = list.get(j);

				if (typeMap.isSuperType(ti, tj))
					remove.add(tj);
				else if (typeMap.isSuperType(tj, ti))
					remove.add(ti);
			}
		}

		set.removeAll(remove);
	}

	/**
	 * Derives a human-readable alias from a DBPedia resource title.
	 * Titles containing punctuation or consisting of upper-case letters only get
	 * underscores replaced by spaces; otherwise the camelCase title is split by
	 * inserting a space at each lower-to-upper case transition (except before the
	 * final character, matching the original behavior).
	 *
	 * @param s the raw resource title (e.g., {@code "Abraham_Lincoln"} or {@code "NewYork"})
	 * @return the alias with word boundaries restored
	 */
	private String getAlias(String s)
	{
		if (StringUtils.containsPunctuation(s) || StringUtils.containsUpperCaseOnly(s))
			return PatternUtils.replaceAll(PatternConst.UNDERSCORE, s, StringConst.SPACE);

		StringBuilder build = new StringBuilder();
		char[] cs = s.toCharArray();
		int i, len = cs.length;

		for (i=0; i<len; i++)
		{
			// Insert a space at a lowercase->uppercase boundary (not at either end).
			if (0 < i && i < len-1 && CharUtils.isLowerCase(cs[i-1]) && CharUtils.isUpperCase(cs[i]))
				build.append(StringConst.SPACE);

			build.append(cs[i]);
		}

		return build.toString();
	}

	/**
	 * Adds redirect titles as aliases. Each input line is expected to contain two
	 * resource URIs: the redirecting title followed by the redirect target; the
	 * redirecting title becomes an alias of the target's existing entry (lines whose
	 * target is not in {@code infoMap} are ignored).
	 *
	 * @param infoMap map to augment with aliases
	 * @param in the redirects stream; closed by this method
	 * @throws Exception if reading the stream fails
	 */
	public void addRedirects(Map<String,DBPediaInfo> infoMap, InputStream in) throws Exception
	{
		String line, redirect, title;
		DBPediaInfo info;
		Matcher m;

		// try-with-resources: the reader (and underlying stream) was previously leaked.
		try (BufferedReader reader = IOUtils.createBufferedReader(in))
		{
			while ((line = reader.readLine()) != null)
			{
				m = RESOURCE.matcher(line);
				if (!m.find()) continue;
				redirect = m.group(1);		// first resource on the line: the redirecting title

				if (!m.find()) continue;	// second find() on the same matcher: the redirect target
				title = m.group(1);

				if ((info = infoMap.get(title)) != null)
					info.addAlias(getAlias(redirect));
			}
		}
	}

	/**
	 * Command-line driver.
	 * args[0]: dbpedia.owl.json.xz (type hierarchy),
	 * args[1]: instance_types_en.nt.xz (instance types),
	 * args[2]: redirects_en.ttl.xz (currently unused; addRedirects is disabled),
	 * args[3]: instances_en.json.xz (output).
	 */
	public static void main(String[] args) throws Exception
	{
		DBPediaInfoExtractor ex = new DBPediaInfoExtractor();
		Gson gson = new Gson();

		DBPediaTypeMap typeMap = gson.fromJson(new InputStreamReader(IOUtils.createXZBufferedInputStream(args[0])), DBPediaTypeMap.class);	// dbpedia.owl.json.xz
		DBPediaInfoMap infoMap = ex.getInfoMap(typeMap, IOUtils.createXZBufferedInputStream(args[1]));	// instance_types_en.nt.xz
//		ex.addRedirects(infoMap, IOUtils.createXZBufferedInputStream(args[2]));	// redirects_en.ttl.xz

		// try-with-resources ensures the output is closed even if serialization fails.
		try (PrintStream out = new PrintStream(IOUtils.createXZBufferedOutputStream(args[3])))	// instances_en.json.xz
		{
			out.print(gson.toJson(infoMap));
		}

		// Sanity check on a well-known entry.
		System.out.println(infoMap.get("Abraham_Lincoln").getAliases());
	}
}