package org.iswc.iswc2012main.dev;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collection;
import java.util.HashSet;
import java.util.StringTokenizer;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.ResourceFactory;
import com.hp.hpl.jena.shared.JenaException;
import com.hp.hpl.jena.vocabulary.DCTerms;
import com.hp.hpl.jena.vocabulary.RDFS;
import sw4j.rdf.util.ToolJena;
import sw4j.util.DataPVHMap;
import sw4j.util.DataSmartMap;
import sw4j.util.Sw4jException;
import sw4j.util.ToolIO;
import sw4j.util.ToolSafe;
import sw4j.util.web.ToolWeb;
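/*
* Links plain-text names to DBpedia resources.
* input: a list of names (one name per line),
* a csv file containing existing (already verified) mappings
* output: the mapping result (a csv file, which can then be published as RDF)
*/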
public class ToolLinkDbpedia {
public static String VERSION = "2011-07-01";
// public static void main(String[] args){
// test_us_federal_agency();
// }
//
// public static void test_us_federal_agency(){
// String label = "gov-us-agency";
// //map_init(label);
// map_generate(label);
// //map_publish(label);
// }
//
// public static void test_us_state(){
// String label = "us-state";
// //map_generate(label);
// //publish_map(label);
// }
//
// protected static void map_init(String label){
// File fileInput= getFile(String.format("%s.txt", label));
// if (!fileInput.exists()){
// ToolIO.pipeStringToFile("", fileInput);
// }else{
// System.out.println("input file already exist: \n" + fileInput.getAbsolutePath());
// }
// }
//
// protected static void map_generate(String label){
// File fileInput= getFile(String.format("%s.txt", label));
// File fileOutput= getFile(String.format("%s-dbpedia-output.csv", label));
// File fileFinal= getFile(String.format("%s-dbpedia.csv", label));
// HashSet<String> stopWords = new HashSet<String>();
// name2dbpediaByFile(fileInput, fileOutput, loadKnownNames(fileFinal), stopWords);
// }
//
// protected static void map_publish(String label){
// File fileFinal= getFile(String.format("%s-dbpedia.csv", label));
// File fileMapping= getFile(String.format("%s-dbpedia.ttl", label));
// createMappingFile(fileFinal,fileMapping);
// }
//
// Upper bound on how many search hits name2dbpedia tries to verify for one name.
public static int MAX_WIKIPEDIA_URL =3;
// Keys of the result record built by name2dbpedia. loadKnownNames and
// createMappingFile read the name, abbreviation and verified-URI keys back
// from the rows of a loaded CSV file.
// The input name that was looked up.
public static String NAME2DBPEDIA_NAME = "name";
// DBpedia URI derived from the first search hit, whether or not it was verified.
public static String NAME2DBPEDIA_URI_FIRST = "uri_first";
// DBpedia URI that passed verification; left empty when no hit could be verified.
public static String NAME2DBPEDIA_URI_VERIFIED = "uri_verified";
// Human-readable status of the lookup (e.g. which trial matched, or why it failed).
public static String NAME2DBPEDIA_MESSAGE = "message";
// Abbreviation taken from the DBpedia data (or from an all-capitals redirect name).
public static String NAME2DBPEDIA_ABBREVIATION = "dbpedia_abbreviation";
// Name literal found in the DBpedia data.
public static String NAME2DBPEDIA_DBPEDIANAME = "dbpedia_name";
// Tab-separated labels of the resources that redirect to the matched one.
public static String NAME2DBPEDIA_REDIRECT = "dbpedia_redirect";
// Homepage URI found in the DBpedia data.
public static String NAME2DBPEDIA_HOMEPAGE= "dbpedia_homepage";
/**
 * Collects the identifiers that already have a verified DBpedia mapping.
 *
 * Every row of the CSV file whose verified-URI column is non-empty contributes
 * its name and its abbreviation (each only when non-empty) to the result.
 *
 * @param fileFinal CSV file holding the curated mappings; may not exist yet
 * @return the known identifiers; empty when the file is missing or cannot be read
 */
public static Collection<String> loadKnownNames(File fileFinal){
    HashSet<String> mapped = new HashSet<String>();
    if (fileFinal.exists()){
        ToolCsvLoader csv = new ToolCsvLoader();
        try {
            csv.loadCsvFile(fileFinal);
            for (DataSmartMap record : csv.m_data.values()){
                // rows without a verified URI are still open and must be processed again
                if (!ToolSafe.isEmpty(record.getAsString(NAME2DBPEDIA_URI_VERIFIED))){
                    String fullName = record.getAsString(NAME2DBPEDIA_NAME);
                    if (!ToolSafe.isEmpty(fullName)){
                        mapped.add(fullName);
                    }
                    String shortName = record.getAsString(NAME2DBPEDIA_ABBREVIATION);
                    if (!ToolSafe.isEmpty(shortName)){
                        mapped.add(shortName);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return mapped;
}
/**
 * Publishes the verified mappings of a CSV file as an RDF document.
 *
 * For every row with a non-empty verified URI, the row's name and abbreviation
 * (when non-empty) are attached to that URI as dcterms:identifier values. A
 * metadata resource carrying the current time and the service version is
 * added, and the model is written to the target file in N3 syntax.
 *
 * @param fileFinal   CSV file holding the curated mappings
 * @param fileMapping file that receives the serialized model
 */
public static void createMappingFile(File fileFinal, File fileMapping){
    ToolCsvLoader csv = new ToolCsvLoader();
    try {
        csv.loadCsvFile(fileFinal);

        Model model = ModelFactory.createDefaultModel();
        // uri -> identifiers; only used to report the number of mapped entries
        DataPVHMap<String,String> idsByUri = new DataPVHMap<String,String>();
        String [] idColumns = { NAME2DBPEDIA_NAME, NAME2DBPEDIA_ABBREVIATION };

        for (DataSmartMap record : csv.m_data.values()){
            String uri = record.getAsString(NAME2DBPEDIA_URI_VERIFIED);
            if (!ToolSafe.isEmpty(uri)){
                for (int c = 0; c < idColumns.length; c++){
                    String identifier = record.getAsString(idColumns[c]);
                    if (ToolSafe.isEmpty(identifier)){
                        continue;
                    }
                    idsByUri.add(uri, identifier);
                    Resource subject = model.createResource(uri);
                    model.add(model.createStatement(subject, DCTerms.identifier, identifier));
                }
            }
        }

        // attach metadata about this document
        // NOTE(review): the service name is stored under dcterms:created; confirm
        // whether dcterms:creator was intended.
        String baseURI= "http://foo/bar";
        Resource document = model.createResource(baseURI);
        document.addProperty(DCTerms.date, model.createTypedLiteral(Calendar.getInstance()));
        document.addProperty(DCTerms.created, "TWC DBpedia Linking Service (ver " + VERSION+")");

        ToolJena.printModelToFile(model, "N3", baseURI, fileMapping,false);
        System.out.println("total dbpedia entries: " + idsByUri.entrySet().size());
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Maps every name listed in a text file to DBpedia and writes the results as CSV.
 *
 * Blank lines and lines starting with '#' are ignored. Names contained in
 * knownNames are counted but not looked up again. The CSV header is written
 * before the first result row, and the output is flushed after every row so
 * that finished lookups survive an aborted run.
 *
 * @param fileInput  text file with one name per line
 * @param fileOutput CSV file to (over)write with the lookup results
 * @param knownNames names whose mapping is already known; may be null
 * @param stopWords  extra stop words passed on to name2dbpedia
 * @return the number of names for which a verified URI was found, or 0 when
 *         the files could not be read or written
 */
public static int name2dbpediaByFile(File fileInput, File fileOutput, Collection<String> knownNames, Collection<String> stopWords){
    System.out.println(String.format("process name2dbpedia: input file %s --- output file %s", fileInput.getAbsolutePath(), fileOutput.getAbsolutePath()));
    BufferedReader reader = null;
    PrintWriter o = null;
    try {
        reader = new BufferedReader(new FileReader(fileInput));
        o = new PrintWriter(new FileOutputStream(fileOutput));
        String line;
        int cnt_verified=0;
        int cnt_processed=0;
        int cnt_total =0;
        boolean bFirstRow = true;
        while ( null!=(line=reader.readLine())){
            line = line.trim();
            if (line.startsWith("#") || line.length()==0)
                continue;
            cnt_total++;
            if (knownNames != null && knownNames.contains(line))
                continue; // we already know its mapping
            cnt_processed++;
            DataSmartMap mapping = name2dbpedia(line, stopWords);
            if (bFirstRow){
                o.println(mapping.toCSVheader());
                bFirstRow=false;
            }
            o.println(mapping.toCSVrow());
            o.flush();
            if (!ToolSafe.isEmpty(mapping.getAsString(NAME2DBPEDIA_URI_VERIFIED)))
                cnt_verified++;
        }
        System.out.println(String.format("input %d entries", cnt_total));
        System.out.println(String.format("processed %d entries", cnt_processed));
        System.out.println(String.format("verified %d best mappings", cnt_verified));
        return cnt_verified;
    } catch (IOException e) {
        // also covers FileNotFoundException raised while opening either file
        e.printStackTrace();
    } finally {
        // release both files on every exit path, including failures
        if (o != null) {
            o.close();
        }
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return 0;
}
/**
 * Looks up a name and returns only the verified DBpedia URI.
 *
 * @param name      the name to map
 * @param stopWords extra stop words passed on to name2dbpedia
 * @return the value stored under the verified-URI key of the lookup result
 */
public static String name2dbpediaBest(String name, Collection<String> stopWords){
    DataSmartMap lookup = name2dbpedia(name, stopWords);
    String verifiedUri = lookup.getAsString(NAME2DBPEDIA_URI_VERIFIED);
    return verifiedUri;
}
/**
 * Maps a name to a DBpedia resource by searching en.wikipedia.org.
 *
 * The name is sent as a quoted query to the Yahoo BOSS web search. Up to
 * MAX_WIKIPEDIA_URL hits are converted to DBpedia URIs, dereferenced, and
 * checked in order: hits that cannot be loaded, that are disambiguation
 * pages, or that fail verifyMapping are skipped. The first hit that passes
 * is recorded together with its abbreviation, name, homepage and redirects.
 *
 * @param name      the name to map
 * @param stopWords extra stop words used when comparing names
 * @return a record holding every NAME2DBPEDIA_* key; the verified-URI entry
 *         stays empty and the message entry explains why when nothing matched
 */
public static DataSmartMap name2dbpedia(String name, Collection<String> stopWords){
    System.out.println("\n ----------- mapping name to dbpedia------------------");
    System.out.println(name);
    DataSmartMap ret = new DataSmartMap();
    ret.put(NAME2DBPEDIA_NAME, name);
    ret.put(NAME2DBPEDIA_MESSAGE, "n/a");
    ret.put(NAME2DBPEDIA_URI_FIRST, "");
    ret.put(NAME2DBPEDIA_URI_VERIFIED, "");
    ret.put(NAME2DBPEDIA_ABBREVIATION, "");
    ret.put(NAME2DBPEDIA_DBPEDIANAME, "");
    ret.put(NAME2DBPEDIA_REDIRECT, "");
    ret.put(NAME2DBPEDIA_HOMEPAGE, "");
    try {
        //prepare yahoo boss query
        // NOTE(review): the name is HTML-escaped, not URL-encoded, before it is
        // placed in the request URL - confirm this is what the service expects.
        String appid = MyConfig.getProperty(MyConfig.CONFIG_YAHOO_APPID);
        String query = "\""+ToolWeb.escapeHTML(name)+"\" site:en.wikipedia.org";
        query = query.replaceAll("\\s", "+");
        String szUrl = String.format("http://boss.yahooapis.com/ysearch/web/v1/%s?appid=%s&format=xml", query ,appid);

        //load query result
        String szContent = ToolIO.pipeUrlToString(szUrl);

        // check if we can access result
        if (ToolSafe.isEmpty(szContent)){
            ret.put(NAME2DBPEDIA_MESSAGE, "cannot load query result");
            return ret;
        }

        // check if any results has been returned (hits are extracted only once)
        ArrayList<String> aryUrl = ToolWeb.extractMarkup(szContent, "<url>", "</url>");
        if (aryUrl.size()<=0){
            ret.put(NAME2DBPEDIA_MESSAGE, "empty result set");
            return ret;
        }

        int trials = Math.min(aryUrl.size(),MAX_WIKIPEDIA_URL);
        for (int i=0; i<trials; i++){
            String szTempLink = aryUrl.get(i);
            //remove markup
            szTempLink = szTempLink.replaceAll("<[^>]+>", "");
            //yahoo boss specific problem, it returns URL in text form, so we need to un-escape it
            // http://dbpedia.org/resource/AT&T
            szTempLink = ToolWeb.unescapeHTML(szTempLink);
            //switch from wikipedia URL to dbpedia URI
            szTempLink = wikipediaUrl2dbpediaUri(szTempLink);

            //save the first link
            if (0==i){
                ret.put(NAME2DBPEDIA_URI_FIRST, szTempLink);
            }

            // load dbpedia URI following linked data principle
            Model m = ModelFactory.createDefaultModel();
            try {
                m.read(szTempLink);
            }catch(JenaException e){
                e.printStackTrace();
                System.out.println("cannot load dbpedia data");
                continue;
            }

            //check if this is a disambiguation file
            if (m.listObjectsOfProperty(m.createResource(szTempLink),m.createProperty("http://dbpedia.org/ontology/wikiPageDisambiguates")).hasNext()){
                System.out.println("encounter a disambiguation page");
                continue;
            }

            //verify the mapping is correct
            if (!verifyMapping(name, m, stopWords)){
                System.out.println("cannot verify the mapping");
                System.out.println(szTempLink);
                continue;
            }

            ret.put(NAME2DBPEDIA_MESSAGE, "matched at trial "+ i);
            ret.put(NAME2DBPEDIA_URI_VERIFIED, szTempLink);

            // first literal abbreviation; non-literal values are skipped instead
            // of failing inside asLiteral()
            String abbreviation = "";
            for (RDFNode node: m.listObjectsOfProperty(m.createProperty("http://dbpedia.org/property/abbreviation")).toSet()){
                if (!node.isLiteral())
                    continue;
                abbreviation = node.asLiteral().getString();
                break;
            }
            ret.put(NAME2DBPEDIA_ABBREVIATION, abbreviation);

            // first literal name
            String dbpediaName = "";
            for (RDFNode node: m.listObjectsOfProperty(m.createProperty("http://dbpedia.org/property/name")).toSet()){
                if (!node.isLiteral())
                    continue;
                dbpediaName = node.asLiteral().getString();
                break;
            }
            ret.put(NAME2DBPEDIA_DBPEDIANAME, dbpediaName);

            // first homepage that is a URI resource; literals and blank nodes
            // carry no usable URI
            String homepage = "";
            for (RDFNode node: m.listObjectsOfProperty(m.createProperty("http://xmlns.com/foaf/0.1/homepage")).toSet()){
                if (!node.isURIResource())
                    continue;
                homepage = node.asResource().getURI();
                break;
            }
            ret.put(NAME2DBPEDIA_HOMEPAGE, homepage);

            // labels of everything that redirects here, tab separated
            StringBuilder redirects = new StringBuilder();
            for (Resource res: m.listSubjectsWithProperty(m.createProperty("http://dbpedia.org/ontology/wikiPageRedirects")).toSet()){
                String label = res.getLocalName();
                if (label == null)
                    continue;
                // an all-capitals redirect name serves as abbreviation when none was found above
                if (label.matches("[A-Z]+") && ToolSafe.isEmpty(ret.getAsString(NAME2DBPEDIA_ABBREVIATION))){
                    ret.put(NAME2DBPEDIA_ABBREVIATION, label);
                }
                label = label.replaceAll("_"," ");
                if (redirects.length()>0)
                    redirects.append("\t");
                redirects.append(label);
            }
            ret.put(NAME2DBPEDIA_REDIRECT, redirects.toString());
            return ret;
        }

        //return fail when no match can be found
        System.out.println(szContent);
        ret.put(NAME2DBPEDIA_MESSAGE, String.format("cannot find match after %d trials ", trials));
        return ret;
    } catch (Sw4jException e) {
        e.printStackTrace();
    }
    ret.put(NAME2DBPEDIA_MESSAGE, "cannot load query result");
    return ret;
}
/**
 * Converts an English Wikipedia article URL into the matching DBpedia resource URI.
 *
 * Ampersands are percent-encoded and the "en.wikipedia.org/wiki" prefix is
 * swapped for "dbpedia.org/resource". The prefix is matched literally, so the
 * dots are no longer treated as regular-expression wildcards. Because DBpedia
 * resource URIs use the http scheme, an https Wikipedia URL is mapped to
 * http as well.
 *
 * @param szUrlWikipedia the Wikipedia URL; may be null
 * @return the DBpedia URI, the input with only ampersands encoded when it is
 *         not an English Wikipedia URL, or null for a null input
 */
public static String wikipediaUrl2dbpediaUri(String szUrlWikipedia){
    if (szUrlWikipedia == null){
        return null;
    }
    String szTemp = szUrlWikipedia.replace("&", "%26");
    // handle the https form first so the scheme is normalised along with the host
    szTemp = szTemp.replace("https://en.wikipedia.org/wiki", "http://dbpedia.org/resource");
    szTemp = szTemp.replace("en.wikipedia.org/wiki", "dbpedia.org/resource");
    return szTemp;
}
/**
 * Reduces a name to a lower-case, blank-separated sequence of significant words.
 *
 * Dots are removed (so "U.S." becomes "us"), every other run of non-word
 * characters or underscores becomes one blank, and the text is lower-cased.
 * Words are then dropped when they are "of", "the" or "and", when they are at
 * most one character long, or when they equal a caller-supplied stop word
 * (compared in lower case). A dropped final word can leave a trailing blank
 * in the result.
 *
 * @param name      the text to normalize
 * @param stopWords extra stop words; ignored when empty
 * @return the normalized text
 */
private static String normalizeName(String name, Collection<String> stopWords){
    String cleaned = name.replaceAll("\\.", "").replaceAll("[\\W_]+"," ").toLowerCase();
    StringBuilder normalized = new StringBuilder();
    StringTokenizer words = new StringTokenizer(cleaned);
    while (words.hasMoreTokens()){
        String word = words.nextToken();

        //heuristic 1: skip common stop words
        if ("of".equals(word) || "the".equals(word) || "and".equals(word)){
            word = "";
        }

        //heuristic 2: skip single character word
        if (word.length() < 2){
            word = "";
        }

        //heuristic 3: skip customized stop words
        if (!ToolSafe.isEmpty(stopWords)){
            for (String custom : stopWords){
                if (custom.toLowerCase().equals(word)){
                    word = "";
                    break;
                }
            }
        }

        // separate from the previous word unless a blank is already pending
        int length = normalized.length();
        if (length > 0 && normalized.charAt(length - 1) != ' '){
            normalized.append(' ');
        }
        normalized.append(word);
    }
    return normalized.toString();
}
/**
 * Checks whether the data loaded for a candidate resource mentions the given name.
 *
 * The normalized name must occur as a substring of the normalized text of
 * either the rdfs:label values or the subjects that redirect to the resource.
 * A name that normalizes to the empty string (for example one made up only of
 * stop words or single characters) is rejected, because an empty string is a
 * substring of every text and would verify any candidate.
 *
 * @param name      the name being mapped
 * @param m         the model loaded from the candidate DBpedia URI
 * @param stopWords extra stop words used during normalization
 * @return true when the name was found in the labels or the redirects
 */
private static boolean verifyMapping(String name, Model m, Collection<String> stopWords){
    String myname = normalizeName(name, stopWords);
    if (myname.length()==0){
        // nothing significant left to compare - never treat this as a match
        return false;
    }
    String [][] landmark = new String[][]{
        {"LABEL", normalizeName(m.listObjectsOfProperty(RDFS.label).toSet().toString(),stopWords)},
        {"REDIRECT", normalizeName(m.listSubjectsWithProperty(ResourceFactory.createProperty("http://dbpedia.org/ontology/wikiPageRedirects")).toSet().toString(),stopWords)}
    };
    for (String [] entry : landmark){
        if (entry[1].indexOf(myname)>=0){
            System.out.println(String.format("Matched by [%s] --- \t input:%s; \t output:%s", entry[0], myname, entry[1] ));
            return true;
        }
    }
    return false;
}
/**
 * Searches the web for a name and returns the link of the first hit, provided
 * the hit's title contains the name (case-insensitive).
 *
 * The search application id is read from MyConfig, the same way name2dbpedia
 * obtains it, instead of being embedded in the source. An English Wikipedia
 * link is rewritten to the corresponding DBpedia resource link.
 *
 * @param name the name to search for
 * @return the link of the first hit, or null when the search failed, returned
 *         no usable hit, or the first hit's title does not contain the name
 */
public static String name2homepage(String name){
    try {
        String appid = MyConfig.getProperty(MyConfig.CONFIG_YAHOO_APPID);
        String query = name.replaceAll("\\s", "+");
        String szUrl = String.format("http://boss.yahooapis.com/ysearch/web/v1/%s?appid=%s&format=xml", query ,appid);

        String szContent = ToolIO.pipeUrlToString(szUrl);
        if (ToolSafe.isEmpty(szContent)){
            return null;
        }

        // both lists are needed below; without a link and a title there is nothing to check
        ArrayList<String> aryUrl = ToolWeb.extractMarkup(szContent, "<url>", "</url>");
        ArrayList<String> aryTitle = ToolWeb.extractMarkup(szContent, "<title>", "</title>");
        if (aryUrl.size()<=0 || aryTitle.size()<=0){
            System.out.println(szContent);
            return null;
        }

        String szTempLink = aryUrl.get(0).replaceAll("<[^>]+>", "");
        // literal replacement: the dots must not act as regular-expression wildcards
        szTempLink = szTempLink.replace("en.wikipedia.org/wiki", "dbpedia.org/resource");
        String szTempTitle = aryTitle.get(0).replaceAll("<[^>]+>", "");

        boolean bMatched = szTempTitle.toLowerCase().indexOf(name.toLowerCase())>=0;
        if (!bMatched){
            System.out.println(String.format("result:%b \t input:%s; \t output:%s", bMatched, name,szTempTitle ));
            System.out.println(szTempLink);
            return null;
        }
        return szTempLink;
    } catch (Sw4jException e) {
        e.printStackTrace();
    }
    return null;
}
}