package experiments.table.limaye;

import java.io.IOException;
import java.net.URLDecoder;
import java.util.HashSet;
import java.util.Set;

import org.apache.log4j.Logger;
import org.rdfhdt.hdt.hdt.HDT;
import org.rdfhdt.hdt.hdt.HDTManager;
import org.rdfhdt.hdtjena.HDTGraph;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;

import com.hp.hpl.jena.query.Query;
import com.hp.hpl.jena.query.QueryException;
import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QueryFactory;
import com.hp.hpl.jena.query.QuerySolution;
import com.hp.hpl.jena.query.ResultSet;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;

import experiments.table.limaye.Table.Column;
import experiments.table.limaye.Table.Column.Cell;

/**
 * SAX handler that reads a ground-truth annotation document and stores the
 * entity annotation of every cell on the matching {@link Cell} of a
 * {@link Table}.
 *
 * <p>For each {@code entity} element inside the cell-annotation section the
 * element text is unescaped, URL-decoded, prefixed with the DBpedia resource
 * namespace, resolved through the redirects model and finally checked against
 * the labels model. Cells whose annotation is empty, {@code NULL} or cannot be
 * found in the labels model get the empty string as ground truth.
 */
public class LimayeGroundtruthAnnotationParser implements ContentHandler {

	private final static String REDIRECTS = "/home/quh/HDT/redirects.hdt";
	private final static String LABELS = "/home/quh/HDT/labels.hdt";
	/** Currently unused: the instance-types model is not loaded. */
	private final static String TYPES = "/home/quh/HDT/instance-types.hdt";

	/**
	 * Name of the element that wraps the cell annotations. The spelling is
	 * exactly what the handler has always matched; presumably it mirrors the
	 * tag used in the input files - verify against the data before changing it.
	 */
	private static final String CELL_ANNOTATIONS_TAG = "cellAnnotatoons";
	private static final String COLUMN_ANNOTATIONS_TAG = "columnAnnotations";
	private static final String DBPEDIA_RESOURCE_PREFIX = "http://dbpedia.org/resource/";

	/** Table whose cells receive the parsed ground truth. */
	private final Table table;
	/** Text collected for the entity element that is currently open. */
	private final StringBuilder currentValue = new StringBuilder();
	/** True while inside the cell-annotation section. */
	private boolean cellAnnotation;
	/** True once the column-annotation section has been entered (never reset). */
	private boolean columnAnnotation;
	/** Column index taken from the last {@code colAnnos} element. */
	private int columnNr;
	/** Zero-based column of the current entity within the current row. */
	private int column;
	/** Zero-based index of the current row. */
	private int row;
	/** Model over the redirects HDT file. */
	private final Model redirectsModel;
	/** Model over the labels HDT file. */
	private final Model labelsModel;
	/** Currently unused: would hold the instance-types model. */
	private Model m_d;
	/** True while an entity element is open and its text must be collected. */
	private boolean insideEntity = false;

	/**
	 * Creates a handler that writes into the given table and maps the
	 * redirects and labels HDT files.
	 *
	 * @param table table that receives the ground-truth values
	 * @throws IllegalStateException if one of the HDT files cannot be mapped
	 */
	public LimayeGroundtruthAnnotationParser(Table table) {
		this.table = table;
		this.cellAnnotation = false;
		this.columnAnnotation = false;
		this.columnNr = 0;
		this.column = -1;
		this.row = -1;

		HDT redirectsHdt;
		HDT labelsHdt;
		try {
			redirectsHdt = HDTManager.mapIndexedHDT(REDIRECTS, null);
			labelsHdt = HDTManager.mapIndexedHDT(LABELS, null);
		} catch (IOException e) {
			// Fail here with the cause attached instead of continuing with a
			// null HDT, which only surfaced later as an unexplained failure.
			throw new IllegalStateException(
					"Could not map HDT files " + REDIRECTS + " / " + LABELS, e);
		}
		this.redirectsModel = ModelFactory.createModelForGraph(new HDTGraph(redirectsHdt));
		this.labelsModel = ModelFactory.createModelForGraph(new HDTGraph(labelsHdt));
	}

	/** Not used by this handler. */
	@Override
	public void setDocumentLocator(Locator locator) {
		// intentionally empty
	}

	/** Not used by this handler. */
	@Override
	public void startDocument() throws SAXException {
		// intentionally empty
	}

	/** Not used by this handler. */
	@Override
	public void endDocument() throws SAXException {
		// intentionally empty
	}

	/** Not used by this handler. */
	@Override
	public void startPrefixMapping(String prefix, String uri)
			throws SAXException {
		// intentionally empty
	}

	/** Not used by this handler. */
	@Override
	public void endPrefixMapping(String prefix) throws SAXException {
		// intentionally empty
	}

	/**
	 * Tracks which section of the document is being read and advances the
	 * row / column counters.
	 *
	 * @throws NumberFormatException if a {@code colAnnos} element inside the
	 *         column-annotation section has no parseable {@code col} attribute
	 */
	@Override
	public void startElement(String uri, String localName, String qName,
			Attributes atts) throws SAXException {
		if (CELL_ANNOTATIONS_TAG.equals(localName)) {
			this.cellAnnotation = true;
		} else if (COLUMN_ANNOTATIONS_TAG.equals(localName)) {
			this.columnAnnotation = true;
		}

		// Column type ground truth ("anno" elements) is currently not stored.
		// The disabled code was:
		// table.getColumn(columnNr).addTypeGt("http://yago-knowledge.org/resource/" + atts.getValue("name"));

		if (columnAnnotation && "colAnnos".equals(localName)) {
			this.columnNr = Integer.parseInt(atts.getValue("col"));
		}

		if (cellAnnotation) {
			if ("row".equals(localName)) {
				// New row: entities are counted from the first column again.
				row++;
				column = -1;
			} else if ("entity".equals(localName)) {
				insideEntity = true;
				column++;
			}
		}
	}

	/**
	 * Closes the cell-annotation section and, at the end of every entity
	 * element, stores the resolved ground truth on the corresponding cell.
	 */
	@Override
	public void endElement(String uri, String localName, String qName)
			throws SAXException {
		if (CELL_ANNOTATIONS_TAG.equals(localName)) {
			this.cellAnnotation = false;
		}

		if (cellAnnotation && "entity".equals(localName)) {
			Column c = table.getColumn(column);
			String raw = currentValue.toString();

			// Hack for the carriage return produced by self-closing XML tags:
			// text of length <= 1 counts as "no annotation". Should not matter
			// for the Limaye data set.
			// String content must be compared with equals(); the former
			// reference comparison (!=) never recognised "NULL".
			if (raw.length() > 1 && !"NULL".equals(raw)) {
				try {
					if (c != null) {
						Cell ce = c.getCellList().get(row);
						String gt = DBPEDIA_RESOURCE_PREFIX
								+ URLDecoder.decode(unescapeHTMLCharacters(raw), "UTF-8");
						gt = checkRedirects(gt);
						gt = checkAvailability(gt);
						ce.setGt(gt);
					}
				} catch (Exception e) {
					// Best effort: a cell that cannot be resolved is reported
					// and skipped so the rest of the document is still read.
					e.printStackTrace();
				}
			} else {
				if (c != null) {
					c.getCellList().get(row).setGt("");
				}
			}
			insideEntity = false;
			currentValue.setLength(0);
		}
	}

	/** Collects character data while an entity element is open. */
	@Override
	public void characters(char[] ch, int start, int length)
			throws SAXException {
		if (insideEntity) {
			currentValue.append(ch, start, length);
		}
	}

	/** Not used by this handler. */
	@Override
	public void ignorableWhitespace(char[] ch, int start, int length)
			throws SAXException {
		// intentionally empty
	}

	/** Not used by this handler. */
	@Override
	public void processingInstruction(String target, String data)
			throws SAXException {
		// intentionally empty
	}

	/** Not used by this handler. */
	@Override
	public void skippedEntity(String name) throws SAXException {
		// intentionally empty
	}

	/**
	 * Checks whether the labels model contains an {@code rdfs:label} for the
	 * resource.
	 *
	 * @param resource URI to look up; it is inserted verbatim into the query
	 * @return {@code resource} if at least one label exists, otherwise the
	 *         empty string (also when the query cannot be built or executed)
	 */
	private String checkAvailability(String resource) {
		QueryExecution qe = null;
		try {
			Query query = QueryFactory
					.create("SELECT ?label WHERE{ <"
							+ resource
							+ "> <http://www.w3.org/2000/01/rdf-schema#label> ?label. }");
			qe = QueryExecutionFactory.create(query, this.labelsModel);
			ResultSet results = qe.execSelect();
			return results.hasNext() ? resource : "";
		} catch (Exception e) {
			return "";
		} finally {
			if (qe != null) {
				qe.close();
			}
		}
	}

	/**
	 * Resolves the resource through {@code dbo:wikiPageRedirects} in the
	 * redirects model.
	 *
	 * @param resource URI to resolve; it is inserted verbatim into the query
	 * @return the redirect target of the last solution returned, or
	 *         {@code resource} itself if there is no redirect or the query
	 *         cannot be built or executed
	 */
	private String checkRedirects(String resource) {
		String result = resource;
		QueryExecution qe = null;
		try {
			Query query = QueryFactory
					.create("SELECT ?redirect WHERE{ <"
							+ resource
							+ "> <http://dbpedia.org/ontology/wikiPageRedirects> ?redirect. }");
			qe = QueryExecutionFactory.create(query, this.redirectsModel);
			ResultSet results = qe.execSelect();
			while (results.hasNext()) {
				QuerySolution sol = results.nextSolution();
				result = sol.getResource("redirect").getURI();
			}
		} catch (Exception e) {
			return resource;
		} finally {
			if (qe != null) {
				qe.close();
			}
		}
		return result;
	}

	/**
	 * Replaces the two escape sequences handled by this parser:
	 * {@code &apos;} becomes {@code '} and the literal text {@code \amp}
	 * becomes {@code &}.
	 *
	 * @param resource raw annotation text
	 * @return the text with both sequences replaced
	 */
	private String unescapeHTMLCharacters(String resource) {
		// String.replace leaves the input untouched when nothing matches, so
		// no contains() guard is needed.
		return resource.replace("&apos;", "'").replace("\\amp", "&");
	}
}