package experiments.table.limaye.corrected; import org.apache.commons.lang3.StringEscapeUtils; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; import com.hp.hpl.jena.query.Query; import com.hp.hpl.jena.query.QueryExecution; import com.hp.hpl.jena.query.QueryExecutionFactory; import com.hp.hpl.jena.query.QueryFactory; import com.hp.hpl.jena.query.QuerySolution; import com.hp.hpl.jena.query.ResultSet; import com.hp.hpl.jena.rdf.model.Model; import experiments.table.limaye.corrected.Table.Column; public class LimayeAnnotationParserWebTables implements ContentHandler { private Table table; private StringBuilder currentValue; private boolean header; private int column; private Model m; private Model m_d; private Model m_l; public LimayeAnnotationParserWebTables(Model m, Model m_l, Model m_d) { this.table = new Table(); this.column = -1; this.currentValue = new StringBuilder(); this.header = false; this.m = m; this.m_d = m_d; this.m_l = m_l; } @Override public void setDocumentLocator(Locator locator) { // TODO Auto-generated method stub } @Override public void startDocument() throws SAXException { // TODO Auto-generated method stub } @Override public void endDocument() throws SAXException { // TODO Auto-generated method stub } @Override public void startPrefixMapping(String prefix, String uri) throws SAXException { // TODO Auto-generated method stub } @Override public void endPrefixMapping(String prefix) throws SAXException { // TODO Auto-generated method stub } @Override public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { if(localName.equals("header")) { this.header = true; } if(localName.equals("cell") && this.header) { this.table.addColumn(""); } if(localName.equals("cell")) { column++; currentValue = new StringBuilder(); } if(localName.equals("row")) { column = -1; } } @Override public void endElement(String uri, String localName, String qName) throws SAXException { if(localName.equals("html")) { String cellValue = escape(currentValue.toString().trim()); Column c = table.getColumn(column); c.addCell(cellValue); currentValue = new StringBuilder(); } if(localName.equals("wikipedia")) { String gt = "http://dbpedia.org/resource/"+currentValue.toString().trim(); gt = checkRedirects(gt); gt = checkAvailability(gt); // if (!gt.equalsIgnoreCase("")) { // gt = checkDisambiguationPage(gt); // } // gt = checkAvailability(gt); // if (!gt.equalsIgnoreCase("")) { // System.out.println("Groundtruth"+gt); // } table.getColumn(column).addLastCellGT(gt); currentValue = new StringBuilder(); } } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (currentValue != null) { for (int i = start; i < start + length; i++) { currentValue.append(ch[i]); } } } @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { // TODO Auto-generated method stub } @Override public void processingInstruction(String target, String data) throws SAXException { // TODO Auto-generated method stub } @Override public void skippedEntity(String name) throws SAXException { // TODO Auto-generated method stub } public Table getTable() { return table; } public static String removeAccents(String notNullSource) { return notNullSource; // return Normalizer.normalize(notNullSource, // Normalizer.Form.NFD).replaceAll( // "\\p{InCombiningDiacriticalMarks}+", ""); } private String escape(String s) { String val = StringEscapeUtils.unescapeHtml4(s); val = val.replaceAll("&", "&"); return val.replaceAll("'", "'"); } public static void main(String args[]) { String test = "'"; System.out.println(StringEscapeUtils.unescapeHtml4(test).replaceAll("'", "'")); } private String checkAvailability(String resource) { try { Query query = QueryFactory .create("SELECT ?label WHERE{ <" + resource + "> <http://www.w3.org/2000/01/rdf-schema#label> ?label. }"); QueryExecution qe = QueryExecutionFactory.create(query, this.m_l); ResultSet results = qe.execSelect(); if (results.hasNext()) { return resource; } } catch (Exception e) { return ""; } return ""; } private String checkRedirects(String resource) { String result = resource; try { Query query = QueryFactory .create("SELECT ?redirect WHERE{ <" + resource + "> <http://dbpedia.org/ontology/wikiPageRedirects> ?redirect. }"); QueryExecution qe = QueryExecutionFactory.create(query, this.m); ResultSet results = qe.execSelect(); while (results.hasNext()) { QuerySolution sol = results.nextSolution(); result = sol.getResource("redirect").getURI(); } } catch (Exception e) { return resource; } return result; } }