package experiments.table.limaye.corrected; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.LinkedList; import java.util.List; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.HttpPost; import org.apache.http.entity.ByteArrayEntity; import org.apache.http.entity.ContentType; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.message.BasicHeader; import org.apache.http.params.BasicHttpParams; import org.apache.http.params.HttpConnectionParams; import org.apache.http.params.HttpParams; import org.apache.http.util.EntityUtils; import org.rdfhdt.hdt.hdt.HDT; import org.rdfhdt.hdt.hdt.HDTManager; import org.rdfhdt.hdtjena.HDTGraph; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; import com.google.gson.Gson; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import DisambiguationApproachDPO.DisambiguatedEntity; import DisambiguationApproachDPO.DisambiguationRequest; import DisambiguationApproachDPO.DisambiguationResponse; import DisambiguationApproachDPO.EntityDisambiguationDPO; import DisambiguationApproachDPO.Response; import experiments.table.limaye.corrected.Table.Column; import experiments.table.limaye.corrected.Table.Column.Cell; public class StartEvaluationTableEntities { public static final String DISAMBIGUATIONSERVICE = "http://theseus.dimis.fim.uni-passau.de:8080/doser-disambiguationserver/disambiguation/disambiguationWithoutCategories-collective"; private final static String REDIRECTS = "/home/quh/HDT/redirects.hdt"; private final static String LABELS = "/home/quh/HDT/labels.hdt"; // private final static String TYPES = "/home/quh/HDT/instance-types.hdt"; public static int sum = 0; public static int correct = 0; public static int annotated = 0; public static int haveoneresult = 0; public static int disambiguationpages = 0; public static void main(String[] args) { StartEvaluationTableEntities evaluate = new StartEvaluationTableEntities(); evaluate.action(); } public void action() { HDT hdt = null; HDT hdt_l = null; HDT hdt_d = null; Model m = null; Model m_l = null; Model m_d = null; try { hdt = HDTManager.mapIndexedHDT(REDIRECTS, null); hdt_l = HDTManager.mapIndexedHDT(LABELS, null); // hdt_d = HDTManager.mapIndexedHDT(TYPES, null); } catch (IOException e) { e.printStackTrace(); } HDTGraph graph = new HDTGraph(hdt); m = ModelFactory.createModelForGraph(graph); graph = new HDTGraph(hdt_l); m_l = ModelFactory.createModelForGraph(graph); File file = new File("/home/quh/Arbeitsfläche/Table Disambiguation Data sets/LimayeAll/all_tables_raw(regen)/"); File[] f = file.listFiles(); int cellsOverall = 0; int cellsAnnotated = 0; for (int u = 0; u < f.length; u++) { // System.out.println(f[u].getAbsolutePath()); StartEvaluationTableEntities eval = new StartEvaluationTableEntities(); String sourcePath = f[u].getAbsolutePath(); Table t = eval.readTable(f[u].getAbsolutePath(), m, m_l, m_d); if (t != null) { int cols = t.getNumberofColumns(); for (int i = 0; i < cols; i++) { Column col = t.getColumn(i); List<Cell> cellL = col.getCellList(); List<String> types = col.getMajorTypes(); cellsOverall++; // if(types != null && types.size() > 0) { // cellsAnnotated++; // } for (Cell c : cellL) { cellsOverall++; if (c.getGt() != null && !c.getGt().equalsIgnoreCase("")) { cellsAnnotated++; } } } System.out.println("Zellen insgesamt: " + cellsOverall + " Zellen annotiert: " + cellsAnnotated); // Query each column separately for (int i = 0; i < t.getNumberofColumns(); i++) { Column column = t.getColumn(i); List<EntityDisambiguationDPO> request_dpo = eval.transformInRequestFormat(column); String topic = column.getHeader(); List<Response> l = queryService(request_dpo, topic); setDisambiguatedColumn(t, i, l); } StartEvaluationTableEntities.evaluateResults(t); } } System.out.println("Insgesamt: " + sum + " davon richtig: " + correct); } private static List<Response> queryService(List<EntityDisambiguationDPO> dpos, String topic) { DisambiguationRequest req = new DisambiguationRequest(); req.setDocsToReturn(1); req.setDocumentUri("TestUrl"); req.setSurfaceFormsToDisambiguate(dpos); // req.setMainTopic(topic); HttpParams my_httpParams = new BasicHttpParams(); HttpConnectionParams.setConnectionTimeout(my_httpParams, 3000); HttpConnectionParams.setSoTimeout(my_httpParams, 0); DefaultHttpClient httpclient = new DefaultHttpClient(my_httpParams); HttpPost httppost = new HttpPost(DISAMBIGUATIONSERVICE); Header[] headers = { new BasicHeader("Accept", "application/json"), new BasicHeader("content-type", "application/json") }; httppost.setHeaders(headers); Gson gson = new Gson(); String json = null; json = gson.toJson(req); ByteArrayEntity ent = new ByteArrayEntity(json.getBytes(), ContentType.create("application/json")); httppost.setEntity(ent); HttpResponse response; StringBuffer buffer = new StringBuffer(); try { response = httpclient.execute(httppost); HttpEntity httpent = response.getEntity(); buffer.append(EntityUtils.toString(httpent)); } catch (ClientProtocolException e) { System.out.println(e); } catch (IOException e) { System.out.println(e); } finally { httpclient.getConnectionManager().shutdown(); } // System.out.println(buffer.toString()); DisambiguationResponse disResponse = gson.fromJson(buffer.toString(), DisambiguationResponse.class); List<Response> responses = disResponse.getTasks(); return responses; } private List<EntityDisambiguationDPO> transformInRequestFormat(Column c) { List<EntityDisambiguationDPO> list = new LinkedList<EntityDisambiguationDPO>(); List<Cell> cells = c.getCellList(); for (Cell cell : cells) { EntityDisambiguationDPO dpo = new EntityDisambiguationDPO(); dpo.setDocumentId(""); dpo.setContext(cell.getCellContent()); dpo.setSelectedText(cell.getCellContent()); // System.out.println(cell.getCellContent()); dpo.setStartPosition(0); list.add(dpo); } return list; } private void setDisambiguatedColumn(Table t, int columnNr, List<Response> list) { Column col = t.getColumn(columnNr); List<Cell> cellList = col.getCellList(); for (int i = 0; i < cellList.size(); i++) { Response res = list.get(i); Cell cell = cellList.get(i); if (res == null) { cell.setDisambigutedContentString(""); } else { List<DisambiguatedEntity> disEntities = res.getDisEntities(); if (disEntities == null || disEntities.size() == 0) { cell.setDisambigutedContentString(""); } else { cell.setDisambigutedContentString(disEntities.get(0).getText()); cell.setDisambiguatedContent(disEntities.get(0).getEntityUri()); // System.out.println(cell.getCellContent()); // System.out.println(disEntities.get(0).getEntityUri()); } } } } public Table readTable(String uri, Model m, Model m_l, Model m_d) { Table t = null; boolean readIn = true; FileReader reader = null; LimayeAnnotationParserWebTables p = null; try { XMLReader xmlReader = XMLReaderFactory.createXMLReader(); reader = new FileReader(uri); InputSource inputSource = new InputSource(reader); p = new LimayeAnnotationParserWebTables(m, m_l, m_d); xmlReader.setContentHandler(p); xmlReader.parse(inputSource); t = p.getTable(); p = null; } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (SAXException e) { e.printStackTrace(); } catch (NullPointerException e) { readIn = false; } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } } if (!readIn) { return null; } return t; } public static void evaluateResults(Table t) { // System.out.println(t.getName()); int nrC = t.getNumberofColumns(); for (int i = 0; i < nrC; i++) { Table.Column c = t.getColumn(i); List<Cell> cList = c.getCellList(); for (int j = 0; j < cList.size(); j++) { Cell cell = cList.get(j); String gt = cell.getGt(); String val = cell.getDisambiguatedContent(); // System.out.println(val); if (gt != null && !gt.equals("") && !gt.equalsIgnoreCase("http://dbpedia.org/resource/NULL")) { if (val.equalsIgnoreCase(gt)) { correct++; } else { System.out.println("Input: " + cell.getCellContent() + "Groundtruth: " + gt + " Value: " + val); } if (val != null && !val.equalsIgnoreCase("")) { annotated++; } if(gt.contains("(disambiguation)")) { disambiguationpages++; } sum++; } } } float prec = ((float) correct / (float) annotated); float recall = ((float) correct / (float) sum); float f1 = (2 * prec * recall) / (prec + recall); float acc = ((float) correct / (float) sum); System.out.println("Precision: " + prec + " Recall: " + recall + " F1: " + f1 + " Accuracy: " + acc + "DisambiguationPages: "+disambiguationpages); } }