package doser.entitydisambiguation.table.celldisambiguation;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.http.Header;
import org.apache.http.entity.ByteArrayEntity;
import org.apache.http.entity.ContentType;
import org.apache.http.message.BasicHeader;
import org.apache.log4j.Logger;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;
import org.codehaus.jackson.map.ObjectMapper;

import doser.algorithms.MajorityVoteAlgorithm;
import doser.entitydisambiguation.dpo.DisambiguatedEntity;
import doser.entitydisambiguation.dpo.DisambiguationRequest;
import doser.entitydisambiguation.dpo.DisambiguationResponse;
import doser.entitydisambiguation.dpo.EntityDisambiguationDPO;
import doser.entitydisambiguation.dpo.Response;
import doser.entitydisambiguation.table.logic.TableCell;
import doser.entitydisambiguation.table.logic.TableColumn;
import doser.entitydisambiguation.table.logic.Type;
import doser.tools.RDFGraphOperations;
import doser.tools.ServiceQueries;

/**
 * Standard cell disambiguation strategy for one table column.
 *
 * <p>
 * The column's cells are sent to the remote disambiguation service, the
 * categories of all returned candidates are collected and ranked by
 * {@link MajorityVoteAlgorithm}, and each cell is then resolved in three
 * passes: cells with exactly one candidate, cells owning a candidate of the
 * top ranked category, and finally all remaining cells using their first
 * candidate.
 *
 * <p>
 * The class is a lazily created singleton. Its instance fields hold the
 * working state of the current {@link #disambiguateCells(TableColumn)} call
 * and that method is not synchronized, so concurrent calls on the shared
 * instance would overwrite each other's state.
 */
public final class CellDisAlgorithm_Standard implements
		CellDisambiguationInterface {

	/** Endpoint of the disambiguation proxy service. */
	private static final String DIS_PROXY_URL = "http://theseus.dimis.fim.uni-passau.de:8080/doser-disambiguationserver/disambiguation/disambiguate-proxy";

	/** Matches a label that contains exactly one bracketed part, e.g. {@code "Paris (city)"}. */
	private static final Pattern LABEL_WITH_BRACKET = Pattern
			.compile("[A-Za-z0-9_ \\t\\r\\n\\v\\f]+([(][A-Za-z0-9_ \\t\\r\\n\\v\\f]+[)])[A-Za-z0-9_ \\t\\r\\n\\v\\f]*");

	/** Matches the bracketed part itself; used to cut it out of a label. */
	private static final Pattern BRACKET_GROUP = Pattern
			.compile("[(][A-Za-z0-9_ \\t\\r\\n\\v\\f]+[)]");

	/** A top ranked type whose URI contains one of these terms is skipped. */
	private static final String[] IRRELEVANT_TERMS = { "births", "deaths" };

	/** A top ranked type whose URI equals one of these (ignoring case) is skipped. */
	private static final String[] IRRELEVANT_TYPES = { "http://dbpedia.org/resource/Category:Living_people" };

	/** Same root logger the class has always logged to. */
	private static final Logger LOGGER = Logger.getRootLogger();

	/** Shared JSON mapper; created once instead of once per request. */
	private static final ObjectMapper MAPPER = new ObjectMapper();

	private static CellDisAlgorithm_Standard instance;

	/**
	 * Returns the shared instance, creating it on first use.
	 *
	 * @return the singleton instance
	 */
	public synchronized static CellDisAlgorithm_Standard getInstance() {
		if (instance == null) {
			instance = new CellDisAlgorithm_Standard();
		}
		return instance;
	}

	/** Cells of the current column that still need a final decision. */
	private List<TableCell> cellsToDis;

	/** Cells of the current column that already received their result. */
	private List<TableCell> finishedDis;

	/** Categories looked up so far, keyed by candidate entity URI. */
	private Map<String, HashSet<Type>> memoryTypes;

	/** Ranking produced by the majority vote; index 0 is the top entry. */
	private List<Entry<Type, Integer>> typeRanking;

	private final MajorityVoteAlgorithm<Type> majorityVoteAlgorithm;

	private CellDisAlgorithm_Standard() {
		super();
		this.majorityVoteAlgorithm = new MajorityVoteAlgorithm<Type>();
	}

	/**
	 * Resolves every open cell that has a candidate belonging to the top
	 * ranked relevant type and moves it from the open list to the finished
	 * list.
	 */
	private void checkFirstType() {
		// Skip irrelevant leaders. The emptiness check matters: if every
		// ranked type is irrelevant the list runs dry and get(0) would throw.
		while (!this.typeRanking.isEmpty() && this.hasUnrelevantType()) {
			this.typeRanking.remove(0);
		}
		if (this.typeRanking.isEmpty()) {
			return;
		}
		final Type majorType = this.typeRanking.get(0).getKey();

		final List<TableCell> resolved = new LinkedList<TableCell>();
		for (final TableCell cell : this.cellsToDis) {
			final DisambiguatedEntity match = this.findEntityOfType(cell,
					majorType);
			if (match != null) {
				cell.setDisambiguatedContent(this.decode(match.getEntityUri()));
				cell.setDisambigutedContentString(match.getText());
				this.finishedDis.add(cell);
				resolved.add(cell);
			}
		}
		// Removal happens after the scan so the open list is not modified
		// while it is being iterated.
		for (final TableCell cell : resolved) {
			this.removeItemFromList(cell, this.cellsToDis);
		}
	}

	/**
	 * Finds the first candidate of a cell whose remembered categories contain
	 * the given type.
	 *
	 * @param cell
	 *            cell whose candidates are inspected
	 * @param wanted
	 *            type to look for
	 * @return the first matching candidate, or {@code null} if there is none
	 */
	private DisambiguatedEntity findEntityOfType(final TableCell cell,
			final Type wanted) {
		final List<DisambiguatedEntity> candidates = cell
				.getDisambiguatedEntities();
		if (candidates == null) {
			return null;
		}
		for (final DisambiguatedEntity candidate : candidates) {
			// A URI that was never looked up simply has no types.
			final Set<Type> known = this.memoryTypes.get(candidate
					.getEntityUri());
			if (known == null) {
				continue;
			}
			for (final Type current : known) {
				if (current.equals(wanted)) {
					return candidate;
				}
			}
		}
		return null;
	}

	/**
	 * Collects the types of all candidates in the response and stores their
	 * majority vote ranking in {@link #typeRanking}.
	 *
	 * @param res
	 *            response of the disambiguation service
	 */
	private void createTypeRanking(final DisambiguationResponse res) {
		final List<Type> types = this.searchTypes(res);
		this.typeRanking = this.majorityVoteAlgorithm.getMajorityTypes(types);
	}

	/**
	 * URL-decodes an entity URI as UTF-8.
	 *
	 * @param uri
	 *            URI to decode, may be {@code null}
	 * @return the decoded URI; the unchanged input if it cannot be decoded;
	 *         {@code null} for a {@code null} input
	 */
	private String decode(final String uri) {
		if (uri == null) {
			return null;
		}
		try {
			return URLDecoder.decode(uri, "UTF-8");
		} catch (final UnsupportedEncodingException e) {
			LOGGER.error("UTF-8 is not supported, URI left undecoded: " + uri,
					e);
		} catch (final IllegalArgumentException e) {
			// Thrown for a malformed percent escape such as a lone '%'.
			LOGGER.error("Malformed escape sequence, URI left undecoded: "
					+ uri, e);
		}
		return uri;
	}

	/**
	 * Disambiguates all cells of the given column and writes the result into
	 * the cells.
	 *
	 * @param col
	 *            column whose cells are disambiguated
	 */
	@Override
	public void disambiguateCells(final TableColumn col) {
		// 1. Disambiguate cell with Disambiguation service
		final List<TableCell> cells = col.getCellList();
		final DisambiguationResponse res = this.queryDisService(cells);
		final List<Response> tasks = this.tasksOf(res);
		// Task i belongs to cell i; never index past the shorter list.
		final int paired = Math.min(tasks.size(), cells.size());
		for (int j = 0; j < paired; j++) {
			final TableCell cell = cells.get(j);
			if (cell != null) {
				cell.setDisambiguatedEntities(tasks.get(j).getDisEntities());
			}
		}

		// 2. Type Extraction
		this.memoryTypes = new HashMap<String, HashSet<Type>>();
		this.createTypeRanking(res);

		// 3. Put all 1 item responses onto the stack
		this.finishedDis = new LinkedList<TableCell>();
		this.cellsToDis = new LinkedList<TableCell>();
		for (final TableCell cell : cells) {
			if (cell != null) {
				this.cellsToDis.add(cell);
			}
		}
		this.oneItemResponses(res, col);

		// 5. Check first type
		this.checkFirstType();

		// 6. Use first response item for all others.
		this.disambiguateListItems();

		// 7. Remove all relevant type item of column
		col.resetTypes();
	}

	/**
	 * Gives every still open cell its first candidate, or {@code null}
	 * content if it has no candidate, then appends all open cells to the
	 * finished list.
	 */
	private void disambiguateListItems() {
		for (final TableCell cell : this.cellsToDis) {
			final List<DisambiguatedEntity> candidates = cell
					.getDisambiguatedEntities();
			if (candidates == null || candidates.isEmpty()) {
				cell.setDisambiguatedContent(null);
				cell.setDisambigutedContentString(null);
			} else {
				final DisambiguatedEntity first = candidates.get(0);
				cell.setDisambiguatedContent(this.decode(first.getEntityUri()));
				cell.setDisambigutedContentString(first.getText());
			}
		}
		// Optional: Put all cells to disambiguate on the final list
		this.finishedDis.addAll(this.cellsToDis);
	}

	/**
	 * Looks up the DBpedia categories of every candidate, remembers them per
	 * URI in {@link #memoryTypes} and returns the distinct categories seen.
	 *
	 * @param entList
	 *            candidates of one task, may be {@code null}
	 * @return the distinct categories of all given candidates
	 */
	private List<Type> getTypes(final List<DisambiguatedEntity> entList) {
		final Set<Type> distinct = new HashSet<Type>();
		if (entList != null) {
			for (final DisambiguatedEntity entity : entList) {
				final String uri = entity.getEntityUri();
				final Set<Type> types = RDFGraphOperations
						.getDbpediaCategoriesFromEntity(uri);
				distinct.addAll(types);
				this.memoryTypes.put(uri, new HashSet<Type>(types));
			}
		}
		return new LinkedList<Type>(distinct);
	}

	/**
	 * Tells whether the current top ranked type should be ignored.
	 *
	 * @return {@code true} if the top type's URI equals an irrelevant type
	 *         (ignoring case) or contains an irrelevant term; {@code false}
	 *         otherwise, including when the ranking is empty
	 */
	private boolean hasUnrelevantType() {
		if (this.typeRanking.isEmpty()) {
			return false;
		}
		final String topUri = this.typeRanking.get(0).getKey().getUri();
		for (final String type : IRRELEVANT_TYPES) {
			if (type.equalsIgnoreCase(topUri)) {
				return true;
			}
		}
		for (final String term : IRRELEVANT_TERMS) {
			if (topUri.contains(term)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Resolves every cell whose task returned exactly one candidate and takes
	 * it off the open list.
	 *
	 * @param res
	 *            response of the disambiguation service
	 * @param col
	 *            column the response belongs to
	 */
	private void oneItemResponses(final DisambiguationResponse res,
			final TableColumn col) {
		final List<Response> responses = this.tasksOf(res);
		final List<TableCell> cells = col.getCellList();
		final int paired = Math.min(responses.size(), cells.size());
		for (int i = 0; i < paired; i++) {
			final List<DisambiguatedEntity> entities = responses.get(i)
					.getDisEntities();
			if (entities == null || entities.size() != 1) {
				continue;
			}
			// Check for null before touching the cell, not afterwards.
			final TableCell cell = cells.get(i);
			if (cell == null) {
				continue;
			}
			final DisambiguatedEntity only = entities.get(0);
			cell.setDisambiguatedContent(this.decode(only.getEntityUri()));
			cell.setDisambigutedContentString(only.getText());
			this.finishedDis.add(cell);
			this.removeItemFromList(cell, this.cellsToDis);
		}
	}

	/**
	 * Sends the content of all cells to the disambiguation service.
	 *
	 * @param tableCells
	 *            cells to disambiguate; one surface form is sent per cell, in
	 *            list order
	 * @return the parsed service response, or a freshly constructed response
	 *         if the request could not be serialized or the reply could not
	 *         be parsed
	 */
	private DisambiguationResponse queryDisService(
			final List<TableCell> tableCells) {
		final DisambiguationResponse fallback = new DisambiguationResponse();

		final DisambiguationRequest disRequest = new DisambiguationRequest();
		disRequest.setDocumentUri("tableDisambiguation");
		final List<EntityDisambiguationDPO> surfaceForms = new LinkedList<EntityDisambiguationDPO>();
		for (final TableCell cell : tableCells) {
			// NOTE(review): a null cell or null cell content throws here,
			// while later stages null-check cells. Skipping entries would
			// break the task/cell index alignment, so confirm whether null
			// cells can occur before changing this.
			final EntityDisambiguationDPO dpo = new EntityDisambiguationDPO();
			dpo.setSetting("NoContext");
			dpo.setContext("");
			dpo.setSelectedText(stripBracketedPart(cell.getCellContent()));
			dpo.setStartPosition(-1);
			surfaceForms.add(dpo);
		}
		disRequest.setSurfaceFormsToDisambiguate(surfaceForms);

		final byte[] payload;
		try {
			payload = MAPPER.writeValueAsString(disRequest).getBytes("UTF-8");
		} catch (final JsonMappingException e) {
			LOGGER.error("Could not map disambiguation request to JSON", e);
			return fallback;
		} catch (final IOException e) {
			LOGGER.error("Could not serialize disambiguation request", e);
			return fallback;
		}

		final Header[] headers = { new BasicHeader("Accept", "application/json"),
				new BasicHeader("content-type", "application/json") };
		final ByteArrayEntity body = new ByteArrayEntity(payload,
				ContentType.create("application/json"));
		final String reply = ServiceQueries.httpPostRequest(DIS_PROXY_URL,
				body, headers);
		if (reply == null) {
			// presumably signals a failed request; verify against ServiceQueries
			LOGGER.error("Disambiguation service returned no content");
			return fallback;
		}

		try {
			return MAPPER.readValue(reply, DisambiguationResponse.class);
		} catch (final JsonParseException e) {
			LOGGER.error("Disambiguation response is not valid JSON", e);
		} catch (final JsonMappingException e) {
			LOGGER.error("Disambiguation response has an unexpected structure",
					e);
		} catch (final IOException e) {
			LOGGER.error("Could not read disambiguation response", e);
		}
		return fallback;
	}

	/**
	 * Cuts a single bracketed part out of a label.
	 *
	 * @param cellContent
	 *            raw cell content
	 * @return the text before and after the bracket joined by one space, only
	 *         the text before it if nothing follows, or the unchanged input
	 *         if it does not have the label-with-bracket shape
	 */
	private static String stripBracketedPart(final String cellContent) {
		if (!LABEL_WITH_BRACKET.matcher(cellContent).matches()) {
			return cellContent;
		}
		final String[] parts = BRACKET_GROUP.split(cellContent);
		if (parts.length == 2) {
			return parts[0] + " " + parts[1];
		}
		return parts[0];
	}

	/**
	 * Removes the first element that compares equal to the given cell.
	 *
	 * @param cell
	 *            cell to remove
	 * @param list
	 *            list to remove it from
	 */
	private void removeItemFromList(final TableCell cell,
			final List<TableCell> list) {
		for (int i = 0; i < list.size(); i++) {
			if (list.get(i).compareTo(cell) == 0) {
				list.remove(i);
				break;
			}
		}
	}

	/**
	 * Collects the types of the candidates of every task in the response. As
	 * a side effect the types are remembered per entity URI.
	 *
	 * @param response
	 *            response of the disambiguation service
	 * @return the types of all tasks, concatenated in task order
	 */
	public List<Type> searchTypes(final DisambiguationResponse response) {
		final List<Type> collected = new LinkedList<Type>();
		for (final Response task : this.tasksOf(response)) {
			collected.addAll(this.getTypes(task.getDisEntities()));
		}
		return collected;
	}

	/**
	 * Returns the tasks of a response, treating a missing task list as empty.
	 *
	 * @param response
	 *            response of the disambiguation service
	 * @return the task list, never {@code null}
	 */
	private List<Response> tasksOf(final DisambiguationResponse response) {
		final List<Response> tasks = response.getTasks();
		if (tasks == null) {
			return new LinkedList<Response>();
		}
		return tasks;
	}
}