/** * Copyright 2014 Diego Ceccarelli * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package it.cnr.isti.hpc.erd; import it.acubelab.smaph.SmaphAnnotator; import it.acubelab.smaph.SmaphConfig; import it.acubelab.smaph.boldfilters.*; import it.acubelab.smaph.entityfilters.*; import it.acubelab.smaph.learn.GenerateModel; import it.acubelab.smaph.linkback.BaselineLinkBack; import it.acubelab.smaph.linkback.DummyLinkBack; import it.unipi.di.acube.batframework.problems.CandidatesSpotter; import it.unipi.di.acube.batframework.problems.Sa2WSystem; import it.unipi.di.acube.batframework.systemPlugins.TagmeAnnotator; import it.unipi.di.acube.batframework.systemPlugins.WATAnnotator; import it.unipi.di.acube.batframework.utils.*; import it.unipi.di.acube.batframework.data.MultipleAnnotation; import java.io.*; import java.util.*; public class Annotator { public static final String SMAPH_PARAMS_FORMAT = "BING-auxAnnotator=%s&minLp=%.5f&sortBy=%s&method=%s&relatedness=%s&epsilon=%.5f&spotFilter=%s&spotFilterThreshold=%f&entityFilter=%s&svmEntityFilterModelBase=%s&emptyQueryFilter=%s&svmEmptyQueryFilterModelBase=%s&entitySources=%s"; private static WikipediaApiInterface wikiApi = null; private static WikipediaToFreebase wikiToFreeb = null; private static TagmeAnnotator tagme = null; private static LibSvmEntityFilter libSvmEntityFilter = null; private String bingKey; private String tagmeKey; private String tagmeHost; public Annotator() { SmaphConfig.setConfigFile("smaph-config.xml"); bingKey = SmaphConfig.getDefaultBingKey(); String bingCache = SmaphConfig.getDefaultBingCache(); try { if (wikiApi == null) wikiApi = new WikipediaApiInterface("wid.cache", "redirect.cache"); if (bingCache != null) SmaphAnnotator.setCache(bingCache); } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e); } if (wikiToFreeb == null) wikiToFreeb = new WikipediaToFreebase("mapdb"); } /** * Annotate a query with an annotator 'as is', picking the candidate with * highest commonness. * * @param query * the query * @param textID * an unique id for the query. * @param spotter * the annotator that spots candidates. * @return the list of annotations. */ public List<Annotation> annotateCommonness(String query, String textID, CandidatesSpotter spotter) { List<Annotation> annotations = new ArrayList<Annotation>(); HashSet<MultipleAnnotation> mas = spotter.getSpottedCandidates(query); mas = deleteOverlappingAnnotations(mas); for (MultipleAnnotation ma : mas) { Annotation a = new Annotation(); a.setQid(textID); a.setInterpretationSet(0); int wid = ma.getCandidates()[0]; String title = null; try { title = wikiApi.getTitlebyId(wid); } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e); } String mid = wikiToFreeb.getFreebaseId(title); if (mid == null) continue; a.setPrimaryId(mid); a.setMentionText(query.substring(ma.getPosition(), ma.getPosition() + ma.getLength())); a.setScore(1.0f); annotations.add(a); } return annotations; } /** * Annotate a query with an annotator 'as is'. * * @param query * the query * @param textID * an unique id for the query. * @param annotator * the annotator to tag the query. * @return the list of annotations. */ public List<Annotation> annotatePure(String query, String textID, Sa2WSystem annotator) { List<Annotation> annotations = new ArrayList<Annotation>(); HashSet<it.unipi.di.acube.batframework.data.ScoredAnnotation> res = annotator .solveSa2W(query); System.out.printf(annotator.getName() + " found %d annotations.%n", res.size()); HashMap<Annotation, String> annToTitle = new HashMap<>(); for (it.unipi.di.acube.batframework.data.ScoredAnnotation ann : res) { Annotation a = new Annotation(); a.setQid(textID); a.setInterpretationSet(0); int wid = ann.getConcept(); String title = null; try { title = wikiApi.getTitlebyId(wid); } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e); } String mid = wikiToFreeb.getFreebaseId(title); annToTitle.put(a, title); System.out.printf("Annotation: wid=%d mid=%s title=%s%n", wid, mid, title); if (mid == null) continue; a.setPrimaryId(mid); a.setMentionText("null"); /* * a.setMentionText(query.substring(ann.getPosition(), * ann.getPosition() + ann.getLength())); */ a.setScore(ann.getScore()); annotations.add(a); } return annotations; } /** * Handler for an annotation call. Depending on the runId, it calls * different methods to annotate a query. * * @param runId * the runId from which the configuation is picked. * @param query * the query. * @param textID * an unique id for the query. * @return the annotations of the query. */ public List<Annotation> annotate(String runId, String textID, String query) { if (runId.startsWith("miao")) { String modelFileEF = GenerateModel.getModelFileNameBaseEF( new Integer[] {1, 2, 3, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 33, 34, 35, 36, 37}, 3.8, 5.6, 0.06, 0.03, 5.0) + "_" + "ANW"; runId = String.format(SMAPH_PARAMS_FORMAT, "wikisense", 0.0, "COMMONNESS", "base", "jaccard", 0.6f, "Frequency", 0.06, "SvmEntityFilter", modelFileEF, "NoEmptyQueryFilter", "null", "Annotator+NormalSearch+WikiSearch10"); // <--------------------------- } if (runId.equals("___reset_models")) { System.out.println("Invalidating SVM models..."); libSvmEntityFilter = null; return new Vector<>(); } if (runId.equals("___flush_cache")) { System.out.println("Flushing cache..."); try { SmaphAnnotator.flush(); wikiApi.flush(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return new Vector<>(); } String auxAnnotator = ""; String minLp = ""; String sortBy = ""; String method = ""; String relatedness = ""; String epsilon = ""; String spotFilterName = ""; String entityFilterName = ""; String svmEntityFilterModelBase = ""; float spotFilterThreshold = -1; String emptyQueryFilterName = ""; String svmEmptyQueryFilterModelBase = ""; Vector<String> entitySources = new Vector<>(); boolean includeSourceAnnotator = false; boolean includeSourceNormalSearch = false; boolean includeSourceWikiSearch = false; int wikiSearchPages = 0; boolean includeSourceAnnotatorCandidates = false; int topKannotatorCandidates = 0; boolean includeSourceRelatedSearch = false; int topKRelatedSearch = 0; { double[][] paramsToTest = new double[][] { /*{ 0.01, 1 }, { 0.01, 5 }, { 0.01, 10 }, { 0.03, 1 }, { 0.03, 5 }, { 0.03, 10 }, { 0.044, 1 }, { 0.044, 5 }, { 0.044, 10 }, { 0.06, 1 }, { 0.06, 5 }, { 0.06, 10 },*/ { 0.03, 5 }, }; double[][] weightsToTest = new double[][] { { 3.8, 5.6 }, }; Integer[][] featuresSetsToTest = new Integer[][] { {1, 2, 3, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 33, 34, 35, 36, 37}, }; if (runId.startsWith("ftr_test_")) { String sources = runId.substring("ftr_test_".length(), "ftr_test_XXX".length()); int idftr = Integer.parseInt(runId.substring( "ftr_test_XXX_".length(), "ftr_test_XXX_XXX".length())); int idWeight = Integer.parseInt(runId.substring( "ftr_test_XXX_XXX_".length(), "ftr_test_XXX_XXX_XXX".length())); double edThr = Double.parseDouble(runId.substring( "ftr_test_XXX_XXX_XXX_".length(), "ftr_test_XXX_XXX_XXX_XXXX".length())); int idParam = Integer.parseInt(runId.substring( "ftr_test_XXX_XXX_XXX_XXXX_".length(), "ftr_test_XXX_XXX_XXX_XXXX_XXX".length())); String sourcesString = ""; for (char c : sources.toCharArray()) if (c == 'A') sourcesString += "Annotator+"; else if (c == 'W') sourcesString += "WikiSearch10+"; else if (c == 'N') sourcesString += "NormalSearch+"; sourcesString = sourcesString.substring(0, sourcesString.length() - 1); double wPos = weightsToTest[idWeight][0]; double wNeg = weightsToTest[idWeight][1]; double gamma = paramsToTest[idParam][0]; double C = paramsToTest[idParam][1]; String modelFileEF = GenerateModel.getModelFileNameBaseEF( featuresSetsToTest[idftr], wPos, wNeg, edThr, gamma, C) + "_" + sources; runId = String.format(SMAPH_PARAMS_FORMAT, "wikisense", 0.0, "COMMONNESS", "base", "jaccard", 0.6f, "Frequency", edThr, "SvmEntityFilter", modelFileEF, "NoEmptyQueryFilter", "null", sourcesString); } } if (runId.startsWith("BING-")) { for (String paramSet : runId.substring(5).split("&")) { if (paramSet.split("=").length == 1) continue; String paramName = paramSet.split("=")[0]; String paramValue = paramSet.split("=")[1]; if (paramName.equals("auxAnnotator")) auxAnnotator = paramValue; if (paramName.equals("minLp")) minLp = paramValue; if (paramName.equals("sortBy")) sortBy = paramValue; if (paramName.equals("method")) method = paramValue; if (paramName.equals("relatedness")) relatedness = paramValue; if (paramName.equals("spotFilterThreshold")) spotFilterThreshold = Float.parseFloat(paramValue); if (paramName.equals("epsilon")) epsilon = paramValue; if (paramName.equals("spotFilter")) spotFilterName = paramValue; if (paramName.equals("entityFilter")) entityFilterName = paramValue; if (paramName.equals("svmEntityFilterModelBase")) svmEntityFilterModelBase = paramValue; if (paramName.equals("emptyQueryFilter")) emptyQueryFilterName = paramValue; if (paramName.equals("svmEmptyQueryFilterModelBase")) svmEmptyQueryFilterModelBase = paramValue; if (paramName.equals("entitySources")) for (String srcName : paramValue.split("\\+")) entitySources.add(srcName); } int sourcesCount = 0; if (entitySources.contains("Annotator")) { includeSourceAnnotator = true; sourcesCount++; } if (entitySources.contains("NormalSearch")) { includeSourceNormalSearch = true; sourcesCount++; } for (String src : entitySources) { if (src.startsWith("WikiSearch")) { includeSourceWikiSearch = true; wikiSearchPages = Integer.parseInt(src .substring("WikiSearch".length())); sourcesCount++; } if (src.startsWith("AnnotatorCandidates")) { includeSourceAnnotatorCandidates = true; topKannotatorCandidates = Integer.parseInt(src .substring("AnnotatorCandidates".length())); sourcesCount++; } if (src.startsWith("RelatedSearch")) { includeSourceRelatedSearch = true; topKRelatedSearch = Integer.parseInt(src .substring("RelatedSearch".length())); sourcesCount++; } } if (sourcesCount != entitySources.size()) throw new RuntimeException("Unrecognized Source."); System.out .printf("Parameters: annotator=%s, minLp=%s, sortBy=%s, method=%s, relatedness=%s, spotFilter=%s, spotManagerThreshold=%f entityFilter=%s svmEntityFilterModel=%s emptyQueryFilterName=%s svmEmptyQueryFilterModel=%s includeSourceAnnotator=%b includeSourceNormalSearch=%b includeSourceWikiSearch=%b (wikiSearchPages=%d) includeSourceAnnotatorCandidates=%b (topKannotatorCandidates=%d)%n", auxAnnotator, minLp, sortBy, method, relatedness, spotFilterName, spotFilterThreshold, entityFilterName, svmEntityFilterModelBase, emptyQueryFilterName, svmEmptyQueryFilterModelBase, includeSourceAnnotator, includeSourceNormalSearch, includeSourceWikiSearch, wikiSearchPages, includeSourceAnnotatorCandidates, topKannotatorCandidates); WATAnnotator auxAnnotatorService = new WATAnnotator( "wikisense.mkapp.it", 80, method, sortBy, relatedness, epsilon, minLp, false, false, false); BoldFilter spotFilter = null; if (spotFilterName.equals("RankWeight")) spotFilter = new RankWeightBoldFilter(spotFilterThreshold); else if (spotFilterName.equals("Frequency")) spotFilter = new FrequencyBoldFilter(spotFilterThreshold); else if (spotFilterName.equals("EditDistanceSpotFilter")) spotFilter = new EditDistanceBoldFilter(spotFilterThreshold); else if (spotFilterName.equals("NoSpotFilter")) spotFilter = new NoBoldFilter(); EntityFilter entityFilter = null; if (entityFilterName.equals("NoEntityFilter")) entityFilter = new NoEntityFilter(); else if (entityFilterName.equals("SvmEntityFilter")) { synchronized (Annotator.class) { if (!svmEntityFilterModelBase.equals("") && (libSvmEntityFilter == null || !libSvmEntityFilter .getModel() .equals(svmEntityFilterModelBase))) { try { libSvmEntityFilter = new LibSvmEntityFilter( svmEntityFilterModelBase); } catch (IOException e) { e.printStackTrace(); throw new RuntimeException(e); } } entityFilter = libSvmEntityFilter; } } List<Annotation> res = annotatePure(query, textID, new SmaphAnnotator(auxAnnotatorService, spotFilter, entityFilter, new DummyLinkBack(), includeSourceAnnotator, includeSourceNormalSearch, includeSourceWikiSearch, wikiSearchPages, includeSourceAnnotatorCandidates, topKannotatorCandidates, includeSourceRelatedSearch, topKRelatedSearch, wikiApi, bingKey)); return res; } else if (runId.equals("tagme")) { if (tagme == null) { tagmeHost = SmaphConfig.getDefaultTagmeHost(); tagmeKey = SmaphConfig.getDefaultTagmeKey(); tagme = new TagmeAnnotator(tagmeHost, tagmeKey); } return annotatePure(query, textID, tagme); } else if (runId.equals("void")) return new Vector<>(); throw new RuntimeException("unrecognized runID=" + runId); } private static HashSet<MultipleAnnotation> deleteOverlappingAnnotations( HashSet<MultipleAnnotation> anns) { Vector<MultipleAnnotation> annsList = new Vector<MultipleAnnotation>( anns); HashSet<MultipleAnnotation> res = new HashSet<MultipleAnnotation>(); Collections.sort(annsList); for (int i = 0; i < annsList.size(); i++) { MultipleAnnotation bestCandidate = annsList.get(i); int j = i + 1; while (j < annsList.size() && bestCandidate.overlaps(annsList.get(j))) { if (bestCandidate.getLength() < annsList.get(j).getLength()) bestCandidate = annsList.get(j); j++; } i = j - 1; res.add(bestCandidate); } return res; } }