/** * Copyright 2014 Marco Cornolti * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package it.unipi.di.acube.batframework.systemPlugins; import java.io.*; import java.net.*; import java.util.*; import org.codehaus.jettison.json.JSONArray; import org.codehaus.jettison.json.JSONException; import org.codehaus.jettison.json.JSONObject; import it.unipi.di.acube.batframework.data.Annotation; import it.unipi.di.acube.batframework.data.Mention; import it.unipi.di.acube.batframework.data.MultipleAnnotation; import it.unipi.di.acube.batframework.data.ScoredAnnotation; import it.unipi.di.acube.batframework.data.ScoredTag; import it.unipi.di.acube.batframework.data.Tag; import it.unipi.di.acube.batframework.problems.CandidatesSpotter; import it.unipi.di.acube.batframework.problems.MentionSpotter; import it.unipi.di.acube.batframework.problems.Sa2WSystem; import it.unipi.di.acube.batframework.utils.AnnotationException; import it.unipi.di.acube.batframework.utils.ProblemReduction; import it.acubelab.smaph.SmaphAnnotatorDebugger; import it.acubelab.smaph.SmaphUtils; public class WATAnnotator implements Sa2WSystem, MentionSpotter, CandidatesSpotter { private static final int RETRY_N = 2; private long lastTime = 0; private boolean useContext, useTagger, bogusFilter; private final String urlTag; private final String urlSpot; private final String urlD2W; private final String method, relatedness, windowSize, minCommonness, minLinkProbability, epsilon, kappa; private String sortBy; private HashMap<String, HashMap<String, Double>> additionalInfo = new HashMap<>(); private HashMap<String, List<HashMap<String, Double>>> additionalCandidatesInfo = new HashMap<>(); private boolean brutalD2WReduction = false; private static HashMap<String, byte[]> url2jsonCache = new HashMap<>(); private static long flushCounter = 0; private static final int FLUSH_EVERY = 200; private static String resultsCacheFilename = null; public static synchronized void increaseFlushCounter() throws FileNotFoundException, IOException { flushCounter++; if ((flushCounter % FLUSH_EVERY) == 0) flush(); } public static synchronized void flush() throws FileNotFoundException, IOException { if (flushCounter > 0 && resultsCacheFilename != null) { SmaphAnnotatorDebugger.out.print("Flushing WikiSense cache... "); new File(resultsCacheFilename).createNewFile(); ObjectOutputStream oos = new ObjectOutputStream( new FileOutputStream(resultsCacheFilename)); oos.writeObject(url2jsonCache); oos.close(); SmaphAnnotatorDebugger.out .println("Flushing WikiSense cache Done."); } } public static void setCache(String cacheFilename) throws FileNotFoundException, IOException, ClassNotFoundException { if (resultsCacheFilename != null && resultsCacheFilename.equals(cacheFilename)) return; System.out.println("Loading wikisense cache..."); resultsCacheFilename = cacheFilename; if (new File(resultsCacheFilename).exists()) { ObjectInputStream ois = new ObjectInputStream(new FileInputStream( resultsCacheFilename)); url2jsonCache = (HashMap<String, byte[]>) ois.readObject(); ois.close(); } } public static void unSetCache() { url2jsonCache = new HashMap<>(); System.gc(); } public WATAnnotator(String ip, int port, String method) { this(ip, port, method, "PAGERANK", "mw", "", ""); } public WATAnnotator(String ip, int port, String method, String sortBy, String relatedness, String epsilon, String minLinkProbability) { this(ip, port, method, sortBy, relatedness, epsilon, minLinkProbability, false, false, false); } public WATAnnotator(String ip, int port, String method, String sortBy, String relatedness, String epsilon, String minLinkProbability, boolean useContext, boolean useTagger, boolean bogusFilter) { this.urlTag = String.format("http://%s:%d/tag/tag", ip, port); this.urlSpot = String.format("http://%s:%d/tag/spot", ip, port); this.urlD2W = String.format("http://%s:%d/tag/disambiguate", ip, port); this.method = method; this.epsilon = epsilon; this.windowSize = ""; this.minCommonness = ""; this.kappa = ""; this.useContext = useContext; this.useTagger = useTagger; this.bogusFilter = bogusFilter; this.minLinkProbability = minLinkProbability; this.sortBy = sortBy; this.relatedness = relatedness; } @Override public HashSet<Annotation> solveA2W(String text) throws AnnotationException { return null; } @Override public HashSet<Tag> solveC2W(String text) throws AnnotationException { // TODO Auto-generated method stub return null; } @Override public String getName() { return String .format("WikiSense (method=%s epsilon=%s usecontext=%b relatedness=%s sortby=%s)", method, epsilon.equals("") ? "default" : epsilon, useContext, relatedness, sortBy); } @Override public long getLastAnnotationTime() { return lastTime; } public HashSet<Annotation> solveD2WParams(String text, HashSet<Mention> mentions, String newMinCommonness, String newEpsilon, String kappa) throws JSONException { System.out.println(text.substring(0, Math.min(30, text.length()))); HashSet<Annotation> res = new HashSet<Annotation>(); JSONObject obj = null; try { obj = queryJson(text, mentions, urlD2W, generateGetParameters(newMinCommonness, newEpsilon, kappa), RETRY_N); System.out.println(obj); lastTime = obj.getJSONObject("time").getInt("total"); } catch (Exception e) { System.err .print("Got error while querying WikiSense API with GET parameters: " + generateGetParameters(newMinCommonness, newEpsilon, kappa) + " with text: " + text); e.printStackTrace(); throw new AnnotationException( "An error occurred while querying WikiSense API. Message: " + e.getMessage()); } JSONArray jsAnnotations = obj.getJSONArray("annotations"); for (int i = 0; i < jsAnnotations.length(); i++) { JSONObject js_ann = jsAnnotations.getJSONObject(i); // System.out.println(js_ann); int start = js_ann.getInt("start"); int end = js_ann.getInt("end"); int id = js_ann.getInt("id"); double lp = js_ann.getDouble("linkProb"); double commonness = js_ann.getDouble("commonness"); double rhoScore = js_ann.getDouble("rho"); double ambiguity = 1.0 / (1.0 + js_ann.getInt("ambiguity")); double localCoherence = js_ann.getDouble("localCoherence"); double pageRank = js_ann.getDouble("pageRank"); // System.out.println(text.substring(start, end) + "->" + id); Mention m = new Mention(start, end - start); if (mentions.contains(m)) res.add(new Annotation(m.getPosition(), m.getLength(), id)); String mention = text.substring(start, end); if (!additionalInfo.containsKey(mention)) additionalInfo.put(mention, new HashMap<String, Double>()); additionalInfo.get(mention).put("lp", lp); additionalInfo.get(mention).put("commonness", commonness); additionalInfo.get(mention).put("rhoScore", rhoScore); additionalInfo.get(mention).put("ambiguity", ambiguity); additionalInfo.get(mention).put("localCoherence", localCoherence); additionalInfo.get(mention).put("pageRank", pageRank); JSONArray jsRankings = js_ann.getJSONArray("ranking"); int rank = 0; for (int j = 0; j < jsRankings.length(); j++) { JSONObject jsRanking = jsRankings.getJSONObject(j); id = jsRanking.getInt("id"); commonness = jsRanking.getDouble("commonness"); double score = jsRanking.getDouble("score"); pageRank = jsRanking.getDouble("pageRank"); int synonimy = jsRanking.getInt("synonymy"); HashMap<String, Double> values = new HashMap<>(); values.put("id", (double) id); values.put("rank", (double) rank); values.put("commonness", (double) commonness); values.put("score", (double) score); values.put("pageRank", (double) pageRank); values.put("synonimy", (double) synonimy); values.put("lp", (double) lp); values.put("ambiguity", (double) ambiguity); if (!additionalCandidatesInfo.containsKey(mention)) additionalCandidatesInfo.put(mention, new Vector<HashMap<String, Double>>()); additionalCandidatesInfo.get(mention).add(values); rank++; } } return res; } @Override public HashSet<Annotation> solveD2W(String text, HashSet<Mention> mentions) throws AnnotationException { if (brutalD2WReduction) return ProblemReduction.Sa2WToD2W(this.solveSa2W(text), mentions, -1f); try { return solveD2WParams(text, mentions, minCommonness, epsilon, kappa); } catch (JSONException e) { e.printStackTrace(); throw new RuntimeException(e); } } @Override public HashSet<ScoredTag> solveSc2W(String text) throws AnnotationException { // System.out.println(text); HashSet<ScoredTag> res = new HashSet<ScoredTag>(); JSONObject obj = null; String getParameters = String.format("lang=%s", "en"); if (!method.equals("")) getParameters += String.format("&method=%s", method); if (!windowSize.equals("")) getParameters += String.format("&windowSize=%s", windowSize); if (!epsilon.equals("")) getParameters += String.format("&epsilon=%s", epsilon); if (!minCommonness.equals("")) getParameters += String.format("&minCommonness=%s", minCommonness); try { obj = queryJson(text, null, urlTag, getParameters, RETRY_N); lastTime = obj.getJSONObject("time").getInt("total"); } catch (Exception e) { System.out .print("Got error while querying WikiSense API with GET parameters: " + getParameters + " with text: " + text); throw new AnnotationException( "An error occurred while querying WikiSense API. Message: " + e.getMessage()); } try { JSONArray jsAnnotations = obj.getJSONArray("annotations"); for (int i = 0; i < jsAnnotations.length(); i++) { JSONObject js_ann = jsAnnotations.getJSONObject(i); JSONArray jsRanking = js_ann.getJSONArray("ranking"); // System.out.println(jsRanking); for (int j = 0; j < jsRanking.length(); j++) { JSONObject jsCand = jsRanking.getJSONObject(j); int id = jsCand.getInt("id"); double rho = jsCand.getDouble("score"); // System.out.println(id + " (" + rho + ")"); res.add(new ScoredTag(id, (float) rho)); } } } catch (JSONException e) { e.printStackTrace(); throw new AnnotationException(e.getMessage()); } return res; } @Override public HashSet<ScoredAnnotation> solveSa2W(String text) throws AnnotationException { // System.out.println(text); HashSet<ScoredAnnotation> res = new HashSet<ScoredAnnotation>(); JSONObject obj = null; try { obj = queryJson(text, null, urlTag, generateGetParameters(minCommonness, epsilon, kappa), RETRY_N); lastTime = obj.getJSONObject("time").getInt("total"); } catch (Exception e) { System.out .print("Got error while querying WikiSense API with GET parameters: " + generateGetParameters(minCommonness, epsilon, kappa) + " with text: " + text); throw new AnnotationException( "An error occurred while querying WikiSense API. Message: " + e.getMessage()); } try { JSONArray jsAnnotations = obj.getJSONArray("annotations"); for (int i = 0; i < jsAnnotations.length(); i++) { JSONObject js_ann = jsAnnotations.getJSONObject(i); // System.out.println(js_ann); int start = js_ann.getInt("start"); int end = js_ann.getInt("end"); int id = js_ann.getInt("id"); double rho = js_ann.getDouble("rho"); // System.out.println(text.substring(start, end) + "->" + id + // " (" // + rho + ")"); res.add(new ScoredAnnotation(start, end - start, id, (float) rho)); } } catch (JSONException e) { e.printStackTrace(); throw new AnnotationException(e.getMessage()); } return res; } @Override public HashSet<Mention> getSpottedMentions(String text) { HashSet<Mention> res = new HashSet<Mention>(); JSONObject obj = null; String getParameters = String.format("lang=%s", "en", method); try { obj = queryJson(text, null, urlSpot, getParameters, RETRY_N); // System.out.println(obj); } catch (Exception e) { System.out .print("Got error while querying WikiSense API with GET parameters: " + getParameters + " with text: " + text); throw new AnnotationException( "An error occurred while querying WikiSense API. Message: " + e.getMessage()); } try { JSONArray jsSpots = obj.getJSONArray("spots"); for (int i = 0; i < jsSpots.length(); i++) { JSONObject jsSpot = jsSpots.getJSONObject(i); // System.out.println(jsSpot); int start = jsSpot.getInt("start"); int end = jsSpot.getInt("end"); // System.out.printf("Found spot: [%s]%n", text.substring(start, // end)); Mention newMention = new Mention(start, end - start); res.add(newMention); } } catch (JSONException e) { e.printStackTrace(); throw new AnnotationException(e.getMessage()); } return res; } private String generateGetParameters(String newMinCommonness, String newEpsilon, String newKappa) { String getParameters = String.format("lang=%s", "en"); if (!method.equals("")) getParameters += String.format("&method=%s", method); if (!windowSize.equals("")) getParameters += String.format("&windowSize=%s", windowSize); if (!newEpsilon.equals("")) getParameters += String.format("&epsilon=%s", newEpsilon); if (!newMinCommonness.equals("")) getParameters += String.format("&minCommonness=%s", newMinCommonness); if (!newKappa.equals("")) getParameters += String.format("&kappa=%s", newKappa); if (!minLinkProbability.equals("")) getParameters += String.format("&minLinkProbability=%s", minLinkProbability); if (!relatedness.equals("")) getParameters += String.format("&relatedness=%s", relatedness); if (!sortBy.equals("")) getParameters += String.format("&sortBy=%s", sortBy); getParameters += "&bogusFilter=" + this.bogusFilter; getParameters += "&useTagger=" + this.useTagger; getParameters += "&useContext=" + this.useContext; return getParameters; } private JSONObject queryJson(String text, Set<Mention> mentions, String url, String getParameters, int retry) throws Exception { JSONObject parameters = new JSONObject(); if (mentions != null) { JSONArray mentionsJson = new JSONArray(); for (Mention m : mentions) { JSONObject mentionJson = new JSONObject(); mentionJson.put("start", m.getPosition()); mentionJson.put("end", m.getPosition() + m.getLength()); mentionsJson.put(mentionJson); } parameters.put("spans", mentionsJson); } parameters.put("text", text); System.out.println(getParameters); System.out.println(parameters.toString()); String resultStr = null; try { URL wikiSenseApi = new URL(String.format("%s?%s", url, getParameters)); String cacheKey = wikiSenseApi.toExternalForm() + parameters.toString(); byte[] compressed = url2jsonCache.get(cacheKey); if (compressed != null) return new JSONObject(SmaphUtils.decompress(compressed)); HttpURLConnection slConnection = (HttpURLConnection) wikiSenseApi .openConnection(); slConnection.setReadTimeout(0); slConnection.setDoOutput(true); slConnection.setDoInput(true); slConnection.setRequestMethod("POST"); slConnection.setRequestProperty("Content-Type", "application/json"); slConnection.setRequestProperty("Content-Length", "" + parameters.toString().getBytes().length); slConnection.setUseCaches(false); DataOutputStream wr = new DataOutputStream( slConnection.getOutputStream()); wr.write(parameters.toString().getBytes()); wr.flush(); wr.close(); if (slConnection.getResponseCode() != 200) { Scanner s = new Scanner(slConnection.getErrorStream()) .useDelimiter("\\A"); System.err.printf("Got HTTP error %d. Message is: %s%n", slConnection.getResponseCode(), s.next()); s.close(); } Scanner s = new Scanner(slConnection.getInputStream()) .useDelimiter("\\A"); resultStr = s.hasNext() ? s.next() : ""; JSONObject obj = new JSONObject(resultStr); url2jsonCache.put(cacheKey, SmaphUtils.compress(obj.toString())); increaseFlushCounter(); return obj; } catch (Exception e) { e.printStackTrace(); try { Thread.sleep(3000); if (retry > 0) return queryJson(text, mentions, url, getParameters, retry - 1); else throw e; } catch (InterruptedException e1) { e1.printStackTrace(); throw new RuntimeException(e1); } } } @Override public HashSet<MultipleAnnotation> getSpottedCandidates(String text) { HashSet<MultipleAnnotation> res = new HashSet<MultipleAnnotation>(); JSONObject obj = null; String getParameters = String.format( "lang=%s&includeEntities=true&sortBy=SCORE", "en"); try { obj = queryJson(text, null, urlSpot, getParameters, RETRY_N); } catch (Exception e) { System.out .print("Got error while querying WikiSense API with GET parameters: " + getParameters + " with text: " + text); throw new AnnotationException( "An error occurred while querying WikiSense API. Message: " + e.getMessage()); } try { JSONArray jsSpots = obj.getJSONArray("spots"); for (int i = 0; i < jsSpots.length(); i++) { JSONObject jsSpot = jsSpots.getJSONObject(i); int start = jsSpot.getInt("start"); int end = jsSpot.getInt("end"); JSONArray jsRanking = jsSpot.getJSONArray("ranking"); int[] rankedCandidates = new int[jsRanking.length()]; for (int j = 0; j < jsRanking.length(); j++) { JSONObject jsCand = jsRanking.getJSONObject(j); int id = jsCand.getInt("id"); rankedCandidates[j] = id; } MultipleAnnotation newAnnotation = new MultipleAnnotation( start, end - start, rankedCandidates); res.add(newAnnotation); } } catch (JSONException e) { e.printStackTrace(); throw new AnnotationException(e.getMessage()); } return res; } public HashMap<String, HashMap<String, Double>> getLastQueryAdditionalInfo() { HashMap<String, HashMap<String, Double>> clone = new HashMap<>( additionalInfo); additionalInfo.clear(); return clone; } public HashMap<String, List<HashMap<String, Double>>> getLastQueryAdditionalCandidatesInfo() { HashMap<String, List<HashMap<String, Double>>> clone = new HashMap<>( additionalCandidatesInfo); additionalCandidatesInfo.clear(); return clone; } public void setBrutalD2WReduction() { this.brutalD2WReduction = true; } }