/**
 * Copyright 2014 Marco Cornolti
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package it.acubelab.smaph;

import it.unipi.di.acube.batframework.data.*;
import it.unipi.di.acube.batframework.problems.Sa2WSystem;
import it.unipi.di.acube.batframework.systemPlugins.WATAnnotator;
import it.unipi.di.acube.batframework.utils.*;
import it.acubelab.smaph.boldfilters.*;
import it.acubelab.smaph.entityfilters.*;
import it.acubelab.smaph.linkback.LinkBack;
import it.acubelab.smaph.main.ERDDatasetFilter;
import it.cnr.isti.hpc.erd.WikipediaToFreebase;

import java.io.*;
import java.net.*;
import java.util.*;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;

import org.apache.commons.lang3.tuple.ImmutableTriple;
import org.apache.commons.lang3.tuple.Triple;
import org.codehaus.jettison.json.*;
import org.xml.sax.SAXException;

public class SmaphAnnotator implements Sa2WSystem {
    private static final String WIKI_URL_LEADING = "http://en.wikipedia.org/wiki/";
    private static final int BING_RETRY = 3;
    private static final int FLUSH_EVERY = 50;
    public static final String WIKITITLE_ENDPAR_REGEX = "\\s*\\([^\\)]*\\)\\s*$";
    private static HashMap<String, byte[]> url2jsonCache = new HashMap<>();
    private static String resultsCacheFilename;
    private static int flushCounter = 0;

    private String bingKey;
    private WikipediaApiInterface wikiApi;
    private WATAnnotator auxDisambiguator;
    private BoldFilter boldFilter;
    private EntityFilter entityFilter;
    private LinkBack linkBack;
    private boolean includeSourceNormalSearch;
    private boolean includeSourceAnnotator;
    private boolean includeSourceWikiSearch;
    private int topKWikiSearch = 0;
    private boolean includeSourceRelatedSearch;
    private int topKRelatedSearch;
    private SmaphAnnotatorDebugger debugger;

    /**
     * Constructs a SMAPH annotator.
     *
     * @param auxDisambiguator
     *            the disambiguator used in Source 1.
     * @param boldFilter
     *            the filter of the bolds used in Source 1.
     * @param entityFilter
     *            the entity filter used in the second stage.
     * @param linkBack
     *            the link-back strategy used to bind accepted entities to
     *            query mentions.
     * @param includeSourceAnnotator
     *            true iff Source 1 has to be enabled.
     * @param includeSourceNormalSearch
     *            true iff Source 2 has to be enabled.
     * @param includeSourceWikiSearch
     *            true iff Source 3 has to be enabled.
     * @param wikiSearchPages
     *            Source 3 results limit.
     * @param includeSourceAnnotatorTopK
     *            currently unused.
     * @param topKAnnotatorCandidates
     *            currently unused.
     * @param includeRelatedSearch
     *            true iff Source 4 has to be enabled.
     * @param topKRelatedSearch
     *            Source 4 results limit.
     * @param wikiApi
     *            an API to Wikipedia.
     * @param bingKey
     *            the key to the Bing API.
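     *
     *            <p>
     *            Construction sketch (argument values below are hypothetical,
     *            chosen only for illustration; the collaborator objects are
     *            assumed to be built elsewhere):
     *
     *            <pre>{@code
     * SmaphAnnotator smaph = new SmaphAnnotator(watAnnotator, boldFilter,
     *         entityFilter, linkBack, true, true, true, 10, false, 0, true,
     *         5, wikiApi, "MY-BING-KEY");
     * HashSet<ScoredAnnotation> res = smaph.solveSa2W("armstrong moon landing");
     * }</pre>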
     */
    public SmaphAnnotator(WATAnnotator auxDisambiguator,
            BoldFilter boldFilter, EntityFilter entityFilter,
            LinkBack linkBack, boolean includeSourceAnnotator,
            boolean includeSourceNormalSearch,
            boolean includeSourceWikiSearch, int wikiSearchPages,
            boolean includeSourceAnnotatorTopK, int topKAnnotatorCandidates,
            boolean includeRelatedSearch, int topKRelatedSearch,
            WikipediaApiInterface wikiApi, String bingKey) {
        this.auxDisambiguator = auxDisambiguator;
        this.boldFilter = boldFilter;
        this.entityFilter = entityFilter;
        this.linkBack = linkBack;
        this.wikiApi = wikiApi;
        this.includeSourceAnnotator = includeSourceAnnotator;
        this.includeSourceNormalSearch = includeSourceNormalSearch;
        this.includeSourceWikiSearch = includeSourceWikiSearch;
        this.topKWikiSearch = wikiSearchPages;
        this.includeSourceRelatedSearch = includeRelatedSearch;
        this.topKRelatedSearch = topKRelatedSearch;
        this.bingKey = bingKey;
    }

    /**
     * Sets an optional debugger to gather data about the processing of a
     * query.
     *
     * @param debugger
     *            the debugger.
     */
    public void setDebugger(SmaphAnnotatorDebugger debugger) {
        this.debugger = debugger;
    }

    private static synchronized void increaseFlushCounter()
            throws FileNotFoundException, IOException {
        flushCounter++;
        if (flushCounter % FLUSH_EVERY == 0)
            flush();
    }

    /**
     * Flushes the cache of Bing responses to disk.
     *
     * @throws FileNotFoundException
     *             if the file exists but is a directory rather than a regular
     *             file, does not exist but cannot be created, or cannot be
     *             opened for any other reason.
     * @throws IOException
     *             if an I/O error occurred.
     */
    public static synchronized void flush() throws FileNotFoundException,
            IOException {
        if (flushCounter > 0 && resultsCacheFilename != null) {
            SmaphAnnotatorDebugger.out.print("Flushing Bing cache... ");
            new File(resultsCacheFilename).createNewFile();
            ObjectOutputStream oos = new ObjectOutputStream(
                    new FileOutputStream(resultsCacheFilename));
            oos.writeObject(url2jsonCache);
            oos.close();
            SmaphAnnotatorDebugger.out.println("Flushing Bing cache done.");
        }
    }

    @Override
    public HashSet<Annotation> solveA2W(String text) throws AnnotationException {
        return ProblemReduction.Sa2WToA2W(solveSa2W(text));
    }

    @Override
    public HashSet<Tag> solveC2W(String text) throws AnnotationException {
        return ProblemReduction.A2WToC2W(ProblemReduction
                .Sa2WToA2W(solveSa2W(text)));
    }

    @Override
    public String getName() {
        return "Smaph annotator";
    }

    @Override
    public long getLastAnnotationTime() {
        return 0;
    }

    @Override
    public HashSet<Annotation> solveD2W(String text, HashSet<Mention> mentions)
            throws AnnotationException {
        // D2W is not supported by this annotator.
        return null;
    }

    @Override
    public HashSet<ScoredTag> solveSc2W(String text) throws AnnotationException {
        // Sc2W is not supported by this annotator.
        return null;
    }

    /**
     * Calls the auxiliary disambiguator and disambiguates the bolds.
     *
     * @param text
     *            the concatenated bolds.
     * @param mentions
     *            the mentions (one per bold).
     * @return a pair containing, as first element, the additional info
     *         returned by the annotator for the query and, as second element,
     *         the mapping from each bold to its annotation.
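     *         <p>
     *         For illustration (hypothetical input): for the bolds
     *         {@code ["barack obama", "wife"]}, the concatenated text built by
     *         {@code concatenateBolds} is {@code "barack obama wife "} and the
     *         mentions are (0, 12) and (13, 4), so every annotation found on
     *         the text can be mapped back to the bold it covers.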
     * @throws IOException
     * @throws XPathExpressionException
     * @throws ParserConfigurationException
     * @throws SAXException
     */
    private Pair<HashMap<String, HashMap<String, Double>>, HashMap<String, Annotation>> disambiguateBolds(
            String text, HashSet<Mention> mentions) throws IOException,
            XPathExpressionException, ParserConfigurationException,
            SAXException {
        HashSet<Annotation> anns = auxDisambiguator.solveD2W(text, mentions);
        if (anns == null)
            return new Pair<HashMap<String, HashMap<String, Double>>, HashMap<String, Annotation>>(
                    new HashMap<String, HashMap<String, Double>>(),
                    new HashMap<String, Annotation>());

        List<Integer> widsToPrefetch = new Vector<>();
        for (Annotation ann : anns)
            widsToPrefetch.add(ann.getConcept());
        wikiApi.prefetchWids(widsToPrefetch);

        HashMap<String, Annotation> spotToAnnotation = new HashMap<>();
        for (Annotation ann : anns)
            spotToAnnotation.put(
                    text.substring(ann.getPosition(),
                            ann.getPosition() + ann.getLength()), ann);
        return new Pair<HashMap<String, HashMap<String, Double>>, HashMap<String, Annotation>>(
                auxDisambiguator.getLastQueryAdditionalInfo(),
                spotToAnnotation);
    }

    @Override
    public HashSet<ScoredAnnotation> solveSa2W(String query)
            throws AnnotationException {
        if (debugger != null)
            debugger.addProcessedQuery(query);

        HashSet<ScoredAnnotation> annotations = new HashSet<>();
        try {
            /* Search the query on Bing. */
            List<Pair<String, Integer>> bingBoldsAndRankNS = null;
            List<String> urls = null;
            List<String> relatedSearchRes = null;
            Triple<Integer, Double, JSONObject> resCountAndWebTotalNS = null;
            int resultsCount = -1;
            double webTotalNS = Double.NaN;
            List<String> filteredBolds = null;
            HashMap<Integer, Integer> rankToIdNS = null;
            HashMap<Integer, HashSet<String>> rankToBoldsNS = null;
            List<Pair<String, Vector<Pair<Integer, Integer>>>> snippetsToBolds = null;
            if (includeSourceAnnotator || includeSourceWikiSearch
                    || includeSourceRelatedSearch || includeSourceNormalSearch) {
                bingBoldsAndRankNS = new Vector<>();
                urls = new Vector<>();
                relatedSearchRes = new Vector<>();
                snippetsToBolds = new Vector<>();
                resCountAndWebTotalNS = takeBingData(query, bingBoldsAndRankNS,
                        urls, relatedSearchRes, snippetsToBolds,
                        Integer.MAX_VALUE, false);
                resultsCount = resCountAndWebTotalNS.getLeft();
                webTotalNS = resCountAndWebTotalNS.getMiddle();
                filteredBolds = boldFilter.filterBolds(query,
                        bingBoldsAndRankNS, resultsCount);
                rankToIdNS = urlsToRankID(urls);
                rankToBoldsNS = new HashMap<>();
                SmaphUtils.mapRankToBoldsLC(bingBoldsAndRankNS, rankToBoldsNS,
                        null);

                if (debugger != null) {
                    debugger.addBoldPositionEditDistance(query,
                            bingBoldsAndRankNS);
                    debugger.addSnippets(query, snippetsToBolds);
                    debugger.addBoldFilterOutput(query, filteredBolds);
                    debugger.addSource2SearchResult(query, rankToIdNS, urls);
                    debugger.addBingResponseNormalSearch(query,
                            resCountAndWebTotalNS.getRight());
                }
            }

            /*
             * Do the Wikipedia-search on Bing.
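             * Source 3 re-issues the query with " wikipedia" appended (see
             * takeBingData with wikisearch=true) and keeps only the top
             * topKWikiSearch results.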
             */
            List<String> wikiSearchUrls = new Vector<>();
            List<Pair<String, Integer>> bingBoldsAndRankWS = new Vector<>();
            HashMap<String, Pair<Integer, Integer>> annTitlesToIdAndRankWS = null;
            Triple<Integer, Double, JSONObject> resCountAndWebTotalWS = null;
            HashMap<Integer, HashSet<String>> rankToBoldsWS = null;
            double webTotalWS = Double.NaN;
            if (includeSourceWikiSearch || includeSourceNormalSearch) {
                resCountAndWebTotalWS = takeBingData(query, bingBoldsAndRankWS,
                        wikiSearchUrls, null, null, topKWikiSearch, true);
                webTotalWS = resCountAndWebTotalWS.getMiddle();
                HashMap<Integer, Integer> rankToIdWikiSearch = urlsToRankID(wikiSearchUrls);
                rankToBoldsWS = new HashMap<>();
                SmaphUtils.mapRankToBoldsLC(bingBoldsAndRankWS, rankToBoldsWS,
                        null);
                if (debugger != null) {
                    debugger.addSource3SearchResult(query, rankToIdWikiSearch,
                            wikiSearchUrls);
                    debugger.addBingResponseWikiSearch(query,
                            resCountAndWebTotalWS.getRight());
                }
                annTitlesToIdAndRankWS = adjustTitles(rankToIdWikiSearch);
            }

            /* Do the related-search on Bing. */
            String relatedSearch = null;
            List<String> relatedSearchUrls = null;
            List<Pair<String, Integer>> bingBoldsAndRankRS = null;
            HashMap<Integer, Integer> rankToIdRelatedSearch = null;
            HashMap<String, Pair<Integer, Integer>> annTitlesToIdAndRankRS = null;
            double webTotalRelatedSearch = Double.NaN;
            HashMap<Integer, HashSet<String>> rankToBoldsRS = null;
            if (includeSourceRelatedSearch) {
                relatedSearch = getRelatedSearch(relatedSearchRes, query);
                relatedSearchUrls = new Vector<>();
                bingBoldsAndRankRS = new Vector<>();
                Triple<Integer, Double, JSONObject> resCountAndWebTotalRS = takeBingData(
                        query, bingBoldsAndRankRS, relatedSearchUrls, null,
                        null, topKRelatedSearch, false);
                webTotalRelatedSearch = resCountAndWebTotalRS.getMiddle();
                rankToIdRelatedSearch = urlsToRankID(relatedSearchUrls);
                annTitlesToIdAndRankRS = adjustTitles(rankToIdRelatedSearch);
                rankToBoldsRS = new HashMap<>();
                SmaphUtils.mapRankToBoldsLC(bingBoldsAndRankRS, rankToBoldsRS,
                        null);
            }

            /* Annotate bolds on the annotator. */
            Pair<HashMap<String, HashMap<String, Double>>, HashMap<String, Annotation>> infoAndAnnotations = null;
            HashMap<String, Annotation> spotToAnnotation = null;
            HashMap<String, HashMap<String, Double>> additionalInfo = null;
            Pair<String, HashSet<Mention>> annInput = null;
            if (includeSourceAnnotator) {
                annInput = concatenateBolds(filteredBolds);
                infoAndAnnotations = disambiguateBolds(annInput.first,
                        annInput.second);
                spotToAnnotation = infoAndAnnotations.second;
                additionalInfo = infoAndAnnotations.first;
                if (debugger != null)
                    debugger.addReturnedAnnotation(query, spotToAnnotation);
            }

            HashMap<String[], Tag> boldsToAcceptedEntity = new HashMap<>();

            // Filter and add annotations found by the disambiguator
            if (includeSourceAnnotator) {
                for (String bold : filteredBolds) {
                    if (spotToAnnotation.containsKey(bold)) {
                        Annotation ann = spotToAnnotation.get(bold);
                        HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesAnnotator(
                                query, resultsCount, ann, annInput,
                                bingBoldsAndRankNS, additionalInfo);
                        boolean accept = entityFilter.filterEntity(ESFeatures);
                        if (accept)
                            boldsToAcceptedEntity.put(new String[] { bold },
                                    new Tag(ann.getConcept()));
                        if (debugger != null) {
                            HashSet<String> bolds = new HashSet<>();
                            bolds.add(bold);
                            debugger.addQueryCandidateBolds(query, "Source 1",
                                    ann.getConcept(), bolds);
                            debugger.addEntityFeaturesS1(query, bold,
                                    ann.getConcept(), ESFeatures, accept);
                            if (accept)
                                debugger.addResult(query, ann.getConcept());
                        }
                    }
                }
            }

            // Filter and add entities found in the normal search
            if (includeSourceNormalSearch) {
                for (int rank : rankToIdNS.keySet()) {
                    int wid = rankToIdNS.get(rank);
                    HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesSearch(
                            query, wid, rank, webTotalNS, webTotalWS,
                            bingBoldsAndRankNS, 2);
                    HashSet<String> bolds = rankToBoldsNS.get(rank);
                    boolean accept = entityFilter.filterEntity(ESFeatures);
                    if (accept)
                        boldsToAcceptedEntity.put(
                                bolds.toArray(new String[] {}), new Tag(wid));
                    if (debugger != null) {
                        debugger.addQueryCandidateBolds(query, "Source 2",
                                wid, bolds);
                        debugger.addEntityFeaturesS2(query, wid, ESFeatures,
                                accept);
                        if (accept)
                            debugger.addResult(query, wid);
                    }
                }
            }

            // Filter and add entities found in the Wikipedia-search
            if (includeSourceWikiSearch) {
                for (String annotatedTitleWS : annTitlesToIdAndRankWS.keySet()) {
                    int wid = annTitlesToIdAndRankWS.get(annotatedTitleWS).first;
                    int rank = annTitlesToIdAndRankWS.get(annotatedTitleWS).second;
                    HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesSearch(
                            query, wid, rank, webTotalNS, webTotalWS,
                            bingBoldsAndRankWS, 3);
                    HashSet<String> bolds = rankToBoldsWS.get(rank);
                    boolean accept = entityFilter.filterEntity(ESFeatures);
                    if (accept)
                        boldsToAcceptedEntity.put(
                                bolds.toArray(new String[] {}), new Tag(wid));
                    if (debugger != null) {
                        debugger.addQueryCandidateBolds(query, "Source 3",
                                wid, bolds);
                        debugger.addEntityFeaturesS3(query, wid, ESFeatures,
                                accept);
                        if (accept)
                            debugger.addResult(query, wid);
                    }
                }
            }

            // Filter and add entities found in the related-search
            if (includeSourceRelatedSearch) {
                for (String annotatedTitleRS : annTitlesToIdAndRankRS.keySet()) {
                    int wid = annTitlesToIdAndRankRS.get(annotatedTitleRS).first;
                    int rank = annTitlesToIdAndRankRS.get(annotatedTitleRS).second;
                    HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesSearch(
                            relatedSearch, wid, rank, webTotalNS,
                            webTotalRelatedSearch, bingBoldsAndRankRS, 5);
                    HashSet<String> bolds = rankToBoldsRS.get(rank);
                    boolean accept = entityFilter.filterEntity(ESFeatures);
                    if (accept)
                        boldsToAcceptedEntity.put(
                                bolds.toArray(new String[] {}), new Tag(wid));
                }
            }

            /* Link accepted entities back to query mentions. */
            annotations = linkBack.linkBack(query, boldsToAcceptedEntity);

        } catch (Exception e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }

        SmaphAnnotatorDebugger.out.printf("*** END :%s ***%n", query);

        return annotations;
    }

    /**
     * @param relatedSearchRes
     *            the related searches suggested by Bing for the first query.
     * @param query
     *            the input query.
     * @return the best related search for Source 4.
     */
    private static String getRelatedSearch(List<String> relatedSearchRes,
            String query) {
        if (relatedSearchRes.isEmpty())
            return null;
        List<String> qTokens = SmaphUtils.tokenize(query);
        List<String> rsTokens = SmaphUtils.tokenize(relatedSearchRes.get(0));

        // Keep only the tokens of the related search that approximately match
        // (normalized edit distance below 0.5) some token of the query.
        String newSearch = "";
        int insertedTokens = 0;
        for (String rsToken : rsTokens)
            for (String qToken : qTokens)
                if (SmaphUtils.getNormEditDistance(qToken, rsToken) < 0.5) {
                    newSearch += rsToken + " ";
                    insertedTokens++;
                    break;
                }
        if (insertedTokens == 0)
            return null;
        if (newSearch.isEmpty())
            return null;
        if (newSearch.charAt(newSearch.length() - 1) == ' ')
            newSearch = newSearch.substring(0, newSearch.length() - 1);
        return newSearch;
    }

    /**
     * Adjusts the titles of the retrieved Wikipedia pages, e.g. removing the
     * final parenthetical.
     *
     * @param rankToIdWS
     *            a mapping from a rank (position in the search engine
     *            results) to the Wikipedia ID of the page in that rank.
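     *            <p>
     *            For instance (hypothetical title): with
     *            {@link #WIKITITLE_ENDPAR_REGEX}, a page titled
     *            "Jaguar (animal)" is indexed under the adjusted title
     *            "Jaguar".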
     * @return a mapping from adjusted titles to a pair &lt;wid, rank&gt;.
     */
    private HashMap<String, Pair<Integer, Integer>> adjustTitles(
            HashMap<Integer, Integer> rankToIdWS) {
        HashMap<String, Pair<Integer, Integer>> res = new HashMap<>();
        for (int rank : rankToIdWS.keySet()) {
            int wid = rankToIdWS.get(rank);
            try {
                String title = wikiApi.getTitlebyId(wid);
                if (title != null) {
                    title = title.replaceAll(WIKITITLE_ENDPAR_REGEX, "");
                    res.put(title, new Pair<Integer, Integer>(wid, rank));
                }
            } catch (IOException e) {
                e.printStackTrace();
                throw new RuntimeException(e);
            }
        }
        return res;
    }

    /**
     * Concatenates the bolds passed by argument.
     *
     * @param bolds
     *            the bolds to concatenate.
     * @return the concatenation of the bolds and the set of their mentions.
     */
    private static Pair<String, HashSet<Mention>> concatenateBolds(
            List<String> bolds) {
        HashSet<Mention> mentions = new HashSet<Mention>();
        String concat = "";
        for (String spot : bolds) {
            int mentionStart = concat.length();
            int mentionEnd = mentionStart + spot.length() - 1;
            mentions.add(new Mention(mentionStart, mentionEnd - mentionStart
                    + 1));
            concat += spot + " ";
        }
        return new Pair<String, HashSet<Mention>>(concat, mentions);
    }

    /**
     * @param bingReply
     *            Bing's reply.
     * @return whether the query to Bing failed and has to be re-issued.
     * @throws JSONException
     *             if the Bing result could not be read.
     */
    private static boolean recacheNeeded(JSONObject bingReply)
            throws JSONException {
        if (bingReply == null)
            return true;
        JSONObject data = (JSONObject) bingReply.get("d");
        if (data == null)
            return true;
        JSONObject results = (JSONObject) ((JSONArray) data.get("results"))
                .get(0);
        if (results == null)
            return true;
        JSONArray webResults = (JSONArray) results.get("Web");
        if (webResults == null)
            return true;
        if (((String) results.get("WebTotal")).equals(""))
            return true;
        return false;
    }

    /**
     * Issues a query to Bing and extracts the result.
     *
     * @param query
     *            the query to be issued to Bing.
     * @param boldsAndRanks
     *            storage for the bolds (a pair &lt;bold, rank&gt; means the
     *            bold appeared in the snippets of the result in position
     *            rank).
     * @param urls
     *            storage for the urls found by Bing.
     * @param relatedSearch
     *            storage for the "related search" suggestions.
     * @param snippetsToBolds
     *            storage for the list of pairs &lt;snippet, the bolds found
     *            in that snippet&gt;.
     * @param topk
     *            limit to top-k results.
     * @param wikisearch
     *            whether to append the word "wikipedia" to the query or not.
     * @return a triple &lt;results, webTotal, bingReply&gt; where results is
     *         the number of results returned by Bing, webTotal is the number
     *         of pages found by Bing, and bingReply is the raw Bing reply.
     * @throws Exception
     *             if something went wrong while querying Bing.
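     *             <p>
     *             For reference, a reply is expected to contain (abridged
     *             sketch; only the fields actually read by this class are
     *             shown):
     *
     *             <pre>{@code
     * { "d": { "results": [ {
     *       "Web": [ { "Description": "...", "Url": "..." }, ... ],
     *       "WebTotal": "...",
     *       "RelatedSearch": [ { "Title": "..." }, ... ]
     * } ] } }
     * }</pre>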
     */
    private Triple<Integer, Double, JSONObject> takeBingData(String query,
            List<Pair<String, Integer>> boldsAndRanks, List<String> urls,
            List<String> relatedSearch,
            List<Pair<String, Vector<Pair<Integer, Integer>>>> snippetsToBolds,
            int topk, boolean wikisearch) throws Exception {
        if (!boldsAndRanks.isEmpty())
            throw new RuntimeException("boldsAndRanks must be empty");
        if (!urls.isEmpty())
            throw new RuntimeException("urls must be empty");
        if (wikisearch)
            query += " wikipedia";
        JSONObject bingReply = queryBing(query, BING_RETRY);
        JSONObject data = (JSONObject) bingReply.get("d");
        JSONObject results = (JSONObject) ((JSONArray) data.get("results"))
                .get(0);
        JSONArray webResults = (JSONArray) results.get("Web");
        double webTotal = Double.parseDouble((String) results.get("WebTotal"));

        getBoldsAndUrls(webResults, topk, boldsAndRanks, urls, snippetsToBolds);

        if (relatedSearch != null) {
            JSONArray relatedSearchResults = (JSONArray) results
                    .get("RelatedSearch");
            for (int i = 0; i < relatedSearchResults.length(); i++) {
                JSONObject resI = (JSONObject) relatedSearchResults.get(i);
                String rsI = (String) resI.get("Title");
                relatedSearch.add(rsI);
            }
        }

        return new ImmutableTriple<Integer, Double, JSONObject>(
                webResults.length(), webTotal, bingReply);
    }

    /**
     * Turns a Wikipedia URL into the title of the Wikipedia page.
     *
     * @param encodedWikiUrl
     *            the encoded URL.
     * @return a Wikipedia title, or null if the url is not a Wikipedia page.
     */
    private static String decodeWikiUrl(String encodedWikiUrl) {
        if (!encodedWikiUrl.startsWith(WIKI_URL_LEADING))
            return null;
        try {
            String title = URLDecoder.decode(
                    encodedWikiUrl.substring(WIKI_URL_LEADING.length()),
                    "utf-8");
            if (!SmaphUtils.acceptWikipediaTitle(title))
                return null;
            return title;
        } catch (IllegalArgumentException | UnsupportedEncodingException e) {
            return null;
        }
    }

    /**
     * From the Bing results, extracts the bolds and the urls.
     *
     * @param webResults
     *            the web results returned by Bing.
     * @param topk
     *            limit the extraction to the first topk results.
     * @param boldsAndRanks
     *            storage for the bolds and their rank.
     * @param urls
     *            storage for the result URLs.
     * @param snippetsToBolds
     *            storage for the list of pairs &lt;snippet, the bolds found
     *            in that snippet&gt;.
     * @throws JSONException
     *             if the json returned by Bing could not be read.
     */
    private static void getBoldsAndUrls(JSONArray webResults, double topk,
            List<Pair<String, Integer>> boldsAndRanks, List<String> urls,
            List<Pair<String, Vector<Pair<Integer, Integer>>>> snippetsToBolds)
            throws JSONException {
        for (int i = 0; i < Math.min(webResults.length(), topk); i++) {
            String snippet = "";
            JSONObject resI = (JSONObject) webResults.get(i);
            String descI = (String) resI.get("Description");
            String url = (String) resI.get("Url");
            urls.add(url);

            // Bing delimits highlighted (bold) substrings with the
            // private-use characters U+E000 (start) and U+E001 (stop), i.e.
            // the UTF-8 byte sequences EE 80 80 and EE 80 81.
            String start = "\uE000";
            String stop = "\uE001";
            // Merge bolds that are separated by a single character.
            descI = descI.replaceAll(stop + "." + start, " ");
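            // Scan the description: for each delimited span, record the bold
            // and its rank, and rebuild the snippet without delimiters,
            // keeping track of where each bold occurs in it.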
            int startIdx = descI.indexOf(start);
            int stopIdx = descI.indexOf(stop, startIdx);
            int lastStop = -1;
            Vector<Pair<Integer, Integer>> boldPosInSnippet = new Vector<>();
            while (startIdx != -1 && stopIdx != -1) {
                String spot = descI.subSequence(startIdx + 1, stopIdx)
                        .toString();
                boldsAndRanks.add(new Pair<String, Integer>(spot, i));
                SmaphAnnotatorDebugger.out.printf("Rank:%d Bold:%s%n", i, spot);
                snippet += descI.substring(lastStop + 1, startIdx);
                boldPosInSnippet.add(new Pair<Integer, Integer>(snippet
                        .length(), spot.length()));
                snippet += spot;
                lastStop = stopIdx;
                startIdx = descI.indexOf(start, startIdx + 1);
                stopIdx = descI.indexOf(stop, startIdx + 1);
            }
            snippet += descI.substring(lastStop + 1);
            if (snippetsToBolds != null)
                snippetsToBolds
                        .add(new Pair<String, Vector<Pair<Integer, Integer>>>(
                                snippet, boldPosInSnippet));
        }
    }

    /**
     * Issues the query to Bing, returning the JSON object.
     *
     * @param query
     *            the query.
     * @param retryLeft
     *            how many retries are left (if zero, an empty object is
     *            returned in case of failure).
     * @return the JSON object as returned by the Bing API.
     * @throws Exception
     *             if the call to the API failed.
     */
    private synchronized JSONObject queryBing(String query, int retryLeft)
            throws Exception {
        boolean forceCacheOverride = retryLeft < BING_RETRY;
        if (forceCacheOverride)
            Thread.sleep(1000);
        String accountKeyAuth = Base64.getEncoder().encodeToString(
                (bingKey + ":" + bingKey).getBytes());
        URL url = new URL(
                "https://api.datamarket.azure.com/Bing/Search/v1/Composite?Sources=%27web%2Bspell%2BRelatedSearch%27&Query=%27"
                        + URLEncoder.encode(query, "utf8")
                        + "%27&Options=%27EnableHighlighting%27&Market=%27en-US%27&Adult=%27Off%27&$format=Json");

        JSONObject result = null;
        byte[] compressed = url2jsonCache.get(url.toExternalForm());
        if (compressed != null)
            result = new JSONObject(SmaphUtils.decompress(compressed));
        boolean cached = !forceCacheOverride && result != null;
        SmaphAnnotatorDebugger.out.printf("%s%s %s%n",
                forceCacheOverride ? "<forceCacheOverride>" : "",
                cached ? "<cached>" : "Querying", url);

        if (!cached) {
            HttpURLConnection connection = (HttpURLConnection) url
                    .openConnection();
            connection.setConnectTimeout(0);
            connection.setRequestProperty("Authorization", "Basic "
                    + accountKeyAuth);
            connection.setRequestProperty("Accept", "*/*");
            connection.setRequestProperty("Content-Type",
                    "multipart/form-data");
            connection.setUseCaches(false);

            if (connection.getResponseCode() != 200) {
                Scanner s = new Scanner(connection.getErrorStream())
                        .useDelimiter("\\A");
                System.err.printf("Got HTTP error %d. Message is: %s%n",
                        connection.getResponseCode(), s.next());
                s.close();
                throw new RuntimeException("Got response code:"
                        + connection.getResponseCode());
            }

            Scanner s = new Scanner(connection.getInputStream())
                    .useDelimiter("\\A");
            String resultStr = s.hasNext() ? s.next() : "";
            s.close();
            result = new JSONObject(resultStr);
            url2jsonCache.put(url.toExternalForm(),
                    SmaphUtils.compress(result.toString()));
            increaseFlushCounter();
        }

        if (recacheNeeded(result) && retryLeft > 0)
            return queryBing(query, retryLeft - 1);

        return result;
    }

    /**
     * Sets the file to which the cache of Bing responses is bound.
     *
     * @param cacheFilename
     *            the cache file name.
     * @throws FileNotFoundException
     *             if the file could not be opened for reading.
     * @throws IOException
     *             if something went wrong while reading the file.
     * @throws ClassNotFoundException
     *             if the file contained an object of the wrong class.
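     *             <p>
     *             Typical usage (the file name is hypothetical):
     *
     *             <pre>{@code
     * SmaphAnnotator.setCache("bing_responses.cache");
     * // ... issue queries: responses are served from / added to the cache ...
     * SmaphAnnotator.flush(); // persist newly cached responses
     * }</pre>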
     */
    @SuppressWarnings("unchecked")
    public static void setCache(String cacheFilename)
            throws FileNotFoundException, IOException, ClassNotFoundException {
        if (resultsCacheFilename != null
                && resultsCacheFilename.equals(cacheFilename))
            return;
        System.out.println("Loading bing cache...");
        resultsCacheFilename = cacheFilename;
        if (new File(resultsCacheFilename).exists()) {
            ObjectInputStream ois = new ObjectInputStream(new FileInputStream(
                    resultsCacheFilename));
            url2jsonCache = (HashMap<String, byte[]>) ois.readObject();
            ois.close();
        }
    }

    /**
     * Adds all records contained in the cache passed by argument to the
     * static cache, overwriting in case of conflicting keys.
     *
     * @param newCache
     *            the cache whose records are added.
     */
    public static void mergeCache(HashMap<String, byte[]> newCache) {
        for (String key : newCache.keySet()) {
            url2jsonCache.put(key, newCache.get(key));
            flushCounter++;
        }
    }

    /**
     * Clears the cache of Bing responses and calls the garbage collector.
     */
    public static void unSetCache() {
        url2jsonCache = new HashMap<>();
        System.gc();
    }

    /**
     * Given a list of urls, creates a mapping from the url position to the
     * Wikipedia page ID of that URL. If an url is not a Wikipedia url, no
     * mapping is added.
     *
     * @param urls
     *            a list of urls.
     * @return a mapping from position to Wikipedia page IDs.
     */
    private HashMap<Integer, Integer> urlsToRankID(List<String> urls) {
        HashMap<Integer, Integer> result = new HashMap<>();
        HashMap<Integer, String> rankToTitle = new HashMap<>();
        for (int i = 0; i < urls.size(); i++) {
            String title = decodeWikiUrl(urls.get(i));
            if (title != null)
                rankToTitle.put(i, title);
        }
        try {
            wikiApi.prefetchTitles(new Vector<String>(rankToTitle.values()));
        } catch (XPathExpressionException | IOException
                | ParserConfigurationException | SAXException e) {
            throw new RuntimeException(e);
        }
        for (int rank : rankToTitle.keySet()) {
            int wid;
            try {
                wid = wikiApi.getIdByTitle(rankToTitle.get(rank));
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            if (wid != -1) {
                result.put(rank, wid);
                SmaphAnnotatorDebugger.out.printf(
                        "Found Wikipedia url:%s rank:%d id:%d%n",
                        urls.get(rank), rank, wid);
            } else
                SmaphAnnotatorDebugger.out.printf(
                        "Discarding Wikipedia url:%s rank:%d id:%d%n",
                        urls.get(rank), rank, wid);
        }
        return result;
    }

    /**
     * Generates the Entity Selection features for an entity drawn from
     * Source 1 (Annotator).
     *
     * @param query
     *            the query that has been issued to Bing.
     * @param resultsCount
     *            the number of results contained in the Bing response.
     * @param ann
     *            the annotation from which the bold is extracted.
     * @param annInput
     *            the input that has been passed to the auxiliary annotator.
     * @param bingBolds
     *            the list of bolds spotted by Bing plus their position.
     * @param additionalInfo
     *            additional info returned by the annotator.
     * @return a mapping between feature names and their values.
     */
    private HashMap<String, Double> generateEntitySelectionFeaturesAnnotator(
            String query, int resultsCount, Annotation ann,
            Pair<String, HashSet<Mention>> annInput,
            List<Pair<String, Integer>> bingBolds,
            HashMap<String, HashMap<String, Double>> additionalInfo) {
        HashMap<String, Double> result = new HashMap<>();
        String bold = annInput.first.substring(ann.getPosition(),
                ann.getPosition() + ann.getLength());
        result.put("is_s1", 1.0);
        result.put("s1_freq",
                FrequencyBoldFilter.getFrequency(bingBolds, bold, resultsCount));
        result.put("s1_avgRank",
                RankWeightBoldFilter.getAvgRank(bingBolds, bold, resultsCount));
        result.put("s1_editDistance", SmaphUtils.getMinEditDist(query, bold));

        // Add additional info like rho, commonness, etc.
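        // Each per-bold score exposed by the auxiliary annotator is copied
        // with an "s1_" prefix, e.g. a "rho" entry becomes feature "s1_rho".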
        for (String key : additionalInfo.get(bold).keySet())
            result.put("s1_" + key, additionalInfo.get(bold).get(key));

        return result;
    }

    /**
     * Generates the Entity Selection features for an entity drawn from one of
     * the search-based sources (Normal Search, WikiSearch, RelatedSearch).
     *
     * @param query
     *            the query that has been issued to Bing.
     * @param wid
     *            the Wikipedia page ID of the entity.
     * @param rank
     *            the position in which the entity appeared in the Bing
     *            results.
     * @param webTotal
     *            total web results found by Bing for the normal search.
     * @param wikiWebTotal
     *            total web results found by Bing for the source-specific
     *            search (Wikipedia search or related search).
     * @param bingBoldsWS
     *            the list of bolds spotted by Bing for this source, plus
     *            their position.
     * @param source
     *            the source id (2 for Normal Search, 3 for WikiSearch, 5 for
     *            RelatedSearch).
     * @return a mapping between feature names and their values.
     */
    private HashMap<String, Double> generateEntitySelectionFeaturesSearch(
            String query, int wid, int rank, double webTotal,
            double wikiWebTotal, List<Pair<String, Integer>> bingBoldsWS,
            int source) {
        String sourceName = "s" + source;
        HashMap<String, Double> result = new HashMap<>();
        result.put("is_" + sourceName, 1.0);
        result.put(sourceName + "_rank", (double) rank);
        result.put(sourceName + "_webTotal", (double) webTotal);
        result.put(sourceName + "_wikiWebTotal", (double) wikiWebTotal);

        String title;
        try {
            title = wikiApi.getTitlebyId(wid);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        result.put(sourceName + "_editDistanceTitle",
                SmaphUtils.getMinEditDist(query, title));
        result.put(
                sourceName + "_editDistanceNoPar",
                SmaphUtils.getMinEditDist(query,
                        title.replaceAll(WIKITITLE_ENDPAR_REGEX, "")));

        // Aggregate features over the bolds appearing at this rank: minimum
        // edit distance to the query, count of capitalized bolds, and average
        // number of words per bold.
        double minEdDist = 1.0;
        double capitalized = 0;
        double avgNumWords = 0;
        int boldsCount = 0;
        for (Pair<String, Integer> p : bingBoldsWS)
            if (p.second == rank) {
                boldsCount++;
                minEdDist = Math.min(minEdDist,
                        SmaphUtils.getMinEditDist(query, p.first));
                if (Character.isUpperCase(p.first.charAt(0)))
                    capitalized++;
                avgNumWords += p.first.split("\\W+").length;
            }
        if (boldsCount != 0)
            avgNumWords /= boldsCount;
        result.put(sourceName + "_editDistanceBolds", minEdDist);
        result.put(sourceName + "_capitalizedBolds", capitalized);
        result.put(sourceName + "_avgBoldsWords", avgNumWords);

        return result;
    }

    /**
     * Given a query and its gold standard, generates the feature vectors of
     * the candidate entities, to be used as training examples for the entity
     * filter.
     *
     * @param query
     *            a query.
     * @param goldStandard
     *            the entities associated to the query.
     * @param posEFVectors
     *            where to store the positive-example (true positives) feature
     *            vectors.
     * @param negEFVectors
     *            where to store the negative-example (false positives)
     *            feature vectors.
     * @param discardNE
     *            whether to limit the output to named entities, as defined by
     *            ERDDatasetFilter.EntityIsNE.
     * @param wikiToFreeb
     *            a Wikipedia to Freebase-id mapping.
     * @throws Exception
     *             if something went wrong while annotating the query.
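     *             <p>
     *             Sketch of a training loop (the dataset variables are
     *             hypothetical):
     *
     *             <pre>{@code
     * Vector<double[]> pos = new Vector<>(), neg = new Vector<>();
     * for (int i = 0; i < queries.size(); i++)
     *     smaph.generateExamples(queries.get(i), goldStandards.get(i), pos,
     *             neg, false, wikiToFreebase);
     * }</pre>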
     */
    public void generateExamples(String query, HashSet<Tag> goldStandard,
            Vector<double[]> posEFVectors, Vector<double[]> negEFVectors,
            boolean discardNE, WikipediaToFreebase wikiToFreeb)
            throws Exception {

        /* Search the query on Bing. */
        List<Pair<String, Integer>> bingBoldsAndRankNS = null;
        List<String> urls = null;
        List<String> relatedSearchRes = null;
        Triple<Integer, Double, JSONObject> resCountAndWebTotal = null;
        int resultsCount = -1;
        double webTotalNS = Double.NaN;
        List<String> filteredBolds = null;
        HashMap<Integer, Integer> rankToIdNS = null;
        if (includeSourceAnnotator || includeSourceWikiSearch
                || includeSourceRelatedSearch || includeSourceNormalSearch) {
            bingBoldsAndRankNS = new Vector<>();
            urls = new Vector<>();
            relatedSearchRes = new Vector<>();
            resCountAndWebTotal = takeBingData(query, bingBoldsAndRankNS,
                    urls, relatedSearchRes, null, Integer.MAX_VALUE, false);
            resultsCount = resCountAndWebTotal.getLeft();
            webTotalNS = resCountAndWebTotal.getMiddle();
            filteredBolds = boldFilter.filterBolds(query, bingBoldsAndRankNS,
                    resultsCount);
            rankToIdNS = urlsToRankID(urls);
            if (debugger != null) {
                debugger.addBoldPositionEditDistance(query, bingBoldsAndRankNS);
                debugger.addBoldFilterOutput(query, filteredBolds);
                debugger.addSource2SearchResult(query, rankToIdNS, urls);
                debugger.addBingResponseNormalSearch(query,
                        resCountAndWebTotal.getRight());
            }
        }

        /* Do the Wikipedia-search on Bing. */
        List<String> wikiSearchUrls = new Vector<>();
        List<Pair<String, Integer>> bingBoldsAndRankWS = new Vector<>();
        HashMap<String, Pair<Integer, Integer>> annTitlesToIdAndRankWS = null;
        Triple<Integer, Double, JSONObject> resCountAndWebTotalWS = null;
        double webTotalWS = Double.NaN;
        if (includeSourceWikiSearch || includeSourceNormalSearch) {
            resCountAndWebTotalWS = takeBingData(query, bingBoldsAndRankWS,
                    wikiSearchUrls, null, null, topKWikiSearch, true);
            webTotalWS = resCountAndWebTotalWS.getMiddle();
            HashMap<Integer, Integer> rankToIdWikiSearch = urlsToRankID(wikiSearchUrls);
            if (debugger != null) {
                debugger.addSource3SearchResult(query, rankToIdWikiSearch,
                        wikiSearchUrls);
                debugger.addBingResponseWikiSearch(query,
                        resCountAndWebTotalWS.getRight());
            }
            annTitlesToIdAndRankWS = adjustTitles(rankToIdWikiSearch);
        }

        /* Do the related-search on Bing. */
        String relatedSearch = null;
        List<String> relatedSearchUrls = null;
        List<Pair<String, Integer>> bingBoldsAndRankRS = null;
        HashMap<Integer, Integer> rankToIdRelatedSearch = null;
        HashMap<String, Pair<Integer, Integer>> annTitlesToIdAndRankRS = null;
        double webTotalRelatedSearch = Double.NaN;
        if (includeSourceRelatedSearch) {
            relatedSearch = getRelatedSearch(relatedSearchRes, query);
            relatedSearchUrls = new Vector<>();
            bingBoldsAndRankRS = new Vector<>();
            Triple<Integer, Double, JSONObject> resCountAndWebTotalRS = takeBingData(
                    query, bingBoldsAndRankRS, relatedSearchUrls, null, null,
                    topKRelatedSearch, false);
            webTotalRelatedSearch = resCountAndWebTotalRS.getMiddle();
            rankToIdRelatedSearch = urlsToRankID(relatedSearchUrls);
            annTitlesToIdAndRankRS = adjustTitles(rankToIdRelatedSearch);
        }

        /* Annotate bolds on the annotator. */
        Pair<HashMap<String, HashMap<String, Double>>, HashMap<String, Annotation>> infoAndAnnotations = null;
        HashMap<String, Annotation> spotToAnnotation = null;
        HashMap<String, HashMap<String, Double>> additionalInfo = null;
        Pair<String, HashSet<Mention>> annInput = null;
        if (includeSourceAnnotator) {
            annInput = concatenateBolds(filteredBolds);
            infoAndAnnotations = disambiguateBolds(annInput.first,
                    annInput.second);
            spotToAnnotation = infoAndAnnotations.second;
            additionalInfo = infoAndAnnotations.first;
            if (debugger != null)
                debugger.addReturnedAnnotation(query, spotToAnnotation);
        }

        List<Pair<Tag, HashMap<String, Double>>> widToEFFtrVect = new Vector<>();

        // Add feature vectors of the annotations found by the disambiguator
        if (includeSourceAnnotator) {
            for (String bold : filteredBolds) {
                if (spotToAnnotation.containsKey(bold)) {
                    Annotation ann = spotToAnnotation.get(bold);
                    HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesAnnotator(
                            query, resultsCount, ann, annInput,
                            bingBoldsAndRankNS, additionalInfo);
                    Tag tag = new Tag(ann.getConcept());
                    widToEFFtrVect.add(new Pair<Tag, HashMap<String, Double>>(
                            tag, ESFeatures));
                }
            }
        }

        // Add feature vectors of the entities found in the normal search
        if (includeSourceNormalSearch) {
            for (int rank : rankToIdNS.keySet()) {
                int wid = rankToIdNS.get(rank);
                HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesSearch(
                        query, wid, rank, webTotalNS, webTotalWS,
                        bingBoldsAndRankNS, 2);
                Tag tag = new Tag(wid);
                widToEFFtrVect.add(new Pair<Tag, HashMap<String, Double>>(tag,
                        ESFeatures));
            }
        }

        // Add feature vectors of the entities found in the Wikipedia-search
        if (includeSourceWikiSearch) {
            for (String annotatedTitleWS : annTitlesToIdAndRankWS.keySet()) {
                int wid = annTitlesToIdAndRankWS.get(annotatedTitleWS).first;
                int rank = annTitlesToIdAndRankWS.get(annotatedTitleWS).second;
                HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesSearch(
                        query, wid, rank, webTotalNS, webTotalWS,
                        bingBoldsAndRankWS, 3);
                Tag tag = new Tag(wid);
                widToEFFtrVect.add(new Pair<Tag, HashMap<String, Double>>(tag,
                        ESFeatures));
            }
        }

        // Add feature vectors of the entities found in the related-search
        if (includeSourceRelatedSearch) {
            for (String annotatedTitleRS : annTitlesToIdAndRankRS.keySet()) {
                int wid = annTitlesToIdAndRankRS.get(annotatedTitleRS).first;
                int rank = annTitlesToIdAndRankRS.get(annotatedTitleRS).second;
                HashMap<String, Double> ESFeatures = generateEntitySelectionFeaturesSearch(
                        relatedSearch, wid, rank, webTotalNS,
                        webTotalRelatedSearch, bingBoldsAndRankRS, 5);
                Tag tag = new Tag(wid);
                widToEFFtrVect.add(new Pair<Tag, HashMap<String, Double>>(tag,
                        ESFeatures));
            }
        }

        // Split the feature vectors into positive and negative examples,
        // according to the gold standard.
        for (Pair<Tag, HashMap<String, Double>> tagAndFtrs : widToEFFtrVect) {
            Tag tag = tagAndFtrs.first;
            HashMap<String, Double> ftrs = tagAndFtrs.second;
            if (discardNE
                    && !ERDDatasetFilter.EntityIsNE(wikiApi, wikiToFreeb,
                            tag.getConcept()))
                continue;
            if (goldStandard.contains(tag))
                posEFVectors.add(LibSvmEntityFilter
                        .featuresToFtrVectStatic(ftrs));
            else
                negEFVectors.add(LibSvmEntityFilter
                        .featuresToFtrVectStatic(ftrs));
            System.out.printf("%d in query [%s] is a %s example.%n",
                    tag.getConcept(), query,
                    goldStandard.contains(tag) ? "positive" : "negative");
        }
    }
}