package com.personalityextractor.entity.resolver; import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.StringReader; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.w3c.dom.Document; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import tathya.db.YahooBOSS; import com.personalityextractor.data.source.Wikiminer; import com.personalityextractor.entity.Entity; import com.personalityextractor.entity.WikipediaEntity; import cs224n.util.PriorityQueue; /** * @author akishore * */ public class WikiMinerEntityResolver extends BaseEntityResolver { public boolean breakTies(String bigger_entity_id, String smaller_entity){ String xml = null; if((xml = Wikiminer.getXML(bigger_entity_id, true)) != null) { ArrayList<String> links = getLinks(xml); for(String s : links){ if(smaller_entity.equalsIgnoreCase(s)){ return true; } } } return false; } /* * a basic method to resolve ambiguity between entities one of which is a *bigger entity* of all the other. * some example cases: 'New' or 'New York'?, 'short' or 'short story', '' */ private ArrayList<String> removeExtraneousEntities(ArrayList<String> entities){ ArrayList<String> extraneous = new ArrayList<String>(); HashMap<String, ArrayList<String>> sortedEntities = new HashMap<String, ArrayList<String>>(); for(String entity : entities){ String[] split = entity.split("\\s+"); String split_length = Integer.toString(split.length); if(sortedEntities.containsKey(split_length)){ sortedEntities.get(split_length).add(entity); } else{ ArrayList<String> arr = new ArrayList<String>(); arr.add(entity); sortedEntities.put(split_length, arr); } } Object[] keys = sortedEntities.keySet().toArray(); Arrays.sort(keys); for(int i = keys.length-1; i >=0; i--){ ArrayList<String> arr = sortedEntities.get(keys[i]); for(String entity : arr){ if(extraneous.contains(entity)) continue; String xml = ""; if((xml = Wikiminer.getXML(entity, false)) != null) { String id = getHighestSenseID(xml); if(id==null){ continue; } /* * the higher entity has passed the test- so remove all smaller entities which are not present in the links for this bigger entity */ for(int j=i-1; j >=0; j--){ ArrayList<String> arr_smaller = sortedEntities.get(keys[j]); for(String entity_small: arr_smaller){ if(!breakTies(id, entity_small)){ extraneous.add(entity_small); } } } } } } for(String s: extraneous){ entities.remove(s); } return entities; } private String getHighestSenseID(String xml){ DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = null; try { db = dbf.newDocumentBuilder(); } catch (Exception e) { e.printStackTrace(); } try { InputSource is = new InputSource(); is.setCharacterStream(new StringReader(xml)); // System.out.println(xml); Document dom = db.parse(is); NodeList senseNodes = dom.getElementsByTagName("Sense"); Node topSense = senseNodes.item(0); if(topSense != null) { NamedNodeMap attrs = topSense.getAttributes(); Node commonness = attrs.getNamedItem("commonness"); try { double relevance = Double.parseDouble(commonness.getTextContent()); if(relevance >= 0.70) { return attrs.getNamedItem("id").getTextContent(); } } catch (Exception e) { e.printStackTrace(); } } else { NodeList articleNodes = dom.getElementsByTagName("Article"); if(articleNodes != null && articleNodes.item(0) != null) { NamedNodeMap attrs = articleNodes.item(0).getAttributes(); return attrs.getNamedItem("id").getTextContent(); } } } catch (Exception e) { e.printStackTrace(); } return null; } private boolean checkThreshold(String xml){ DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = null; try { db = dbf.newDocumentBuilder(); } catch (Exception e) { return false; } try { InputSource is = new InputSource(); is.setCharacterStream(new StringReader(xml)); Document dom = db.parse(is); NodeList senseNodes = dom.getElementsByTagName("Sense"); Node topSense = senseNodes.item(0); if(topSense != null) { NamedNodeMap attrs = topSense.getAttributes(); Node commonness = attrs.getNamedItem("commonness"); try { double relevance = Double.parseDouble(commonness.getTextContent()); if(relevance >= 0.70) { return true; } } catch (Exception e) { e.printStackTrace(); } } else { NodeList articleNodes = dom.getElementsByTagName("Article"); if(articleNodes != null && articleNodes.item(0) != null) { return true; } } } catch (Exception e) { e.printStackTrace(); } return false; } private ArrayList<String> getLinks(String xml){ ArrayList<String> links = new ArrayList<String>(); DocumentBuilder db = null; DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); try { db = dbf.newDocumentBuilder(); } catch (Exception e) { e.printStackTrace(); } InputSource is = new InputSource(); is.setCharacterStream(new StringReader(xml)); try { Document dom = db.parse(is); NodeList outNodes = dom.getElementsByTagName("LinkOut"); if (outNodes != null && outNodes.getLength() != 0) { for (int i = 0; i < outNodes.getLength(); i++) { Node link = outNodes.item(i); if (link != null) { NamedNodeMap attrs = link.getAttributes(); Node commonness = attrs.getNamedItem("relatedness"); double relevance = Double.parseDouble(commonness .getTextContent()); if (relevance >= 0.1) { links.add(attrs.getNamedItem("title") .getTextContent()); } } } } NodeList inNodes = dom.getElementsByTagName("LinkIn"); if (inNodes != null && inNodes.getLength() != 0) { for (int i = 0; i < inNodes.getLength(); i++) { Node link = inNodes.item(i); if (link != null) { NamedNodeMap attrs = link.getAttributes(); Node commonness = attrs.getNamedItem("relatedness"); double relevance = Double.parseDouble(commonness .getTextContent()); if (relevance >= 0.1) { links.add(attrs.getNamedItem("title") .getTextContent()); } } } } } catch (Exception e) { e.printStackTrace(); } return links; } private static ArrayList<String> extractEntities(ArrayList<String> words) { ArrayList<String> entities = new ArrayList<String>(); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = null; try { db = dbf.newDocumentBuilder(); } catch (Exception e) { return entities; } for (int i = 0; i < words.size(); i++) { String entity = words.get(i).trim(); String xml = null; if ((xml = Wikiminer.getXML(entity, false)) != null) { try { InputSource is = new InputSource(); is.setCharacterStream(new StringReader(xml)); Document dom = db.parse(is); NodeList senseNodes = dom.getElementsByTagName("Sense"); Node topSense = senseNodes.item(0); if (topSense != null) { NamedNodeMap attrs = topSense.getAttributes(); Node commonness = attrs.getNamedItem("commonness"); try { double relevance = Double.parseDouble(commonness.getTextContent()); if (relevance >= 0.70) { entities.add(entity); } } catch (Exception e) { } } else { NodeList articleNodes = dom.getElementsByTagName("Article"); if (articleNodes != null && articleNodes.item(0) != null) { entities.add(entity); } } } catch (Exception e) { e.printStackTrace(); } } } return entities; } public static List<WikipediaEntity> getRankedEntities(String entity, List<String> contextPhrases){ List<WikipediaEntity> rankedEntities = new ArrayList<WikipediaEntity>(); PriorityQueue<WikipediaEntity> queue = new PriorityQueue<WikipediaEntity>(); contextPhrases.remove(entity); StringBuffer contextQuery = new StringBuffer(); for (String c : contextPhrases) { contextQuery.append("\"" + c + "\"" + " "); } int contextCount = YahooBOSS.makeQuery(contextQuery.toString()); String xml = ""; if((xml=Wikiminer.getXML(entity, false))!=null){ ArrayList<String[]> senses = Wikiminer.getWikipediaSenses(xml, true); for(String[] senseArr : senses){ int senseCount = YahooBOSS.makeQuery('"' + senseArr[0] + "\" "+contextQuery.toString()); WikipediaEntity we = new WikipediaEntity(senseArr[0],senseArr[1], -1); queue.add(we, ((double) senseCount / (double) contextCount)); } } while(queue.hasNext()){ rankedEntities.add(queue.next()); } return rankedEntities; } public static List<String> getRankedTypes(String entity, String xml, List<String> contextPhrases, int numTypes){ int entityCount = YahooBOSS.makeQuery('"' + entity + '"'); StringBuffer contextQuery = new StringBuffer(); for (String c : contextPhrases) { contextQuery.append("\"" + c + "\"" + " "); } List<String> rankedCategories = new ArrayList<String>(); PriorityQueue<String> queue = new PriorityQueue<String>(); DocumentBuilder db = null; DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); try { db = dbf.newDocumentBuilder(); } catch (Exception e) { e.printStackTrace(); } InputSource is = new InputSource(); is.setCharacterStream(new StringReader(xml)); try { Document dom = db.parse(is); NodeList senseNodes = dom.getElementsByTagName("Category"); if (senseNodes != null && senseNodes.getLength() != 0) { for (int i = 0; i < senseNodes.getLength(); i++) { Node topSense = senseNodes.item(i); if(topSense!=null){ NamedNodeMap attrs = topSense.getAttributes(); String type = attrs.getNamedItem("title").getTextContent(); int count = YahooBOSS.makeQuery("\""+type+"\" \""+entity+"\" "+contextPhrases.toString()); queue.add(type, ((double)count/(double)entityCount)); } } } while(queue.hasNext() && numTypes > 0 ){ numTypes--; rankedCategories.add(queue.next()); } } catch (Exception e) { e.printStackTrace(); } return rankedCategories; } public List<WikipediaEntity> resolve(List<String> entities) { ArrayList<WikipediaEntity> resolvedEntities = new ArrayList<WikipediaEntity>(); List<String> contextPhrases = new ArrayList<String>(entities); for(String entity : entities){ List<WikipediaEntity> rankedEntities = getRankedEntities(entity, contextPhrases); for(WikipediaEntity we : rankedEntities){ String xml = getXML(we.getWikiminerID(), true); for(String s: getRankedTypes(we.getText(), xml, contextPhrases, 5)){ we.addCategory(s); } resolvedEntities.add(we); } } return resolvedEntities; } private static String getXML(String query, boolean isId) { try { String urlStr = "http://wdm.cs.waikato.ac.nz:8080/service?task=search&xml"; if(isId) { urlStr += "&id=" + query; } else { urlStr += "&term=" + query; } URL url = new URL(urlStr); URLConnection yc = url.openConnection(); BufferedReader in = new BufferedReader( new InputStreamReader( yc.getInputStream())); String inputLine; StringBuffer buf = new StringBuffer(); while ((inputLine = in.readLine()) != null) buf.append(inputLine); in.close(); if(buf.toString().contains("unknownTerm")) { return null; } return buf.toString(); } catch(Exception e) { e.printStackTrace(); } return null; } /** * @param args */ public static void main(String[] args) { // WikiMinerEntityResolver pme = new WikiMinerEntityResolver(EntityExtractFactory.produceExtractor(Extracter.CONSECUTIVE_WORDS)); // ArrayList<String> entities = new ArrayList<String>(); // entities.add("York City"); // entities.add("City"); // entities.add("New York City"); // entities.add("New York"); // System.out.println(pme.removeExtraneousEntities(entities)); // // entities = new ArrayList<String>(); // entities.add("short story"); // entities.add("story"); // entities.add("short"); // System.out.println(pme.removeExtraneousEntities(entities)); // ArrayList<String> entities = pme.extract("Will soon be en route New York to Frankfurt."); // for(String e : entities) { // System.out.println(e); // } // for(WikipediaEntity we: resolveEntities(entities)){ // we.print(); // } // pme.extract("Will soon be en route amman to Frankfurt."); // pme.extract("New model of the universe fits data better than Big Bang"); //pme.extract("RT @LanceWeiler: Hubs & Connectors: Understanding Networks Through Data Visualization http://bit.ly/eegRqT HT @JeffClark"); //System.out.println(pme.extract("About to embark on the unthinkable... Driving to New York City. Wish me luck.")); // try{ // BufferedReader br = new BufferedReader(new FileReader(args[0])); // String line = ""; // while((line=br.readLine())!=null){ // line=line.replace("<E>", ""); // line=line.replace("</E>", ""); // System.out.println(line+"\n"+pme.extract(line)); // } // // }catch(Exception e){ // e.printStackTrace(); // } } }