package com.personalityextractor.data.source;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import com.personalityextractor.entity.WikipediaEntity;
import tathya.db.YahooBOSS;
import cs224n.util.Counter;
import cs224n.util.CounterMap;
import cs224n.util.PriorityQueue;
public class Wikiminer {
// Process-wide memo of service responses, keyed by the full request URL.
// Values are the raw XML text of a response or, for compareIds, the
// relatedness score stored as a string ("0" when no score was obtained).
// Entries are never evicted by the code in this class.
static HashMap<String, String> cache = new HashMap<String, String>();
// Memo of getRelativeBestSenses results, keyed by "compare:<word1>:<word2>"
// (the words after '#' removal, in the order they were passed).
static HashMap<String, HashMap<String, WikipediaEntity>> cacheRelativeBestSenses = new HashMap<String, HashMap<String, WikipediaEntity>>();
// caution ahead: mix up the order of the args at your own peril
/**
 * Computes a Jaccard-style overlap score between an article's links and a
 * set of context phrases.
 *
 * <p>The two arguments are NOT interchangeable: a context phrase counts as
 * overlapping when it occurs as a substring of the space-joined
 * {@code links} text, not when it equals one of the links. The denominator
 * is the number of distinct strings across both inputs.
 *
 * @param links          link titles (or ids) of the article
 * @param contextPhrases phrases to look for inside the joined link text;
 *                       duplicates are counted once
 * @return overlap count divided by the size of the union, or {@code 0.0}
 *         when both inputs are empty (previously this produced NaN)
 */
public static double calculateJaccard(List<String> links,
        List<String> contextPhrases) {
    HashSet<String> union = new HashSet<String>(links);
    HashSet<String> cphrases = new HashSet<String>(contextPhrases);

    // Join all links into one searchable string, separated by single spaces.
    StringBuilder joined = new StringBuilder();
    for (String s : links) {
        joined.append(s).append(' ');
    }
    String haystack = joined.toString().trim();

    double overlap = 0.0;
    for (String s : cphrases) {
        if (haystack.contains(s)) {
            overlap++;
        }
        union.add(s);
    }

    // Guard the 0/0 case so callers never receive NaN.
    if (union.isEmpty()) {
        return 0.0;
    }
    return overlap / union.size();
}
/**
 * Computes the Jaccard similarity between the link sets of two articles.
 *
 * <p>Links are obtained through {@code getLinks}, which may list the same
 * id more than once (it appends LinkOut and LinkIn entries to one list).
 * Both lists are therefore reduced to sets first, so the result always
 * lies in [0, 1].
 *
 * @param id1 Wikipedia Miner id of the first article
 * @param id2 Wikipedia Miner id of the second article
 * @return |links1 ∩ links2| / |links1 ∪ links2|, or {@code 0.0} when
 *         neither article has any links
 */
public static double compareArticlesWithJaccard(String id1, String id2) {
    HashSet<String> first = new HashSet<String>(getLinks(id1));
    HashSet<String> second = new HashSet<String>(getLinks(id2));

    HashSet<String> union = new HashSet<String>(first);
    union.addAll(second);
    if (union.isEmpty()) {
        // Avoid 0/0 = NaN when there is nothing to compare.
        return 0.0;
    }

    HashSet<String> shared = new HashSet<String>(first);
    shared.retainAll(second);
    return (double) shared.size() / union.size();
}
/**
 * Returns the ids of articles linked from or to the given article whose
 * "relatedness" attribute is at least 0.1.
 *
 * <p>The XML is fetched with {@code getXML(wikiminer_id, true)}. When that
 * call yields no document, an empty list is returned (previously this
 * caused a NullPointerException outside the try block).
 *
 * @param wikiminer_id Wikipedia Miner article id
 * @return ids from LinkOut elements followed by ids from LinkIn elements;
 *         may contain duplicates; never {@code null}. If parsing fails
 *         part-way, the ids collected so far are returned.
 */
public static ArrayList<String> getLinks(String wikiminer_id) {
    ArrayList<String> links = new ArrayList<String>();
    String xml = getXML(wikiminer_id, true);
    if (xml == null) {
        return links;
    }
    try {
        DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document dom = db.parse(new InputSource(new StringReader(xml)));
        collectRelatedLinkIds(dom, "LinkOut", links);
        collectRelatedLinkIds(dom, "LinkIn", links);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return links;
}

// Appends the "id" of every <tagName> element whose "relatedness" is >= 0.1 to sink.
private static void collectRelatedLinkIds(Document dom, String tagName, List<String> sink) {
    NodeList nodes = dom.getElementsByTagName(tagName);
    for (int i = 0; i < nodes.getLength(); i++) {
        Node link = nodes.item(i);
        if (link == null) {
            continue;
        }
        NamedNodeMap attrs = link.getAttributes();
        double relatedness = Double.parseDouble(
                attrs.getNamedItem("relatedness").getTextContent());
        if (relatedness >= 0.1) {
            sink.add(attrs.getNamedItem("id").getTextContent());
        }
    }
}
/**
 * Fetches the exploreArticle response (with parent categories) for an
 * article id, using the shared URL-keyed cache.
 *
 * @param wikiminer_id Wikipedia Miner article id, appended to the URL as-is
 * @return the response text with line breaks removed, or {@code null} when
 *         the request fails or the response does not contain the text
 *         "ParentCategory" (such responses are not cached)
 */
public static String getexploreArticleXML(String wikiminer_id) {
    // Alternative endpoint kept for reference:
    // http://ec2-50-19-209-97.compute-1.amazonaws.com:8080/wikipediaminer/services/exploreArticle?parentCategories=true
    String urlStr = "http://wikipedia-miner.cms.waikato.ac.nz/services/exploreArticle?parentCategories=true"
            + "&id=" + wikiminer_id;

    // return from cache
    if (cache.containsKey(urlStr)) {
        return cache.get(urlStr);
    }

    BufferedReader in = null;
    try {
        URLConnection yc = new URL(urlStr).openConnection();
        // Decode explicitly instead of relying on the platform default charset.
        // NOTE(review): assumes the service answers in UTF-8 - confirm.
        in = new BufferedReader(new InputStreamReader(yc.getInputStream(), "UTF-8"));
        StringBuilder buf = new StringBuilder();
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            buf.append(inputLine);
        }
        String xml = buf.toString();
        if (!xml.contains("ParentCategory")) {
            return null;
        }
        cache.put(urlStr, xml);
        return xml;
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Always release the connection's stream, including on read errors.
        if (in != null) {
            try {
                in.close();
            } catch (Exception ignored) {
                // best-effort close; nothing useful to do here
            }
        }
    }
    return null;
}
/**
 * Fetches the exploreCategory response (with parent categories) for a
 * category id, using the shared URL-keyed cache.
 *
 * @param wikiminer_id Wikipedia Miner category id, appended to the URL as-is
 * @return the response text with line breaks removed, or {@code null} when
 *         the request fails or the response does not contain the text
 *         "ParentCategory" (such responses are not cached)
 */
public static String getexploreCategoryXML(String wikiminer_id) {
    // Alternative endpoint kept for reference:
    // http://ec2-50-19-209-97.compute-1.amazonaws.com:8080/wikipediaminer/services/exploreCategory?parentCategories=true
    String urlStr = "http://wikipedia-miner.cms.waikato.ac.nz/services/exploreCategory?parentCategories=true"
            + "&id=" + wikiminer_id;

    // return from cache
    if (cache.containsKey(urlStr)) {
        return cache.get(urlStr);
    }

    BufferedReader in = null;
    try {
        URLConnection yc = new URL(urlStr).openConnection();
        // Decode explicitly instead of relying on the platform default charset.
        // NOTE(review): assumes the service answers in UTF-8 - confirm.
        in = new BufferedReader(new InputStreamReader(yc.getInputStream(), "UTF-8"));
        StringBuilder buf = new StringBuilder();
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            buf.append(inputLine);
        }
        String xml = buf.toString();
        if (!xml.contains("ParentCategory")) {
            return null;
        }
        cache.put(urlStr, xml);
        return xml;
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Always release the connection's stream, including on read errors.
        if (in != null) {
            try {
                in.close();
            } catch (Exception ignored) {
                // best-effort close; nothing useful to do here
            }
        }
    }
    return null;
}
/**
 * Lists the ParentCategory entries found in the exploreArticle response
 * for the given article id.
 *
 * @param wikiminer_id Wikipedia Miner article id
 * @return one WikipediaEntity per ParentCategory element, built from its
 *         "title" and "id" attributes (third constructor argument is 1);
 *         empty when no response is available. If parsing fails part-way,
 *         the entities collected so far are returned.
 */
public static ArrayList<WikipediaEntity> getCategories(String wikiminer_id) {
    String articleXml = getexploreArticleXML(wikiminer_id);
    ArrayList<WikipediaEntity> result = new ArrayList<WikipediaEntity>();
    if (articleXml == null) {
        return result;
    }

    DocumentBuilder builder = null;
    try {
        builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    } catch (Exception e) {
        e.printStackTrace();
    }

    InputSource source = new InputSource();
    source.setCharacterStream(new StringReader(articleXml));

    try {
        NodeList parents = builder.parse(source).getElementsByTagName("ParentCategory");
        int total = (parents == null) ? 0 : parents.getLength();
        for (int idx = 0; idx < total; idx++) {
            Node parent = parents.item(idx);
            if (parent == null) {
                continue;
            }
            NamedNodeMap attributes = parent.getAttributes();
            String categoryId = attributes.getNamedItem("id").getTextContent();
            String categoryTitle = attributes.getNamedItem("title").getTextContent();
            result.add(new WikipediaEntity(categoryTitle, categoryId, 1));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result;
}
/**
 * Lists the ParentCategory entries found in the exploreCategory response
 * for the given category id.
 *
 * @param wikiminer_id Wikipedia Miner category id
 * @return one WikipediaEntity per ParentCategory element, built from its
 *         "title" and "id" attributes (third constructor argument is 1);
 *         empty when no response is available. If parsing fails part-way,
 *         the entities collected so far are returned.
 */
public static ArrayList<WikipediaEntity> getParentCategories(String wikiminer_id) {
    String categoryXml = getexploreCategoryXML(wikiminer_id);
    ArrayList<WikipediaEntity> parentsFound = new ArrayList<WikipediaEntity>();
    if (categoryXml == null) {
        return parentsFound;
    }

    DocumentBuilder parser = null;
    try {
        parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    } catch (Exception e) {
        e.printStackTrace();
    }

    InputSource input = new InputSource();
    input.setCharacterStream(new StringReader(categoryXml));

    try {
        Document parsed = parser.parse(input);
        NodeList nodes = parsed.getElementsByTagName("ParentCategory");
        if (nodes == null) {
            return parentsFound;
        }
        int position = 0;
        while (position < nodes.getLength()) {
            Node current = nodes.item(position);
            position++;
            if (current == null) {
                continue;
            }
            NamedNodeMap attributes = current.getAttributes();
            String parentId = attributes.getNamedItem("id").getTextContent();
            String parentTitle = attributes.getNamedItem("title").getTextContent();
            parentsFound.add(new WikipediaEntity(parentTitle, parentId, 1));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return parentsFound;
}
/**
 * Scores a set of context phrases against the link titles found in an
 * article's XML.
 *
 * <p>Titles of LinkOut and LinkIn elements whose "relatedness" attribute
 * is at least 0.1 are collected and handed to {@code calculateJaccard}
 * together with {@code contextPhrases}.
 *
 * @param xml            XML text containing LinkOut / LinkIn elements;
 *                       {@code null} yields 0.0 (previously this threw a
 *                       NullPointerException)
 * @param contextPhrases phrases to look for among the link titles
 * @return the score from {@code calculateJaccard}, or {@code 0.0} when the
 *         XML is missing or cannot be processed
 */
public static double getJaccard(String xml, List<String> contextPhrases) {
    if (xml == null) {
        return 0.0;
    }
    ArrayList<String> links = new ArrayList<String>();
    try {
        DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document dom = db.parse(new InputSource(new StringReader(xml)));
        collectRelatedLinkTitles(dom, "LinkOut", links);
        collectRelatedLinkTitles(dom, "LinkIn", links);
        return calculateJaccard(links, contextPhrases);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return 0.0;
}

// Appends the "title" of every <tagName> element whose "relatedness" is >= 0.1 to sink.
private static void collectRelatedLinkTitles(Document dom, String tagName, List<String> sink) {
    NodeList nodes = dom.getElementsByTagName(tagName);
    for (int i = 0; i < nodes.getLength(); i++) {
        Node link = nodes.item(i);
        if (link == null) {
            continue;
        }
        NamedNodeMap attrs = link.getAttributes();
        double relatedness = Double.parseDouble(
                attrs.getNamedItem("relatedness").getTextContent());
        if (relatedness >= 0.1) {
            sink.add(attrs.getNamedItem("title").getTextContent());
        }
    }
}
// get the category nodes inside the xml
/**
 * Ranks the Category titles found in {@code xml} by how often each one
 * co-occurs with the entity and its context, relative to how often the
 * entity occurs on its own, using YahooBOSS hit counts.
 *
 * <p>Fixes relative to the earlier version: the quoted context query that
 * was built but never used is now the one sent with each category query
 * (previously the raw {@code List.toString()} form such as "[a, b]" was
 * sent); a zero entity hit count no longer produces Infinity/NaN
 * priorities; and a {@code null} document returns an empty list instead
 * of throwing.
 *
 * @param entity         entity text, queried as an exact phrase
 * @param xml            XML text containing Category elements with a
 *                       "title" attribute
 * @param contextPhrases phrases added to each category query, each quoted
 * @param numTypes       maximum number of category titles to return
 * @return up to {@code numTypes} titles in the order the priority queue
 *         yields them; empty when the XML is missing or unparsable
 */
public static List<String> getRankedTypes(String entity, String xml,
        List<String> contextPhrases, int numTypes) {
    List<String> rankedCategories = new ArrayList<String>();
    if (xml == null) {
        return rankedCategories;
    }

    int entityCount = YahooBOSS.makeQuery("\"" + entity + "\"");

    // Build the context part once: "phrase1" "phrase2" ...
    StringBuilder contextQuery = new StringBuilder();
    for (String c : contextPhrases) {
        contextQuery.append('"').append(c).append("\" ");
    }
    String quotedContext = contextQuery.toString().trim();

    PriorityQueue<String> queue = new PriorityQueue<String>();
    try {
        DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document dom = db.parse(new InputSource(new StringReader(xml)));
        NodeList categoryNodes = dom.getElementsByTagName("Category");
        for (int i = 0; i < categoryNodes.getLength(); i++) {
            Node category = categoryNodes.item(i);
            if (category == null) {
                continue;
            }
            String type = category.getAttributes().getNamedItem("title")
                    .getTextContent();
            String query = ("\"" + type + "\" \"" + entity + "\" " + quotedContext).trim();
            int count = YahooBOSS.makeQuery(query);
            // Without any hits for the entity itself the ratio is undefined; rank as 0.
            double score = (entityCount > 0) ? ((double) count / (double) entityCount) : 0.0;
            queue.add(type, score);
        }
        while (queue.hasNext() && numTypes > 0) {
            numTypes--;
            rankedCategories.add(queue.next());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return rankedCategories;
}
/**
 * Resolves a query to a single WikipediaEntity when the search response
 * offers a clearly dominant sense.
 *
 * <p>If the response has a first "Sense" element, it is returned only when
 * its "priorProbability" attribute is at least 0.70. If there is no
 * "Sense" element, the first "Article" element is returned instead.
 *
 * <p>NOTE(review): this method looks up "Sense" (capital S) while
 * {@code getXML} and the other parsers in this class look for "sense".
 * DOM tag lookup is case-sensitive, so one of the two spellings may never
 * match - confirm against an actual service response.
 *
 * @param query search text passed to {@code getXML(query, false)}
 * @return the chosen entity (third constructor argument is 1), or
 *         {@code null} when no response is available, the top sense is
 *         below the threshold or malformed, or parsing fails
 */
public static WikipediaEntity getHighestSenseEntity(String query){
    String xml = getXML(query, false);
    if (xml == null) {
        // No response: return directly rather than failing on a null reader.
        return null;
    }
    try {
        DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document dom = db.parse(new InputSource(new StringReader(xml)));

        Node topSense = dom.getElementsByTagName("Sense").item(0);
        if (topSense != null) {
            NamedNodeMap attrs = topSense.getAttributes();
            Node commonness = attrs.getNamedItem("priorProbability");
            if (commonness == null) {
                return null;
            }
            double relevance = Double.parseDouble(commonness.getTextContent());
            if (relevance >= 0.70) {
                return new WikipediaEntity(
                        attrs.getNamedItem("title").getTextContent(),
                        attrs.getNamedItem("id").getTextContent(), 1);
            }
            return null;
        }

        // No sense list at all: fall back to the first article, if any.
        Node article = dom.getElementsByTagName("Article").item(0);
        if (article != null) {
            NamedNodeMap attrs = article.getAttributes();
            return new WikipediaEntity(
                    attrs.getNamedItem("title").getTextContent(),
                    attrs.getNamedItem("id").getTextContent(), 1);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
/**
 * Extracts the candidate senses from a search response as WikipediaEntity
 * objects.
 *
 * <p>For each "sense" element:
 * <ul>
 *   <li>with a "priorProbability" attribute of at least 0.01, an entity is
 *       created carrying that value;</li>
 *   <li>without the attribute, an entity is created only when the document
 *       text marks a disambiguation page, with the score 1 / (number of
 *       sense elements);</li>
 *   <li>otherwise the sense is skipped. Previously one such sense aborted
 *       the loop and silently dropped every sense after it.</li>
 * </ul>
 * When there are no "sense" elements, the first "Article" element (if any)
 * is returned with score "1.0".
 *
 * @param xml   search response text; {@code null} yields an empty list
 *              (previously this threw a NullPointerException)
 * @param getId unused; kept for signature compatibility
 * @return the entities found (third constructor argument is 0); never
 *         {@code null}
 */
public static ArrayList<WikipediaEntity> getWikipediaEntities(String xml, boolean getId) {
    ArrayList<WikipediaEntity> senses = new ArrayList<WikipediaEntity>();
    if (xml == null) {
        return senses;
    }
    try {
        DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document dom = db.parse(new InputSource(new StringReader(xml)));
        NodeList senseNodes = dom.getElementsByTagName("sense");
        int senseCount = senseNodes.getLength();

        if (senseCount == 0) {
            Node article = dom.getElementsByTagName("Article").item(0);
            if (article != null) {
                NamedNodeMap attrs = article.getAttributes();
                senses.add(new WikipediaEntity(
                        attrs.getNamedItem("title").getTextContent(),
                        attrs.getNamedItem("id").getTextContent(), 0, "1.0"));
            }
            return senses;
        }

        // Loop-invariant: whether the document text marks a disambiguation page.
        boolean disambiguation = xml.contains("Disambiguation")
                && xml.contains("This is a disambiguation page");

        for (int i = 0; i < senseCount; i++) {
            Node sense = senseNodes.item(i);
            if (sense == null) {
                continue;
            }
            NamedNodeMap attrs = sense.getAttributes();
            Node titleAttr = attrs.getNamedItem("title");
            Node idAttr = attrs.getNamedItem("id");
            if (titleAttr == null || idAttr == null) {
                continue;
            }
            Node commonness = attrs.getNamedItem("priorProbability");
            double score;
            if (commonness == null) {
                if (!disambiguation) {
                    continue;
                }
                // Disambiguation pages carry no probabilities: spread uniformly.
                score = 1.0 / senseCount;
            } else {
                try {
                    score = Double.parseDouble(commonness.getTextContent());
                } catch (NumberFormatException e) {
                    continue;
                }
                // Written as a negated >= so that NaN is rejected too.
                if (!(score >= 0.01)) {
                    continue;
                }
            }
            senses.add(new WikipediaEntity(titleAttr.getTextContent(),
                    idAttr.getTextContent(), 0, String.valueOf(score)));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return senses;
}
/**
 * Placeholder: no filtering is implemented yet.
 *
 * @param senses candidate senses (currently ignored)
 * @return always a new, empty list
 */
public static ArrayList<WikipediaEntity> filterSenses(ArrayList<WikipediaEntity> senses){
    return new ArrayList<WikipediaEntity>();
}
/**
 * Extracts the candidate senses from a search response as
 * {@code {title, id, score}} string triples.
 *
 * <p>For each "sense" element:
 * <ul>
 *   <li>with a "commonness" attribute of at least 0.01, a triple is
 *       created carrying that value;</li>
 *   <li>without the attribute, a triple is created only when the document
 *       text marks a disambiguation page, with the score 1 / (number of
 *       sense elements);</li>
 *   <li>otherwise the sense is skipped. Previously one such sense aborted
 *       the loop and silently dropped every sense after it.</li>
 * </ul>
 * When there are no "sense" elements, the first "Article" element (if any)
 * is returned with score "1.0".
 *
 * @param xml   search response text; {@code null} yields an empty list
 *              (previously this threw a NullPointerException)
 * @param getId unused; kept for signature compatibility
 * @return the triples found; never {@code null}
 */
public static ArrayList<String[]> getWikipediaSenses(String xml,
        boolean getId) {
    ArrayList<String[]> senses = new ArrayList<String[]>();
    if (xml == null) {
        return senses;
    }
    try {
        DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document dom = db.parse(new InputSource(new StringReader(xml)));
        NodeList senseNodes = dom.getElementsByTagName("sense");
        int senseCount = senseNodes.getLength();

        if (senseCount == 0) {
            Node article = dom.getElementsByTagName("Article").item(0);
            if (article != null) {
                NamedNodeMap attrs = article.getAttributes();
                senses.add(new String[] {
                        attrs.getNamedItem("title").getTextContent(),
                        attrs.getNamedItem("id").getTextContent(), "1.0" });
            }
            return senses;
        }

        // Loop-invariant: whether the document text marks a disambiguation page.
        boolean disambiguation = xml.contains("Disambiguation")
                && xml.contains("This is a disambiguation page");

        for (int i = 0; i < senseCount; i++) {
            Node sense = senseNodes.item(i);
            if (sense == null) {
                continue;
            }
            NamedNodeMap attrs = sense.getAttributes();
            Node titleAttr = attrs.getNamedItem("title");
            Node idAttr = attrs.getNamedItem("id");
            if (titleAttr == null || idAttr == null) {
                continue;
            }
            Node commonness = attrs.getNamedItem("commonness");
            double score;
            if (commonness == null) {
                if (!disambiguation) {
                    continue;
                }
                // Disambiguation pages carry no commonness: spread uniformly.
                score = 1.0 / senseCount;
            } else {
                try {
                    score = Double.parseDouble(commonness.getTextContent());
                } catch (NumberFormatException e) {
                    continue;
                }
                // Written as a negated >= so that NaN is rejected too.
                if (!(score >= 0.01)) {
                    continue;
                }
            }
            senses.add(new String[] { titleAttr.getTextContent(),
                    idAttr.getTextContent(), String.valueOf(score) });
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return senses;
}
/**
 * Makes a query safe to splice into a URL with respect to whitespace:
 * surrounding whitespace is dropped and every internal run of whitespace
 * becomes a single "%20".
 *
 * <p>Only whitespace is handled; other reserved characters are left
 * untouched. Compared with the earlier version, trailing whitespace no
 * longer survives as a raw space (e.g. "India " stayed "India "), leading
 * whitespace no longer becomes a leading "%20", and whitespace-only input
 * now yields the empty string.
 *
 * @param query text to encode; must not be {@code null}
 * @return the query with whitespace normalised to "%20" separators
 */
public static String correctEncoding(String query) {
    return query.trim().replaceAll("\\s+", "%20");
}
/*
* Takes two words and gets the best Wikipedia Entities based on Compare
* score.
*/
/**
 * Picks, for two words, the pair of senses (one per word) with the highest
 * {@code compareIds} score.
 *
 * <p>Every '#' is removed from a word that starts with '#'. Results are
 * memoised under "compare:<word1>:<word2>".
 *
 * @param word1 first word
 * @param word2 second word
 * @return a map from each word to its chosen sense; the values are
 *         {@code null} when no pair scored above 0, and the map is empty
 *         when an exception interrupted the lookup. Returns {@code null}
 *         (uncached) when the search XML for either word is unavailable.
 */
public static HashMap<String, WikipediaEntity> getRelativeBestSenses(
        String word1, String word2) {
    if (word1.startsWith("#")) {
        word1 = word1.replace("#", "");
    }
    if (word2.startsWith("#")) {
        word2 = word2.replace("#", "");
    }

    String cacheKey = "compare:" + word1 + ":" + word2;
    HashMap<String, WikipediaEntity> chosen = new HashMap<String, WikipediaEntity>();

    try {
        if (cacheRelativeBestSenses.containsKey(cacheKey)) {
            return cacheRelativeBestSenses.get(cacheKey);
        }

        String firstXml = getXML(word1, false);
        String secondXml = getXML(word2, false);
        if (firstXml == null || secondXml == null) {
            return null;
        }

        List<WikipediaEntity> firstSenses = getWikipediaEntities(firstXml, false);
        List<WikipediaEntity> secondSenses = getWikipediaEntities(secondXml, false);

        WikipediaEntity winnerFirst = null;
        WikipediaEntity winnerSecond = null;
        double topScore = 0.0;

        // Exhaustive pairwise comparison; only a strictly better score replaces the current best.
        for (WikipediaEntity candidateFirst : firstSenses) {
            for (WikipediaEntity candidateSecond : secondSenses) {
                double pairScore = compareIds(candidateFirst.getWikiminerID(),
                        candidateSecond.getWikiminerID());
                if (pairScore > topScore) {
                    topScore = pairScore;
                    winnerFirst = candidateFirst;
                    winnerSecond = candidateSecond;
                }
            }
        }

        chosen.put(word1, winnerFirst);
        chosen.put(word2, winnerSecond);
    } catch (Exception e) {
        e.printStackTrace();
    }

    cacheRelativeBestSenses.put(cacheKey, chosen);
    return chosen;
}
/**
 * Asks the compare service for the relatedness of two article ids.
 *
 * <p>The score is read from the text of the first "Measure" element of the
 * response. Results are memoised in the shared cache under the request
 * URL. Every failure path (a response containing "unknownTerm", a missing
 * "Measure" element, or any exception, including network errors) caches
 * "0" for that URL, so a failed pair is not retried later.
 *
 * @param id1 first Wikipedia Miner id, appended to the URL as-is
 * @param id2 second Wikipedia Miner id, appended to the URL as-is
 * @return the relatedness score, or {@code 0.0} when none was obtained
 */
public static double compareIds(String id1, String id2) {
    // Alternative endpoint kept for reference:
    // http://ec2-50-19-209-97.compute-1.amazonaws.com:8080/wikipediaminer/services/compare?&ids1=<id1>&ids2=<id2>
    String urlStr = "http://wikipedia-miner.cms.waikato.ac.nz/services/compare?&ids1="
            + id1 + "&ids2=" + id2;
    BufferedReader in = null;
    try {
        if (cache.containsKey(urlStr)) {
            return Double.parseDouble(cache.get(urlStr));
        }

        URLConnection yc = new URL(urlStr).openConnection();
        // Decode explicitly instead of relying on the platform default charset.
        // NOTE(review): assumes the service answers in UTF-8 - confirm.
        in = new BufferedReader(new InputStreamReader(yc.getInputStream(), "UTF-8"));
        StringBuilder buf = new StringBuilder();
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            buf.append(inputLine);
        }
        String response = buf.toString();

        if (response.contains("unknownTerm")) {
            cache.put(urlStr, "0");
            return 0.0;
        }

        DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document dom = db.parse(new InputSource(new StringReader(response)));
        Node relation = dom.getElementsByTagName("Measure").item(0);
        if (relation != null) {
            String str = relation.getTextContent();
            // Parse before caching so an unparsable value is never stored.
            double score = Double.parseDouble(str);
            cache.put(urlStr, str);
            return score;
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Always release the connection's stream, including on read errors.
        if (in != null) {
            try {
                in.close();
            } catch (Exception ignored) {
                // best-effort close; nothing useful to do here
            }
        }
    }
    cache.put(urlStr, "0");
    return 0.0;
}
/**
 * Fetches the search-service response for a query, using the shared
 * URL-keyed cache.
 *
 * <p>Every '#' is removed from a query that starts with '#'. For
 * multi-word queries, each word that starts with a lower-case letter is
 * capitalised and the words are joined with "%20".
 *
 * <p>NOTE(review): lookups by id ({@code isId == true}) always return
 * {@code null}. The original code built an "&id=" URL and then returned
 * null before using it; that behaviour is preserved here - confirm
 * whether id lookup was disabled on purpose.
 *
 * <p>NOTE(review): single-word queries are sent unchanged (not
 * capitalised), as in the original - confirm whether that is intended.
 *
 * @param query search text (or an id when {@code isId} is true); must not
 *              be {@code null}
 * @param isId  whether {@code query} is an article id
 * @return the response text with line breaks removed, or {@code null} when
 *         the query is "wikipedia entry" (any case), {@code isId} is true,
 *         the request fails, or the response does not contain the text
 *         "sense" (such responses are not cached)
 */
public static String getXML(String query, boolean isId) {
    if (query.startsWith("#")) {
        query = query.replace("#", "");
    }
    if (query.equalsIgnoreCase("wikipedia entry")) {
        return null;
    }
    if (isId) {
        return null;
    }

    String[] words = query.split("\\s+");
    if (words.length > 1) {
        StringBuilder encoded = new StringBuilder();
        for (int i = 0; i < words.length; i++) {
            String word = words[i];
            if (word.length() > 0 && Character.isLowerCase(word.charAt(0))) {
                word = Character.toUpperCase(word.charAt(0)) + word.substring(1);
            }
            if (i > 0) {
                encoded.append("%20");
            }
            encoded.append(word);
        }
        query = encoded.toString();
    }

    // Alternative endpoint kept for reference:
    // http://ec2-50-19-209-97.compute-1.amazonaws.com:8080/wikipediaminer/services/search?
    String urlStr = "http://wikipedia-miner.cms.waikato.ac.nz/services/search?&complex=true"
            + "&query=" + query;

    // return from cache
    if (cache.containsKey(urlStr)) {
        return cache.get(urlStr);
    }

    BufferedReader in = null;
    try {
        URLConnection yc = new URL(urlStr).openConnection();
        // Decode explicitly instead of relying on the platform default charset.
        // NOTE(review): assumes the service answers in UTF-8 - confirm.
        in = new BufferedReader(new InputStreamReader(yc.getInputStream(), "UTF-8"));
        StringBuilder buf = new StringBuilder();
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            buf.append(inputLine);
        }
        String xml = buf.toString();
        if (!xml.contains("sense")) {
            return null;
        }
        cache.put(urlStr, xml);
        return xml;
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Always release the connection's stream, including on read errors.
        if (in != null) {
            try {
                in.close();
            } catch (Exception ignored) {
                // best-effort close; nothing useful to do here
            }
        }
    }
    return null;
}
/**
 * Manual smoke test: prints the search response for "India".
 *
 * @param args ignored
 */
public static void main(String[] args) {
    // To inspect category lookup instead, iterate getCategories("52648")
    // and print each entity's getText() and getWikiminerID().
    String response = getXML("India", false);
    System.out.println(response);
}
}