package com.personalityextractor.entity.resolver;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import tathya.db.YahooBOSS;
import tathya.text.tokenizer.TwitterTokenizer;

import com.personalityextractor.data.source.Wikiminer;
import com.personalityextractor.entity.Entity;
import com.personalityextractor.entity.extractor.IEntityExtractor;
import com.personalityextractor.entity.extractor.NounPhraseExtractor;

import cs224n.util.Counter;
import cs224n.util.PriorityQueue;

public class ExtractEntities implements IEntityExtractor {

    static final HashSet<String> stopWords = new HashSet<String>();
    static NounPhraseExtractor npe = new NounPhraseExtractor();

    static {
        // Load the English stopword list once. NOTE: this path is machine-specific.
        try {
            BufferedReader br = new BufferedReader(new FileReader(
                    "/Users/tejaswi/Documents/workspace/PersonalityExtraction/data/chimps_list-of-english-stopwords-2010-07-01_00-50-22/english_stopwords.tsv"));
            String line = "";
            while ((line = br.readLine()) != null) {
                stopWords.add(line.trim());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public ArrayList<String> extract(String record) {
        ArrayList<String> allEntities = new ArrayList<String>();
        for (String s : getEntitiesinTweet(record)) {
            allEntities.add(s.trim());
        }
        return allEntities;
    }

    public static HashSet<String> getCapitalizedWords(String token) {
        HashSet<String> capitalWords = new HashSet<String>();
        try {
            Pattern p = Pattern.compile("^[A-Z]+.*");
            String[] split = token.split("\\s+");
            for (String s : split) {
                if (p.matcher(s).matches()) {
                    capitalWords.add(s.toLowerCase());
                }
            }
            return capitalWords;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    public HashSet<String> getEntitiesinTweet(String tweet) {
        HashSet<String> entities = new HashSet<String>();
        TwitterTokenizer tweetTokenizer = new TwitterTokenizer();
        for (String token : tweetTokenizer.tokenize(tweet)) {
            token = token.trim();
            // Strip stray punctuation around tokens. The original string literal was
            // split across a line break; the third alternative is reconstructed here.
            token = token.replaceAll(
                    "( [^a-zA-Z0-9\\.]) | ( [^a-zA-Z0-9\\.] ) | ([^a-zA-Z0-9\\.] )", " ");
            try {
                // Keep capitalized, non-stopword tokens as entity candidates.
                Pattern p = Pattern.compile("^[A-Z]+.*");
                String[] split = token.split("\\s+");
                for (String s : split) {
                    s = s.trim();
                    if (p.matcher(s).matches() && !stopWords.contains(s.toLowerCase())) {
                        entities.add(s);
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            // Also keep noun phrases that are not stopwords.
            for (String np : npe.extract(token)) {
                if (!stopWords.contains(np.trim().toLowerCase())) {
                    entities.add(np.trim());
                }
            }
        }
        return entities;
    }
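    // Usage sketch (illustrative only; the tweet text below is hypothetical and
    // not from the original data):
    //
    //   ExtractEntities ee = new ExtractEntities();
    //   HashSet<String> mentions = ee.getEntitiesinTweet("Watching the Giants game in San Francisco tonight");
    //   // mentions would hold capitalized, non-stopword tokens plus whatever
    //   // noun phrases NounPhraseExtractor returns, e.g. "Giants", "San", "Francisco".
    //   ArrayList<String> flat = ee.extract("Watching the Giants game in San Francisco tonight");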
    public List<String[]> getRankedEntities(String entity, List<String> contextPhrases) {
        List<String[]> rankedEntities = new ArrayList<String[]>();
        PriorityQueue<String[]> queue = new PriorityQueue<String[]>();
        // NOTE: this removes the entity from the caller's list, so a shared
        // context list shrinks across repeated calls.
        contextPhrases.remove(entity);
        StringBuffer contextQuery = new StringBuffer();
        for (String c : contextPhrases) {
            contextQuery.append("\"" + c + "\"" + " ");
        }
        int contextCount = YahooBOSS.makeQuery(contextQuery.toString());
        String xml = "";
        if ((xml = Wikiminer.getXML(entity, false)) != null) {
            ArrayList<String[]> senses = Wikiminer.getWikipediaSenses(xml, true);
            for (String[] senseArr : senses) {
                // Score each Wikipedia sense by its web co-occurrence with the
                // context phrases, normalized by the context-only hit count.
                int senseCount = YahooBOSS.makeQuery('"' + senseArr[0] + "\" " + contextQuery.toString());
                queue.add(senseArr, ((double) senseCount / (double) contextCount));
            }
        }
        while (queue.hasNext()) {
            rankedEntities.add(queue.next());
        }
        return rankedEntities;
    }

    public static void main(String[] args) {
        ExtractEntities ee = new ExtractEntities();
        DocumentBuilder db = null;
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        BufferedWriter bw = null;
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader("data/" + args[0] + ".txt"));
            bw = new BufferedWriter(new FileWriter("data/" + args[0] + "_wiki_entities.txt"));
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            db = dbf.newDocumentBuilder();
        } catch (Exception e) {
            e.printStackTrace();
        }
        String line = "";
        try {
            while ((line = br.readLine()) != null) {
                bw.write("=====================================================\n");
                bw.write("Tweet: " + line + "\n");
                bw.write("=====================================================\n");
                HashSet<String> entities = ee.getEntitiesinTweet(line);
                List<String> contextPhrases = new ArrayList<String>(entities);
                for (String entity : entities) {
                    bw.write("query: " + entity + "\n");
                    List<String[]> rankedEntities = ee.getRankedEntities(entity, contextPhrases);
                    // bw.write("RankedEntities: " + rankedEntities + "\n");
                    for (String[] entityArr : rankedEntities) {
                        bw.write("Entity: " + entityArr[0] + "\n");
                        String xml = Wikiminer.getXML(entityArr[1], true);
                        bw.write("RankedTypes: "
                                + Wikiminer.getRankedTypes(entityArr[0], xml, contextPhrases, 5) + "\n");
                    }
                    bw.write("--------------------------------------------------\n");
                }
                bw.write("\n");
            }
            bw.flush();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
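    // Disambiguation sketch (numbers are made up for illustration): if the
    // context-only Yahoo BOSS query returns 2000 hits and the query
    // "\"<sense>\" <context>" returns 500, that sense is enqueued with score
    // 500.0 / 2000.0 = 0.25, and getRankedEntities() emits senses in decreasing
    // score order. A hypothetical invocation of main():
    //
    //   java com.personalityextractor.entity.resolver.ExtractEntities myhandle
    //   // reads data/myhandle.txt and writes data/myhandle_wiki_entities.txt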
    public HashMap<String, Entity> getAllEntities(String handle) {
        HashMap<String, Entity> allEntities = new HashMap<String, Entity>();
        try {
            BufferedReader br = new BufferedReader(new FileReader("data/" + handle + ".txt"));
            BufferedWriter bw = new BufferedWriter(new FileWriter("data/" + handle + "_entities.txt"));
            BufferedWriter bw1 = new BufferedWriter(new FileWriter("data/" + handle + "_statistics.txt"));
            String line = "";
            Counter<String> nPhraseCounter = new Counter<String>();
            Counter<String> capitalsCounter = new Counter<String>();
            while ((line = br.readLine()) != null) {
                line = line.replaceAll("RT", "");
                TwitterTokenizer tweetTokenizer = new TwitterTokenizer();
                for (String token : tweetTokenizer.tokenize(line)) {
                    token = token.trim();
                    // Same punctuation clean-up as getEntitiesinTweet(); the literal was
                    // split across a line break in the original and is reconstructed here.
                    token = token.replaceAll(
                            "( [^a-zA-Z0-9\\.]) | ( [^a-zA-Z0-9\\.] ) | ([^a-zA-Z0-9\\.] )", " ");
                    ArrayList<String> nPhrases = new ArrayList<String>();
                    HashSet<String> capitalWords = new HashSet<String>();
                    try {
                        Pattern p = Pattern.compile("^[A-Z]+.*");
                        String[] split = token.split("\\s+");
                        for (String s : split) {
                            if (p.matcher(s).matches() && !stopWords.contains(s.toLowerCase())) {
                                capitalWords.add(s.toLowerCase());
                                capitalsCounter.incrementCount(s.toLowerCase(), 1.0);
                                if (allEntities.containsKey(s.trim())) {
                                    Entity e = allEntities.get(s.trim());
                                    if (!e.tweets.contains(line)) {
                                        e.tweets.add(line);
                                        allEntities.put(s.trim(), e);
                                    }
                                } else {
                                    Entity e = new Entity(s.trim());
                                    e.tweets.add(line);
                                    allEntities.put(s.trim(), e);
                                }
                            }
                        }
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                    bw.write("===============================================\n");
                    bw.write(token + "\n");
                    System.out.println("token: " + token);
                    for (String np : npe.extract(token)) {
                        if (!stopWords.contains(np.trim().toLowerCase())) {
                            nPhrases.add(np.trim());
                            nPhraseCounter.incrementCount(np.trim(), 1.0);
                            if (allEntities.containsKey(np.trim())) {
                                Entity e = allEntities.get(np.trim());
                                if (!e.tweets.contains(line)) {
                                    e.tweets.add(line);
                                    allEntities.put(np.trim(), e);
                                }
                            } else {
                                Entity e = new Entity(np.trim());
                                e.tweets.add(line);
                                allEntities.put(np.trim(), e);
                            }
                        }
                    }
                    bw.write("===============================================\n");
                    bw.write("Noun-Phrases: " + nPhrases.toString() + "\n");
                    // HashSet<String> capitalWords = getCapitalizedWords(token);
                    if (capitalWords == null) {
                        bw.write("No capitals\n\n");
                    } else {
                        bw.write("Capitals: " + capitalWords.toString() + "\n\n");
                    }
                }
                bw.flush();
            }
            // Write frequency statistics for noun phrases and capitalized words,
            // most frequent first.
            PriorityQueue<String> nPhraseQueue = nPhraseCounter.asPriorityQueue();
            PriorityQueue<String> capitalQueue = capitalsCounter.asPriorityQueue();
            while (nPhraseQueue.hasNext()) {
                String np = nPhraseQueue.next();
                bw1.write(np + " " + nPhraseCounter.getCount(np) + "\n");
            }
            bw1.write("=========================================================\n");
            while (capitalQueue.hasNext()) {
                String cap = capitalQueue.next();
                bw1.write(cap + " " + capitalsCounter.getCount(cap) + "\n");
            }
            bw1.flush();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return allEntities;
    }
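    // Usage sketch for getAllEntities() (hypothetical handle, not from the
    // original sources):
    //
    //   ExtractEntities ee = new ExtractEntities();
    //   HashMap<String, Entity> byMention = ee.getAllEntities("myhandle");
    //   // side effects: writes data/myhandle_entities.txt and data/myhandle_statistics.txt
    //   for (String mention : byMention.keySet()) {
    //       System.out.println(mention + " -> " + byMention.get(mention).tweets.size() + " tweets");
    //   }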
    // public static void main(String[] args){
    //     ExtractEntities ee = new ExtractEntities();
    //     HashMap<String, Entity> allEntities = ee.getAllEntities(args[0]);
    //     BufferedWriter bw = null;
    //     FreebaseWrapper fb = FreebaseWrapper.getInstance();
    //     try{
    //         bw = new BufferedWriter(new FileWriter("data/"+args[0]+"_freebase_entities.txt"));
    //         for (String entity : allEntities.keySet()) {
    //             if(entity.trim().length()<2)
    //                 continue;
    //             bw.write("=====================================================\n");
    //             bw.write("Entity: "+entity+"\n");
    //             System.out.println("Entity is "+entity);
    //             bw.write("=====================================================\n");
    //             List<JSON> rankedEntities = fb.getRankedEntities(entity, 10, allEntities.get(entity).tweets);
    //             for(int i=0; i<rankedEntities.size(); i++){
    //                 JSON rentity = rankedEntities.get(i);
    //                 bw.write("[Entity: "
    //                         + (String) rentity.get("name").value() + " "
    //                         + (String) rentity.get("id").value()+"]");
    //             }
    //             Entity e = allEntities.get(entity);
    //             bw.write("\n-----------------------------------------------------\n");
    //             for(String tweet : e.tweets) {
    //                 bw.write(tweet+"\n");
    //             }
    //             bw.write("\n");
    //         }
    //         bw.flush();
    //     }catch(Exception e){
    //         e.printStackTrace();
    //     }
    // }
    //
    // public static void main(String[] args) {
    //     ExtractEntities ee = new ExtractEntities();
    //     HashMap<String, Entity> allEntities = ee.getAllEntities(args[0]);
    //     DocumentBuilder db = null;
    //     DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    //     BufferedWriter bw = null;
    //     try{
    //         bw = new BufferedWriter(new FileWriter("data/"+args[0]+"_wiki_entities.txt"));
    //     }catch(Exception e){
    //         e.printStackTrace();
    //     }
    //     try {
    //         db = dbf.newDocumentBuilder();
    //     } catch (Exception e) {
    //         e.printStackTrace();
    //     }
    //
    //     for (String entity : allEntities.keySet()) {
    //         String xml = "";
    //         if ((xml = Wikiminer.getXML(entity, false)) != null) {
    //             try {
    //                 ArrayList<String[]> senses = Wikiminer.getWikipediaSenses(xml, true);
    //                 bw.write("=====================================================\n");
    //                 bw.write("Entity: "+entity+"\n");
    //                 bw.write("=====================================================\n");
    //
    //                 ArrayList<String> contextPhrases = new ArrayList<String>();
    //                 for(String tweet : allEntities.get(entity).tweets)
    //                     contextPhrases.addAll(ee.getEntitiesinTweet(tweet));
    //
    //                 PriorityQueue<String> sensesQueue = new PriorityQueue<String>();
    //                 for(String[] sense : senses){
    //                     if(sense[0].equalsIgnoreCase("wikipedia entry")) {
    //                         bw.write(sense[0]+"\n");
    //                         continue;
    //                     }
    //                     String xmlSense = Wikiminer.getXML(sense[1], true);
    //                     sensesQueue.add(sense[0], Wikiminer.getPMI(xmlSense, contextPhrases));
    //                 }
    //
    //                 bw.write(sensesQueue.toString()+"\n");
    //                 Entity e = allEntities.get(entity);
    //                 bw.write("-----------------------------------------------------\n");
    //                 for(String tweet : e.tweets) {
    //                     bw.write(tweet+"\n");
    //                 }
    //                 bw.write("\n");
    //                 bw.flush();
    //             } catch (Exception e) {
    //                 e.printStackTrace();
    //                 System.out.println(xml);
    //             }
    //         }
    //     }
    // }
}
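// Note: the commented-out main() variants above are alternate experiments kept in
// the source. The first ranks candidate entities with FreebaseWrapper; the second
// ranks Wikipedia senses by Wikiminer.getPMI() against context phrases, instead of
// the Yahoo BOSS co-occurrence ratio used by the live main() and getRankedEntities().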