package com.personalityextractor.entity.extractor.frequencybased;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;

import com.personalityextractor.commons.FileRW;
import com.personalityextractor.commons.ReadJSON;
import com.personalityextractor.commons.data.Tweet;
import com.personalityextractor.data.source.Wikiminer;
import com.personalityextractor.entity.WikipediaEntity;
import com.personalityextractor.entity.extractor.EntityExtractFactory;
import com.personalityextractor.entity.extractor.EntityExtractFactory.Extracter;
import com.personalityextractor.entity.extractor.IEntityExtractor;
import com.personalityextractor.url.data.URLEntityExtractor;

import cs224n.util.Counter;
import cs224n.util.CounterMap;
import cs224n.util.PriorityQueue;

/**
 * Frequency-based entity extractor. Collects candidate entities (noun phrases,
 * linked-page titles, and hashtags) from a list of tweets, keeps the ones that
 * occur more than {@code threshold} times, and resolves each surviving entity
 * to a Wikipedia sense via Wikiminer, guided by co-occurrence statistics.
 */
public class TopNPExtractor implements IFrequencyBasedExtractor {

    /** Minimum count an entity must exceed to survive the frequency cutoff. */
    double threshold = 1.0;

    /** How often two entities appear in the same tweet; used for disambiguation. */
    CounterMap<String, String> cooccurence = new CounterMap<String, String>();

    /**
     * Extracts candidate entities from the given tweets and returns those whose
     * frequency exceeds {@code threshold}. Also populates {@code cooccurence}
     * with pairwise co-occurrence counts within each tweet.
     */
    @Override
    public Counter<String> extract(List<String> allTweets) {
        // De-duplicate tweets and drop @-replies.
        HashSet<String> distTweets = new HashSet<String>(allTweets);
        allTweets = new ArrayList<String>(distTweets);
        List<String> newList = new ArrayList<String>();
        for (int i = 0; i < allTweets.size(); i++) {
            if (!allTweets.get(i).startsWith("@")) {
                newList.add(allTweets.get(i));
            }
        }
        allTweets = newList;

        Counter<String> entityCounter = new Counter<String>();
        IEntityExtractor extractor = EntityExtractFactory
                .produceExtractor(Extracter.NOUNPHRASE);

        for (String tw : allTweets) {
            try {
                System.out.println(tw);
                HashSet<String> entitiesinTweet = new HashSet<String>();
                Tweet tweet = new Tweet(tw);

                // Noun phrases from each sentence of the tweet.
                for (String sentence : tweet.getSentences()) {
                    List<String> entities = extractor.extract(sentence);
                    if (entities != null) {
                        entitiesinTweet.addAll(entities);
                        entityCounter.incrementAll(entities, 1.0);
                    }
                }

                // Entities mentioned in the titles of linked pages.
                for (String link : tweet.getLinks()) {
                    List<String> entities = URLEntityExtractor
                            .extractEntitiesinTitle(link, extractor);
                    if (entities != null) {
                        entitiesinTweet.addAll(entities);
                        entityCounter.incrementAll(entities, 1.0);
                    }
                }

                // Hashtags count as entities as well.
                entitiesinTweet.addAll(tweet.getHashTags());
                entityCounter.incrementAll(tweet.getHashTags(), 1.0);
                System.out.println(entitiesinTweet);

                // Record pairwise co-occurrence within this tweet.
                for (String ent1 : entitiesinTweet) {
                    for (String ent2 : entitiesinTweet) {
                        if (!ent1.equalsIgnoreCase(ent2)) {
                            cooccurence.incrementCount(ent1, ent2, 1.0);
                        }
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        // Apply the frequency cutoff: keep only entities seen more than `threshold` times.
        Counter<String> finalEntityCounter = new Counter<String>();
        for (String entity : entityCounter.keySet()) {
            double count = entityCounter.getCount(entity);
            if (count > this.threshold) {
                finalEntityCounter.setCount(entity, count);
            }
        }
        return finalEntityCounter;
    }
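
    /**
     * Disambiguates each extracted entity to a Wikipedia sense. For every
     * entity, up to five of its most frequently co-occurring entities serve as
     * context: each context entity votes for a sense via
     * Wikiminer.getRelativeBestSenses, and the sense with the most votes wins.
     * If no context-based sense emerges, the first sense Wikiminer returns for
     * the entity is used as a fallback.
     */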
    public HashMap<String, WikipediaEntity> resolve(Counter<String> entityCounter) {
        HashMap<String, WikipediaEntity> resolvedEntities = new HashMap<String, WikipediaEntity>();
        for (String entity : entityCounter.keySet()) {
            Counter<WikipediaEntity> senseCounter = new Counter<WikipediaEntity>();
            String entityXML = Wikiminer.getXML(entity, false);
            if (entityXML == null)
                continue;

            // Collect up to five of the entities that co-occur most often with this one.
            List<String> oentities = new ArrayList<String>();
            if (cooccurence.keySet().contains(entity)) {
                PriorityQueue<String> sorted = cooccurence.getCounter(entity)
                        .asPriorityQueue();
                int count = 0;
                while (sorted.hasNext() && count < 5) {
                    oentities.add(sorted.next());
                    count++;
                }
            }

            // Each co-occurring entity votes for this entity's best relative sense.
            for (String oentity : oentities) {
                if (Wikiminer.getXML(oentity, false) == null)
                    continue;
                if (entity.equalsIgnoreCase(oentity))
                    continue;
                // System.out.println("comparing with " + oentity);
                HashMap<String, WikipediaEntity> relativeBestSenses = Wikiminer
                        .getRelativeBestSenses(entity, oentity);
                if (relativeBestSenses != null
                        && relativeBestSenses.containsKey(entity)) {
                    senseCounter.incrementCount(relativeBestSenses.get(entity), 1.0);
                }
            }

            if (senseCounter.argMax() != null) {
                // Context-based disambiguation succeeded: take the most-voted sense.
                WikipediaEntity we = senseCounter.argMax();
                // we.count = entityCounter.getCount(entity);
                resolvedEntities.put(entity, we);
            } else {
                // Fallback: take the first sense Wikiminer returns for this entity.
                List<WikipediaEntity> wikiSenses = Wikiminer.getWikipediaEntities(entityXML, false);
                try {
                    resolvedEntities.put(entity, wikiSenses.get(0));
                } catch (Exception e) {
                    e.printStackTrace();
                    System.out.println("Entity: " + entity);
                    System.out.println(entityXML);
                }
            }
        }
        return resolvedEntities;
    }

    /**
     * Token overlap between two entity strings: the number of matching token
     * pairs divided by the total number of tokens in both strings.
     */
    public static double getOverlap(String ent1, String ent2) {
        String[] ent1_split = ent1.toLowerCase().split("\\s+");
        String[] ent2_split = ent2.toLowerCase().split("\\s+");
        double overlap = 0.0;
        for (String e1 : ent1_split) {
            for (String e2 : ent2_split) {
                if (e1.equalsIgnoreCase(e2))
                    overlap++;
            }
        }
        return (overlap / (ent1_split.length + ent2_split.length));
    }

    public static void main(String[] args) {
        System.out.println(getOverlap("Capital Fund Focused", "Capital (economics)"));

        // Full pipeline (commented out): read tweets from a JSON file,
        // extract and rank entities, print co-occurrence statistics, and
        // resolve each entity to a Wikipedia sense.
        //
        // TopNPExtractor tt = new TopNPExtractor();
        //
        // ReadJSON rjs = new ReadJSON();
        // List<String> json = FileRW.getLinesinFile(args[0]);
        // List<String> tweets = new ArrayList<String>();
        // for (String son : json) {
        //     if (son.length() == 0)
        //         continue;
        //     tweets.addAll(rjs.parseJSONArray(son));
        // }
        //
        // Counter<String> entities = tt.extract(tweets);
        // System.out.println("Done getting entities..");
        //
        // PriorityQueue<String> pq = entities.asPriorityQueue();
        // while (pq.hasNext()) {
        //     String entity = pq.next();
        //     System.out.println(entity + " TotalCount:"
        //             + entities.getCount(entity));
        //     PriorityQueue<String> pq1 = tt.cooccurence.getCounter(entity)
        //             .asPriorityQueue();
        //     int count = 0;
        //     while (pq1.hasNext() && count < 5) {
        //         count++;
        //         String key = pq1.next();
        //         System.out.println(key + " Cooccurence: "
        //                 + tt.cooccurence.getCount(entity, key));
        //     }
        //     System.out.println("\n");
        // }
        //
        // HashMap<String, WikipediaEntity> resolved = tt.resolve(entities);
        // for (String entity : resolved.keySet()) {
        //     WikipediaEntity we = resolved.get(entity);
        //     System.out.println("Entity: " + entity + " Count: "
        //             + entities.getCount(entity) + " Resolution: "
        //             + we.getText() + " " + we.getWikiminerID());
        // }
    }
}