package com.personalityextractor.entity.extractor.frequencybased;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import com.personalityextractor.commons.data.Tweet;
import com.personalityextractor.data.source.Twitter;
import com.personalityextractor.data.source.Wikiminer;
import com.personalityextractor.entity.WikipediaEntity;
import com.personalityextractor.entity.extractor.EntityExtractFactory;
import com.personalityextractor.entity.extractor.IEntityExtractor;
import com.personalityextractor.entity.extractor.SennaNounPhraseExtractor;
import com.personalityextractor.entity.extractor.EntityExtractFactory.Extracter;
import com.personalityextractor.url.data.URLEntityExtractor;
import cs224n.util.Counter;
import cs224n.util.CounterMap;
import cs224n.util.PriorityQueue;
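/**
 * Frequency-based entity extractor. Collects noun phrases, hashtags and
 * entities from linked pages across a set of tweets, counts occurrences and
 * per-tweet co-occurrences, and resolves the top-ranked strings to
 * Wikipedia senses through Wikiminer.
 */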
public class TopNNPHashTagsExtractor implements IFrequencyBasedExtractor {
// minimum raw count an entity needs to survive the final cutoff
double threshold = 1.0;
// tweet id -> parsed tweet
HashMap<String, Tweet> tweetIDs = new HashMap<String, Tweet>();
// entity -> ids of the tweets that mention it
HashMap<String, ArrayList<String>> entityToTweetIDs = new HashMap<String, ArrayList<String>>();
// tweet id -> entities extracted from that tweet
HashMap<String, ArrayList<String>> tweetToEntities = new HashMap<String, ArrayList<String>>();
// symmetric co-occurrence counts between entities seen in the same tweet
CounterMap<String, String> cooccurrence = new CounterMap<String, String>();
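/*
 * Assigns each tweet a sequential numeric id and stores the parsed Tweet
 * under that id.
 */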
public void buildTweetIDMap(List<String> tweets) {
int count = 0;
for (String tw : tweets) {
count++;
tweetIDs.put(String.valueOf(count), new Tweet(tw));
}
}
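/*
 * Records that the tweet with the given id mentions the given entity,
 * keeping each entity's tweet-id list free of duplicates. This is the
 * single update path used by every extraction step below.
 */
private void indexEntity(String entity, String id) {
if (entityToTweetIDs.containsKey(entity)) {
if (!entityToTweetIDs.get(entity).contains(id)) {
entityToTweetIDs.get(entity).add(id);
}
} else {
ArrayList<String> ids = new ArrayList<String>();
ids.add(id);
entityToTweetIDs.put(entity, ids);
}
}
/**
 * Extracts candidate entities from the given tweets: noun phrases from the
 * tweet text, entities found in linked pages, and hashtags. Returns a
 * counter over the entities whose count exceeds the cutoff threshold.
 */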
@Override
public Counter<String> extract(List<String> allTweets) {
// de-duplicate the raw tweet list before processing
HashSet<String> distTweets = new HashSet<String>(allTweets);
allTweets = new ArrayList<String>(distTweets);
System.out.println("Distinct tweets: " + allTweets.size());
buildTweetIDMap(allTweets);
Counter<String> entityCounter = new Counter<String>();
IEntityExtractor extractor = EntityExtractFactory.produceExtractor(Extracter.NOUNPHRASE);
for (String id : tweetIDs.keySet()) {
HashSet<String> entitiesInTweet = new HashSet<String>();
Tweet tweet = tweetIDs.get(id);
if (!tweetToEntities.containsKey(id)) {
ArrayList<String> ents = new ArrayList<String>();
tweetToEntities.put(id, ents);
}
// extract noun-phrase entities from each sentence of the tweet
for (String s : tweet.getSentences()) {
List<String> entities = extractor.extract(s.replaceAll("&", "and"));
entitiesInTweet.addAll(entities);
for (String entity : entities) {
tweetToEntities.get(id).add(entity);
entityCounter.incrementCount(entity, 1.0);
indexEntity(entity, id);
}
}
// extract entities from the title and top terms of any linked pages;
// guard both extractor calls, since either may return null
if (tweet.getLinks() != null) {
for (String link : tweet.getLinks()) {
List<String> entities = URLEntityExtractor.extractEntitiesinTitle(link, extractor);
if (entities == null) {
entities = new ArrayList<String>();
}
List<String> topEntities = URLEntityExtractor.extractTopEntities(link, extractor);
if (topEntities != null) {
entities.addAll(topEntities);
}
entitiesInTweet.addAll(entities);
for (String entity : entities) {
tweetToEntities.get(id).add(entity);
entityCounter.incrementCount(entity, 1.0);
indexEntity(entity, id);
}
}
}
// hashtags count as entities in their own right
for (String hashTag : tweet.getHashTags()) {
entitiesInTweet.add(hashTag);
tweetToEntities.get(id).add(hashTag);
entityCounter.incrementCount(hashTag, 1.0);
indexEntity(hashTag, id);
}
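// count symmetric co-occurrences between all distinct entities in this tweet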
for (String ent1 : entitiesInTweet) {
for (String ent2 : entitiesInTweet) {
if (!ent1.equalsIgnoreCase(ent2)) {
cooccurrence.incrementCount(ent1, ent2, 1.0);
}
}
}
}
// second pass: for hashtags seen more than once, re-run Senna over the
// tweets that contain them and promote the noun phrases found there.
// the queue is a snapshot, so entities added below are not revisited here
SennaNounPhraseExtractor sp = new SennaNounPhraseExtractor();
PriorityQueue<String> keys = entityCounter.asPriorityQueue();
while (keys.hasNext()) {
String entity = keys.next();
if (entity.startsWith("#") && entityCounter.getCount(entity) > 1) {
// iterate over a copy of the id list: indexEntity below mutates the index
for (String id : new ArrayList<String>(entityToTweetIDs.get(entity))) {
Tweet t = tweetIDs.get(id);
for (String sent : t.getSentences()) {
String sennaOutput = SennaNounPhraseExtractor.getSennaOutput(sent);
// proper noun phrases get a single count
for (String np : sp.getProperNounPhrases(sennaOutput)) {
entityCounter.incrementCount(np, 1.0);
tweetToEntities.get(id).add(np);
indexEntity(np, id);
}
// common noun phrases are weighted double
for (String np : sp.getCommonNounPhrases(sennaOutput)) {
entityCounter.incrementCount(np, 2.0);
tweetToEntities.get(id).add(np);
indexEntity(np, id);
}
}
}
}
}
// apply cutoff
Counter<String> finalEntityCounter = new Counter<String>();
for (String entity : entityCounter.keySet()) {
double count = entityCounter.getCount(entity);
if (count > this.threshold) {
finalEntityCounter.setCount(entity, count);
}
}
return finalEntityCounter;
}
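/**
 * Resolves the top 100 entities to Wikipedia senses. Each entity's
 * co-occurring entities (or, failing that, all entities from the tweets
 * that mention it) serve as context: Wikiminer picks the relative best
 * sense for each (entity, context) pair, and the most-voted sense wins.
 */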
public HashMap<String, WikipediaEntity> resolve(Counter<String> entityCounter) {
HashMap<String, WikipediaEntity> resolvedEntities = new HashMap<String, WikipediaEntity>();
PriorityQueue<String> q = entityCounter.asPriorityQueue();
// only attempt to resolve the 100 highest-scoring entities
int entityCount = 0;
while (q.hasNext()) {
entityCount++;
if (entityCount > 100) {
break;
}
String entity = q.next();
Counter<WikipediaEntity> senseCounter = new Counter<WikipediaEntity>();
String entityXML = Wikiminer.getXML(entity, false);
if (entityXML == null)
continue;
//System.out.println("Resolving : " + entity);
List<String> oentities = new ArrayList<String>();
if (cooccurence.keySet().contains(entity)) {
PriorityQueue<String> sorted = cooccurence.getCounter(entity).asPriorityQueue();
int count =0;
// while (sorted.hasNext() && count<5) {
while (sorted.hasNext()) {
oentities.add(sorted.next());
count++;
}
} else {
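// no co-occurrence data: use every entity from the tweets that mention this one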
for (String id : entityToTweetIDs.get(entity)) {
List<String> alloentities = tweetToEntities.get(id);
for (String e : alloentities) {
if (!oentities.contains(e)) {
oentities.add(e);
}
}
}
}
// vote: for each context entity, ask Wikiminer for the relative best
// senses of the pair and count the sense it assigns to this entity
for (String oentity : oentities) {
if (Wikiminer.getXML(oentity, false) == null)
continue;
if (entity.equalsIgnoreCase(oentity))
continue;
HashMap<String, WikipediaEntity> relativeBestSenses = Wikiminer.getRelativeBestSenses(entity, oentity);
if (relativeBestSenses != null && relativeBestSenses.containsKey(entity)) {
senseCounter.incrementCount(relativeBestSenses.get(entity), 1.0);
}
}
WikipediaEntity best = senseCounter.argMax();
if (best != null) {
System.out.println("Resolved: " + best.getText());
best.count += entityCounter.getCount(entity);
resolvedEntities.put(entity, best);
} else {
// no usable context votes: fall back to Wikiminer's first sense, if any
List<WikipediaEntity> wikiSenses = Wikiminer.getWikipediaEntities(entityXML, false);
if (wikiSenses != null && !wikiSenses.isEmpty()) {
resolvedEntities.put(entity, wikiSenses.get(0));
System.out.println("Resolved: " + wikiSenses.get(0).getText());
}
}
}
return resolvedEntities;
}
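/*
 * Simple end-to-end run: fetch a user's recent tweets, extract and rank
 * entities, print co-occurrence statistics, then resolve to Wikipedia.
 */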
public static void main(String[] args) {
TopNNPHashTagsExtractor tt = new TopNNPHashTagsExtractor();
Twitter t = new Twitter();
List<String> tweets = t.fetchTweets("werner", 200);
System.out.println("Fetched tweets: " + tweets.size());
Counter<String> entities = tt.extract(tweets);
System.out.println("Done getting entities..");
PriorityQueue<String> pq = entities.asPriorityQueue();
while (pq.hasNext()) {
String entity = pq.next();
System.out.println(entity + " TotalCount:" + entities.getCount(entity));
PriorityQueue<String> pq1 = tt.cooccurrence.getCounter(entity).asPriorityQueue();
int count = 0;
while (pq1.hasNext() && count < 5) {
count++;
String key = pq1.next();
System.out.println(key + " Cooccurrence: " + tt.cooccurrence.getCount(entity, key));
}
System.out.println("\n");
}
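// report the Wikipedia sense chosen for each resolved entity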
HashMap<String, WikipediaEntity> resolved = tt.resolve(entities);
for (String entity : resolved.keySet()) {
WikipediaEntity we = resolved.get(entity);
System.out.println("Entity: " + entity + " Count: "
+ entities.getCount(entity) + " Resolution: "
+ we.getText() + " " + we.getWikiminerID());
}
}
}