package processing.hashtag.baseline; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; import java.io.ObjectInput; import java.io.ObjectInputStream; import java.io.ObjectOutput; import java.io.ObjectOutputStream; import java.io.OutputStream; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.Map; import net.sf.javaml.utils.MathUtils; public class HashtagEntropyCalculator { /** * Compute the hashtag entropy of all the hashtags and create the entropy. * * @return */ public static HashMap<Integer, Double> computeAllHashtagEntropyMap( HashMap<String, HashMap<Integer, ArrayList<Long>>> userTagTimestamps, int numberOfIntervals) { // compute the duration for which the dataset is there starting from the // first. Duration globalUsageDuration = computeWholeDatasetHashtagUsageDuration(userTagTimestamps); // Given the global usage duration and total number of duration interval // find out the duration interval. long durationSingleInterval = computeSingleIntervalDuration(globalUsageDuration, numberOfIntervals); // Given the global usage duration and duration of single interval ArrayList<Duration> binList = getIntervalDurationList(globalUsageDuration, durationSingleInterval); // number of times a hashtag occur in an interval. HashMap<Integer, HashMap<Integer, Integer>> hashtagIntervalCount = computeIntervalHashtagCount( userTagTimestamps, binList); // sum of usage of hashtags in all such intervals. HashMap<Integer, Double> hashtagAllIntervalSumCount = computeHashtagIntervalCountSum(hashtagIntervalCount, binList); // probability of a hashtag for a given interval. HashMap<Integer, HashMap<Integer, Double>> hashtagIntervalProbabilityScores = computeHashtagIntervalProbabilityScores( hashtagIntervalCount, hashtagAllIntervalSumCount, binList.size()); // create a hashtag entropy map for the users. HashMap<Integer, Double> hashtagEntropyMap = computeHashtagEntropyValue(hashtagIntervalProbabilityScores, binList.size()); return hashtagEntropyMap; } /** * * Set the minimum timestamp as the start timestamp and the maximum * timestamp as end timestamp in the dataset. * @param userTagTimestamps * a hashmap mapping users to tags and ArrayList of timestamps * @return */ private static Duration computeWholeDatasetHashtagUsageDuration( HashMap<String, HashMap<Integer, ArrayList<Long>>> userTagTimestamps) { //TODO: Efficiency Duration duration = new Duration(); for (String user : userTagTimestamps.keySet()) { if (userTagTimestamps.get(user) != null) { for (Integer tag : userTagTimestamps.get(user).keySet()) { if (userTagTimestamps.get(user).get(tag) != null) { ArrayList<Long> hastagUsageTimestampList = userTagTimestamps.get(user).get(tag); for (Long timestamp : hastagUsageTimestampList) { if (timestamp < duration.getStartTime() || duration.getStartTime() == 0) { duration.setStartTime(timestamp); } else if (timestamp > duration.getEndTime()) { duration.setEndTime(timestamp); } } } } } } return duration; } /** * Compute the single interval duration given global interval and number of * intervals within the global duration. * * @param duration * @param numberOfIntervals * @return */ private static int computeSingleIntervalDuration(Duration duration, int numberOfIntervals) { int interval = 0; double durationInterval = duration.getEndTime() - duration.getStartTime(); interval = (int) durationInterval / numberOfIntervals; if(interval == 0){ interval = 1; } return interval; } /** * Get the list of interval duration starting from the start time of * {@code globalUsageDuration}. * * @param globalUsageDuration * @param intervalDuration * @return */ private static ArrayList<Duration> getIntervalDurationList(Duration globalUsageDuration, long intervalDuration) { ArrayList<Duration> durationArrayList = new ArrayList<Duration>(); long currentTime = globalUsageDuration.getStartTime(); Duration duration = new Duration(); while (currentTime <= globalUsageDuration.getEndTime()) { duration = new Duration(); duration.setStartTime(currentTime); duration.setEndTime(currentTime + intervalDuration); durationArrayList.add(duration); currentTime += intervalDuration; } return durationArrayList; } /** * Convert userTagTimestamp map to hashtagIntervalCount map. * * @param userTagTimes * @return */ private static HashMap<Integer, HashMap<Integer, Integer>> computeIntervalHashtagCount( HashMap<String, HashMap<Integer, ArrayList<Long>>> userTagTimes, ArrayList<Duration> binList) { HashMap<Integer, HashMap<Integer, Integer>> hashtagIntervalCount = new HashMap<Integer, HashMap<Integer, Integer>>(); for (String user : userTagTimes.keySet()) { for (Integer tag : userTagTimes.get(user).keySet()) { ArrayList<Long> timestamps = userTagTimes.get(user).get(tag); for (Long timestamp : timestamps) { int intervalIndex = getIntervalIndex(timestamp, binList); if (!hashtagIntervalCount.containsKey(tag)) { hashtagIntervalCount.put(tag, new HashMap<Integer, Integer>()); } if (!hashtagIntervalCount.get(tag).containsKey(intervalIndex)) { hashtagIntervalCount.get(tag).put(intervalIndex, 0); } hashtagIntervalCount.get(tag).put(intervalIndex, hashtagIntervalCount.get(tag).get(intervalIndex) + 1); } } } return hashtagIntervalCount; } /** * Compute hashtag count sum for each hashtag over all intervals. * * @param hashtagIntervalCount * @return */ private static HashMap<Integer, Double> computeHashtagIntervalCountSum( HashMap<Integer, HashMap<Integer, Integer>> hashtagIntervalCount, ArrayList<Duration> binList) { HashMap<Integer, Double> hashtagAllIntervalCountSum = new HashMap<Integer, Double>(); for (Integer hashtag : hashtagIntervalCount.keySet()) { HashMap<Integer, Integer> intervalCount = hashtagIntervalCount.get(hashtag); if (intervalCount != null) { double denominatorFactor = computeSumHashtagAllIntervalCount(intervalCount) + 0.01 * binList.size(); if (!hashtagAllIntervalCountSum.containsKey(hashtag)) { hashtagAllIntervalCountSum.put(hashtag, denominatorFactor); } } } return hashtagAllIntervalCountSum; } /** * Compute the sum of count for a hashtag over all the intervals. * * @param intervalCount * @return */ private static int computeSumHashtagAllIntervalCount(HashMap<Integer, Integer> intervalCount) { int sumOverInterval = 0; for (int interval : intervalCount.keySet()) { sumOverInterval += intervalCount.get(interval); } return sumOverInterval; } /** * Compute the probability score of a user within a given interval. * * @param hashtagIntervalCount * @param hashtagAllIntervalSumCount * @return */ private static HashMap<Integer, HashMap<Integer, Double>> computeHashtagIntervalProbabilityScores( HashMap<Integer, HashMap<Integer, Integer>> hashtagIntervalCount, HashMap<Integer, Double> hashtagAllIntervalSumCount, int numberOfInterval) { HashMap<Integer, HashMap<Integer, Double>> hashtagIntervalProbabilityScores = new HashMap<Integer, HashMap<Integer, Double>>(); for (Integer tag : hashtagIntervalCount.keySet()) { if (hashtagIntervalCount.get(tag) != null) { for (Integer interval : hashtagIntervalCount.get(tag).keySet()) { double probabilityScore = (hashtagIntervalCount.get(tag).get(interval) + 0.01) / ((hashtagAllIntervalSumCount.get(tag) + (0.01) * numberOfInterval )); if (!hashtagIntervalProbabilityScores.containsKey(tag)) { hashtagIntervalProbabilityScores.put(tag, new HashMap<Integer, Double>()); } /*if (!hashtagIntervalProbabilityScores.get(tag).containsKey(interval)) { hashtagIntervalProbabilityScores.get(tag).put(interval, 0.0); }*/ hashtagIntervalProbabilityScores.get(tag).put(interval, probabilityScore); } } } return hashtagIntervalProbabilityScores; } /** * Compute and get the entropy score for all the hashtags. The entropy score * is for a particular hashtag. * * @param hashtagIntervalProbabilityScores * @return */ private static HashMap<Integer, Double> computeHashtagEntropyValue( HashMap<Integer, HashMap<Integer, Double>> hashtagIntervalProbabilityScores, int numberOfIntervals) { HashMap<Integer, Double> hashtagEntropyScoreMap = new HashMap<Integer, Double>(); for (Integer hashtag : hashtagIntervalProbabilityScores.keySet()) { double hashtagEntropyNumerator = 0.0; if (hashtagIntervalProbabilityScores.get(hashtag) != null) { for (Integer interval : hashtagIntervalProbabilityScores.get(hashtag).keySet()) { if (hashtagIntervalProbabilityScores.get(hashtag).get(interval) != null) { hashtagEntropyNumerator += hashtagIntervalProbabilityScores.get(hashtag).get(interval) * MathUtils.log2(hashtagIntervalProbabilityScores.get(hashtag).get(interval)); } } } double hashtagEntropyDenominator = MathUtils.log2(numberOfIntervals); double hashtagEntropyScore = - (hashtagEntropyNumerator) / (hashtagEntropyDenominator); hashtagEntropyScoreMap.put(hashtag, hashtagEntropyScore); } //System.out.println(" >> hashtagScore >> " + hashtagEntropyScoreMap); return hashtagEntropyScoreMap; } /** * Serialize the entropy value for the dataset. * @param hashtagEntropyMap * @param filePath */ public static void serializeHashtagEntropy(Map<Integer, Double> hashtagEntropyMap, String filePath) { OutputStream file = null; try { file = new FileOutputStream(filePath); OutputStream buffer = new BufferedOutputStream(file); ObjectOutput output = new ObjectOutputStream(buffer); output.writeObject(hashtagEntropyMap); output.close(); } catch (Exception e) { e.printStackTrace(); } } /** * Deserialize the Entropy Map. * @param filePath * @return */ public static Map<Integer, Double> deSerializeHashtagEntropy(String filePath) { InputStream file = null; Map<Integer, Double> hashtagEntropyMap = null; try { file = new FileInputStream(filePath); InputStream buffer = new BufferedInputStream(file); ObjectInput input = new ObjectInputStream(buffer); hashtagEntropyMap = (Map<Integer, Double>) input.readObject(); input.close(); } catch (Exception e) { e.printStackTrace(); } return hashtagEntropyMap; } /** * Get the index of the time interval in which timestamp falls. * * @param timestamp * @param intervalList * @return */ private static int getIntervalIndex(long timestamp, ArrayList<Duration> intervalList) { int intervalIndex = -1; for (Duration duration : intervalList) { if (timestamp >= duration.getStartTime() && timestamp <= duration.getEndTime()) { intervalIndex = intervalList.indexOf(duration); } else { continue; } } if (intervalIndex == -1) { throw new RuntimeException(); } else { return intervalIndex; } } /** * * @author spujari * */ private static class Duration { private long startTime; private long endTime; public long getStartTime() { return startTime; } public void setStartTime(long startTime) { this.startTime = startTime; } public long getEndTime() { return endTime; } public void setEndTime(long endTime) { this.endTime = endTime; } public double getDuration() { return (endTime - startTime); } } }