/** * */ package com.maalaang.omtwitter.corpus; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.OutputStreamWriter; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map.Entry; import java.util.Random; import java.util.Set; import org.apache.log4j.Logger; import twitter4j.FilterQuery; import twitter4j.HashtagEntity; import twitter4j.Query; import twitter4j.QueryResult; import twitter4j.StatusListener; import twitter4j.Tweet; import twitter4j.Twitter; import twitter4j.TwitterException; import twitter4j.TwitterFactory; import twitter4j.TwitterStream; import twitter4j.TwitterStreamFactory; import com.maalaang.omtwitter.model.OMTweet; /** * @author Sangwon Park * */ public class TwitterCorpusConstructor { private Logger logger = null; private static String FILE_DATE_FORMAT = "yyyyMMdd_HHmmss"; public TwitterCorpusConstructor() { logger = Logger.getLogger(getClass()); } /** * Construct raw corpus by searching Twitter based on the generated queries. * Fields: ID AUTHOR DATE QUERY TEXT * @param queries * @param rawCorpusFile * @param rpp * @param max * @param lang * @param interval * @param retryNum * @param retryInterval * @throws IOException * @throws InterruptedException */ public void constructCorpusBySearch(Set<String> queries, String rawCorpusFile, int rpp, int max, String lang, int interval, int retryNum, int retryInterval) throws IOException, InterruptedException { BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(rawCorpusFile), "UTF-8")); long resultTweetTotalCnt = 0; int queryTotalNum = queries.size(); int queryProcessCnt = 0; SimpleDateFormat dateFormat = new SimpleDateFormat(OMTweet.DATE_FORMAT); for (String query : queries) { List<QueryResult> res = null; int tryCount = retryNum; while (tryCount-- > 0) { try { res = searchTwitter(query, rpp, max, lang); break; } catch (TwitterException e) { logger.error(e); logger.info("failed to serach : remaining try = " + tryCount); Thread.sleep(retryInterval); } } // ID AUTHOR DATE QUERY TEXT int resultTweetCnt = 0; for (QueryResult qr : res) { List<Tweet> list = qr.getTweets(); for (Tweet t : list) { bw.write(String.valueOf(t.getId())); bw.write('\t'); bw.write(t.getFromUser().replaceAll("\\s+", " ")); bw.write('\t'); bw.write(dateFormat.format(t.getCreatedAt())); bw.write('\t'); bw.write(query); bw.write('\t'); bw.write(t.getText().replaceAll("\\s+", " ")); bw.write('\n'); resultTweetCnt++; } } resultTweetTotalCnt += resultTweetCnt; logger.info("[" + (++queryProcessCnt) + "/" + queryTotalNum + "] " + resultTweetCnt + " tweets are returned. total=" + resultTweetTotalCnt); bw.flush(); Thread.sleep(interval); } bw.close(); logger.info("done"); } public void searchTwitterCountHashtag(String query, int rpp, int max, String lang) throws TwitterException, IOException { List<QueryResult> res = searchTwitter(query, rpp, max, lang); HashMap<String, Integer> tagCountMap = new HashMap<String, Integer>(); for (QueryResult qr : res) { List<Tweet> list = qr.getTweets(); for (Tweet t : list) { HashtagEntity[] tags = t.getHashtagEntities(); if (tags != null) { for (int i = 0; i < tags.length; i++) { String tag = tags[i].getText().toLowerCase(); Integer cnt = tagCountMap.get(tag); if (cnt == null) { tagCountMap.put(tag, 0); } else { tagCountMap.put(tag, cnt + 1); } } } } } Set<Entry<String,Integer>> set = tagCountMap.entrySet(); for (Entry<String,Integer> e : set) { logger.info(String.format("%04d\t%s", e.getValue(), e.getKey())); } } public void searchTwitterWriteObject(String dir, String fileName, String query, int rpp, int max, String lang) throws TwitterException, IOException { Calendar cal = Calendar.getInstance(); SimpleDateFormat sdf = new SimpleDateFormat(FILE_DATE_FORMAT); List<QueryResult> res = searchTwitter(query, rpp, max, lang); writeQueryResultListObject(res, String.format("%s/%s_%s_%s.object", dir, fileName, sdf.format(cal.getTime()), lang)); } public void searchTwitterWriteText(String file, String query, int rpp, int max, String lang) throws TwitterException, IOException { BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8")); List<QueryResult> res = searchTwitter(query, rpp, max, lang); int i = 0; for (QueryResult qr : res) { List<Tweet> tweetList = qr.getTweets(); for (Tweet tweet : tweetList) { bw.write(query); bw.write('\t'); bw.write(tweet.getFromUser()); bw.write('\t'); bw.write(tweet.getText().replaceAll("\\s", " ").trim()); bw.write('\n'); logger.info(++i + "\t@" + tweet.getFromUser() + ": " + tweet.getText()); } } bw.close(); } public void printTweetsFromStoredResults(String file) throws IOException, ClassNotFoundException { ArrayList<QueryResult> res = loadQueryResult(file); int i = 1; for (QueryResult r : res) { List<Tweet> tweets = r.getTweets(); for (Tweet t : tweets) { logger.info(i++ + "\t@" + t.getFromUser() + ": " + t.getText()); } } } public List<QueryResult> searchTwitter(String query, int rpp, int max, String lang) throws TwitterException { logger.info("query to the twitter.com: query=" + query + " rpp=" + rpp + " max=" + max + " lang=" + lang); if (rpp < 1 || rpp > 100) { throw new TwitterException("rpp should be between 1 and 100"); } else if (max < rpp || max > 1500) { throw new TwitterException("max should be between rpp <= max <= 1500"); } Twitter twitter = new TwitterFactory().getInstance(); ArrayList<QueryResult> queryResultList = new ArrayList<QueryResult>(); Query q = new Query(); q.setQuery(query); q.setRpp(rpp); if (lang != null) { q.setLang(lang); } QueryResult result = twitter.search(q); queryResultList.add(result); int cnt = result.getResultsPerPage(); for ( ; cnt < max; cnt += result.getResultsPerPage()) { if (result.getResultsPerPage() != q.getRpp()) { break; } q.setPage(result.getPage() + 1); q.setMaxId(result.getMaxId()); result = twitter.search(q); if (result == null) { break; } queryResultList.add(result); } return queryResultList; } private void writeQueryResultListObject(List<QueryResult> resultList, String filePath) throws IOException{ FileOutputStream fos = new FileOutputStream(filePath); ObjectOutputStream oos = new ObjectOutputStream(fos); oos.writeObject(resultList); oos.close(); fos.close(); logger.info("query results were stored to '" + filePath + "'"); } public ArrayList<QueryResult> loadQueryResult(String filePath) throws IOException, ClassNotFoundException{ FileInputStream fis = new FileInputStream(filePath); ObjectInputStream ois = new ObjectInputStream(fis); @SuppressWarnings("unchecked") ArrayList<QueryResult> list = (ArrayList<QueryResult>) ois.readObject(); ois.close(); fis.close(); logger.info("Query results were loaded from '" + filePath + "': " + list.size() + " results"); return list; } public void openTwitterTrackStream(List<String> track, StatusListener listener, final int seconds) { final TwitterStream twitterStream = new TwitterStreamFactory().getInstance(); twitterStream.addListener(listener); String[] trackArray = track.toArray(new String[track.size()]); if (seconds != 0) { Thread th = new Thread() { public void run() { try { Thread.sleep(1000*seconds); logger.info("shutdown twitter stream - " + seconds + " seconds passed"); twitterStream.shutdown(); } catch (InterruptedException e) { logger.error(e); } } }; th.start(); logger.info("open twitter filter stream for " + seconds + " hours"); twitterStream.filter(new FilterQuery(0, null, trackArray)); try { th.join(); } catch (InterruptedException e) { logger.error(e); } } else { logger.info("open twitter filter stream"); twitterStream.filter(new FilterQuery(0, null, trackArray)); } } public void openTwitterSampleStream(StatusListener listener, final int seconds) { final TwitterStream twitterStream = new TwitterStreamFactory().getInstance(); twitterStream.addListener(listener); if (seconds != 0) { Thread th = new Thread() { public void run() { try { Thread.sleep(1000*seconds); logger.info("shutdown twitter stream - " + seconds + " seconds passed"); twitterStream.shutdown(); } catch (InterruptedException e) { logger.error(e); } } }; th.start(); logger.info("open twitter sample stream for " + seconds + " hours"); twitterStream.sample(); try { th.join(); } catch (InterruptedException e) { logger.error(e); } } else { logger.info("open twitter sample stream"); twitterStream.sample(); } } public static int[] resizeToLimitRandom(int corpusSize, int limit, long seed) { Random random = new Random(seed); Set<Integer> id = new HashSet<Integer>(); int cnt = 0; int rand = 0; if (cnt < limit) { rand = random.nextInt(corpusSize); if (!id.contains(rand)) { cnt++; id.add(rand); } } ArrayList<Integer> list = new ArrayList<Integer>(id); Collections.sort(list); int[] chosen = new int[list.size()]; int idx = 0; for (Integer i : list) { chosen[idx++] = i; } return chosen; } }