package com.maalaang.omtwitter.tools; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.util.HashSet; import java.util.Properties; import java.util.Set; import org.apache.log4j.Logger; import com.maalaang.omtwitter.corpus.TwitterCorpusConstructor; import com.maalaang.omtwitter.io.OMTwitterCorpusFile; import com.maalaang.omtwitter.io.OMTwitterCorpusFileReader; import com.maalaang.omtwitter.io.OMTwitterCorpusFileWriter; import com.maalaang.omtwitter.io.OMTwitterReader; import com.maalaang.omtwitter.model.OMTweet; import com.maalaang.omtwitter.text.FilterCosineSimilarity; import com.maalaang.omtwitter.text.FilterTweetId; import com.maalaang.omtwitter.text.FilterUserName; import com.maalaang.omtwitter.text.TweetFilterPipeline; public class CrawlTweetsBySearch { private Logger logger = null; public static void main(String[] args) { CrawlTweetsBySearch crawler = new CrawlTweetsBySearch(); try { crawler.run(args[0]); } catch (Exception e) { e.printStackTrace(); } } public CrawlTweetsBySearch() { logger = Logger.getLogger(getClass()); } public void run(String propFile) throws UnsupportedEncodingException, FileNotFoundException, IOException, NumberFormatException, InterruptedException { Properties prop = new Properties(); prop.load(new InputStreamReader(new FileInputStream(propFile), "UTF-8")); String entityNames = prop.getProperty("raw.corpus.search.query.entity"); String props = prop.getProperty("raw.corpus.search.query.property"); Set<String> querySet = searchQuery(entityNames.split(";"), props.split(";")); logger.info("start to crawl tweets from Twitter with " + querySet.size() + " queries: " + prop.getProperty("raw.corpus.search.file")); // search TwitterCorpusConstructor tcc = new TwitterCorpusConstructor(); tcc.constructCorpusBySearch(querySet, prop.getProperty("raw.corpus.search.file"), Integer.parseInt(prop.getProperty("raw.corpus.search.rpp")), Integer.parseInt(prop.getProperty("raw.corpus.search.max")), prop.getProperty("raw.corpus.search.lang", null), Integer.parseInt(prop.getProperty("raw.corpus.search.interval")), Integer.parseInt(prop.getProperty("raw.corpus.search.retry.num")), Integer.parseInt(prop.getProperty("raw.corpus.search.retry.interval"))); logger.info("start to filter collected tweets - " + prop.getProperty("raw.corpus.search.file.filtered")); // filtering int[] searchCorpusFields = new int[] { OMTwitterCorpusFile.FIELD_ID, OMTwitterCorpusFile.FIELD_AUTHOR, OMTwitterCorpusFile.FIELD_DATE, OMTwitterCorpusFile.FIELD_QUERY, OMTwitterCorpusFile.FIELD_TEXT }; OMTwitterReader searchCorpusReader = new OMTwitterCorpusFileReader(prop.getProperty("raw.corpus.search.file"), searchCorpusFields); TweetFilterPipeline searchCorpusFilterPipe = new TweetFilterPipeline(); searchCorpusFilterPipe.add(new FilterTweetId()); searchCorpusFilterPipe.add(new FilterUserName(Integer.parseInt(prop.getProperty("raw.corpus.search.filter.user.name.window.size")), Integer.parseInt(prop.getProperty("raw.corpus.search.filter.user.name.post.limit")))); searchCorpusFilterPipe.add(new FilterCosineSimilarity(Integer.parseInt(prop.getProperty("raw.corpus.search.filter.cosine.similarity.window.size")), Double.parseDouble(prop.getProperty("raw.corpus.search.filter.cosine.similarity.threshold")))); searchCorpusFilterPipe.initialize(); OMTwitterCorpusFileWriter corpusWriter = new OMTwitterCorpusFileWriter(prop.getProperty("raw.corpus.search.file.filtered"), searchCorpusFields); int tweetTotalCnt = 0; int tweetWriteCnt = 0; while (searchCorpusReader.hasNext()) { OMTweet tweet = searchCorpusReader.next(); tweetTotalCnt++; if (searchCorpusFilterPipe.check(tweet)) { corpusWriter.write(tweet); tweetWriteCnt++; } } searchCorpusFilterPipe.close(); searchCorpusReader.close(); corpusWriter.close(); logger.info("total " + tweetTotalCnt + " tweets, " + tweetWriteCnt + " tweets were written, " + (tweetTotalCnt - tweetWriteCnt) + " tweets were filtered out"); } private Set<String> searchQuery(String[] entityNames, String[] props) { HashSet<String> set = new HashSet<String>(); for (String e : entityNames) { for (String p : props) { if (set.add(e + " " + p)) { logger.info("created query - " + e + " " + p); } set.add(e); } } return set; } }