package com.maalaang.omtwitter.tools; import java.io.FileInputStream; import java.io.InputStreamReader; import java.util.Map; import java.util.Properties; import java.util.Set; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import com.maalaang.omtwitter.corpus.TextWriteStatusListener; import com.maalaang.omtwitter.corpus.TwitterCorpusConstructor; import com.maalaang.omtwitter.corpus.TwitterCorpusStat; import com.maalaang.omtwitter.corpus.TwitterQueryGenerator; import com.maalaang.omtwitter.io.CollectionTextReader; import com.maalaang.omtwitter.io.CollectionTextWriter; import com.maalaang.omtwitter.io.OMTwitterCorpusFile; public class ConstructTwitterRawCorpus { public static void main(String[] args) { try { Properties prop = new Properties(); prop.load(new InputStreamReader(new FileInputStream(args[0]), "UTF-8")); // search Model domainOntologyModel = ModelFactory.createDefaultModel(); InputStreamReader isr = new InputStreamReader(new FileInputStream(prop.getProperty("domain.ontology.file")), "UTF-8"); domainOntologyModel.read(isr, null, "N-TRIPLE"); isr.close(); Set<String> stopwords = CollectionTextReader.readSetString(prop.getProperty("stopword.set.file")); Map<String,Set<String>> queryToResourceMap = TwitterQueryGenerator.generateQueries(domainOntologyModel, stopwords, prop.getProperty("query.gen.resource.label.lang"), Integer.parseInt(prop.getProperty("param.query.min.length"))); CollectionTextWriter.writeMapStringSetString(queryToResourceMap, prop.getProperty("raw.corpus.search.query.map.file"), true); TwitterCorpusConstructor tcc = new TwitterCorpusConstructor(); tcc.constructCorpusBySearch(queryToResourceMap.keySet(), prop.getProperty("raw.corpus.search.file"), Integer.parseInt(prop.getProperty("raw.corpus.search.rpp")), Integer.parseInt(prop.getProperty("raw.corpus.search.max")), prop.getProperty("raw.corpus.search.lang", null), Integer.parseInt(prop.getProperty("raw.corpus.search.interval")), Integer.parseInt(prop.getProperty("raw.corpus.search.retry.num")), Integer.parseInt(prop.getProperty("raw.corpus.search.retry.interval"))); queryToResourceMap = null; int[] searchCorpusFields = new int[] { OMTwitterCorpusFile.FIELD_ID, OMTwitterCorpusFile.FIELD_AUTHOR, OMTwitterCorpusFile.FIELD_DATE, OMTwitterCorpusFile.FIELD_QUERY, OMTwitterCorpusFile.FIELD_TEXT }; Map<String,Integer> searchUserStatusFreqMap = TwitterCorpusStat.userStatusFreq(prop.getProperty("raw.corpus.search.file"), "\\s+", searchCorpusFields); CollectionTextWriter.writeMapStringInteger(searchUserStatusFreqMap, prop.getProperty("raw.corpus.search.user.freq.file"), true); // sample stream tcc.openTwitterSampleStream(new TextWriteStatusListener(prop.getProperty("raw.corpus.sample.file"), prop.getProperty("raw.corpus.sample.lang")), 60*60*24); int[] sampleCorpusFields = new int[] { OMTwitterCorpusFile.FIELD_ID, OMTwitterCorpusFile.FIELD_AUTHOR, OMTwitterCorpusFile.FIELD_DATE, OMTwitterCorpusFile.FIELD_TEXT }; Map<String,Integer> sampleUserStatusFreqMap = TwitterCorpusStat.userStatusFreq(prop.getProperty("raw.corpus.sample.file"), "\\s+", sampleCorpusFields); CollectionTextWriter.writeMapStringInteger(sampleUserStatusFreqMap, prop.getProperty("raw.corpus.sample.user.freq.file"), true); } catch (Exception e) { e.printStackTrace(); } } }