package com.maalaang.omtwitter.tools;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.List;
import java.util.Properties;
import org.apache.log4j.Logger;
import com.maalaang.omtwitter.io.OMTwitterCorpusFile;
import com.maalaang.omtwitter.io.OMTwitterCorpusFileReader;
import com.maalaang.omtwitter.io.OMTwitterCorpusFileWriter;
import com.maalaang.omtwitter.model.OMTweet;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
/**
 * Command-line tool that reads a raw tweet corpus, tokenizes each tweet's text
 * with the Stanford MaxentTagger tokenizer, and writes the tokenized corpus
 * back out in the same field layout.
 *
 * <p>Configuration comes from a UTF-8 properties file with the keys:
 * {@code raw.corpus.file}, {@code raw.corpus.fields},
 * {@code raw.corpus.file.tokenized}.
 */
public class TokenizeTwitterRawCorpus {

	/** Per-instance logger, following the original design. */
	private Logger logger = null;

	/**
	 * Entry point.
	 *
	 * @param args args[0] is the path to the properties file; any failure is
	 *             printed to stderr rather than propagated.
	 */
	public static void main(String[] args) {
		// Guard the args[0] access: the original threw an uninformative
		// ArrayIndexOutOfBoundsException when run without arguments.
		if (args.length < 1) {
			System.err.println("usage: TokenizeTwitterRawCorpus <property-file>");
			return;
		}
		TokenizeTwitterRawCorpus tokenizer = new TokenizeTwitterRawCorpus();
		try {
			tokenizer.run(args[0]);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	public TokenizeTwitterRawCorpus() {
		logger = Logger.getLogger(getClass());
	}

	/**
	 * Runs the tokenization pass: loads the properties file, streams tweets
	 * from the raw corpus, tokenizes each tweet's text, and writes the result
	 * to the tokenized-corpus file.
	 *
	 * @param propFile path to a UTF-8 properties file (see class doc for keys)
	 * @throws IOException if any file cannot be read or written
	 */
	public void run(String propFile) throws UnsupportedEncodingException, FileNotFoundException, IOException, NumberFormatException, InterruptedException {
		Properties prop = new Properties();
		// Close the properties stream explicitly; the original leaked it.
		InputStreamReader propReader = new InputStreamReader(new FileInputStream(propFile), "UTF-8");
		try {
			prop.load(propReader);
		} finally {
			propReader.close();
		}

		logger.info("tokenize tweets - " + prop.getProperty("raw.corpus.file"));

		int fields[] = OMTwitterCorpusFile.fieldNameToId(prop.getProperty("raw.corpus.fields"), " ");

		// try/finally (not try-with-resources) because these project types are
		// not known to implement AutoCloseable; the original leaked both
		// streams if an exception escaped the loop.
		OMTwitterCorpusFileReader corpusReader = null;
		OMTwitterCorpusFileWriter corpusWriter = null;
		int tweetTotalCnt = 0;
		try {
			corpusReader = new OMTwitterCorpusFileReader(prop.getProperty("raw.corpus.file"), fields);
			corpusWriter = new OMTwitterCorpusFileWriter(prop.getProperty("raw.corpus.file.tokenized"), fields);
			while (corpusReader.hasNext()) {
				OMTweet tweet = corpusReader.next();
				tweet.setText(tokenizeAndConcatText(tweet.getText(), " "));
				corpusWriter.write(tweet);
				tweetTotalCnt++;
			}
		} finally {
			if (corpusReader != null) {
				corpusReader.close();
			}
			if (corpusWriter != null) {
				corpusWriter.close();
			}
		}

		logger.info("total " + tweetTotalCnt + " tweets were written - " + prop.getProperty("raw.corpus.file.tokenized"));
	}

	/**
	 * Tokenizes {@code text} with the Stanford tokenizer and joins all tokens
	 * (across all detected sentences) with the separator {@code s}.
	 *
	 * <p>Escaped slashes ({@code \/}) produced by the tokenizer are unescaped
	 * back to plain {@code /}.
	 *
	 * @param text raw tweet text
	 * @param s    token separator
	 * @return the separator-joined token string, or {@code ""} if the
	 *         tokenizer produced no tokens (the original NPE'd in that case)
	 */
	private static String tokenizeAndConcatText(String text, String s) {
		List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(text));
		// StringBuilder over StringBuffer: no synchronization needed here.
		// Initialized eagerly so empty input yields "" instead of an NPE.
		StringBuilder sb = new StringBuilder();
		for (List<HasWord> sentence : sentences) {
			for (HasWord word : sentence) {
				if (sb.length() > 0) {
					sb.append(s);
				}
				sb.append(word.word());
			}
		}
		return sb.toString().replaceAll("\\\\/", "/").trim();
	}
}