package com.maalaang.omtwitter.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import com.maalaang.omtwitter.io.CollectionTextReader;
import com.maalaang.omtwitter.io.LogSystemStream;
import com.maalaang.omtwitter.io.OMTwitterCorpusFile;
import com.maalaang.omtwitter.io.OMTwitterCorpusFileReader;
import com.maalaang.omtwitter.io.OMTwitterReader;
import com.maalaang.omtwitter.model.OMTweet;
import com.maalaang.omtwitter.text.FilterCosineSimilarity;
import com.maalaang.omtwitter.text.FilterDomainRelevance;
import com.maalaang.omtwitter.text.FilterHashtagUsage;
import com.maalaang.omtwitter.text.FilterStopword;
import com.maalaang.omtwitter.text.FilterUserName;
import com.maalaang.omtwitter.text.TweetFilterPipeline;
import com.maalaang.omtwitter.text.WordPattern;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

/**
 * Constructs a named-entity-annotated Twitter corpus. Tweets from a "search"
 * corpus and a "sample" corpus are passed through filter pipelines, POS-tagged
 * with the Stanford MaxentTagger, NE-tagged by dictionary lookup, and written
 * one tweet per line to the output file given by the "ne.corpus.file" property.
 */
public class ConstructTwitterNamedEntityCorpus {

    private MaxentTagger tagger = null;
    private BufferedWriter bw = null;
    private Properties prop = null;
    private Logger logger = null;

    /** Maps a normalized surface value to the property name it belongs to. */
    private Map<String, String> valueToPropertyMap = null;

    /** Maps a property name to the NE label used in the output corpus. */
    private Map<String, String> propertyToLabelMap = null;

    /** Minimum and maximum number of tokens a dictionary value may span. */
    private int valueMinToken = 0;
    private int valueMaxToken = 0;

    /** Label written for tokens that are not part of any named entity. */
    private String nonelabel = null;

    /** Ratio of sample-corpus tweets to merge in, relative to the search-corpus count. */
    private double mergeRate = 0.0;

    private Set<String> searchQuerySet = null;
    private String searchQueryToProperty = null;

    public static void main(String[] args) {
        try {
            Properties prop = new Properties();
            prop.load(new InputStreamReader(new FileInputStream(args[0]), "UTF-8"));
            LogSystemStream.redirectErrToLog(Level.ERROR);

            ConstructTwitterNamedEntityCorpus con = new ConstructTwitterNamedEntityCorpus(prop);
            con.run();
            con.close(); // release the output writer once both corpora have been processed
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public ConstructTwitterNamedEntityCorpus(Properties prop) throws ClassNotFoundException, IOException {
        this.prop = prop;
        this.tagger = new MaxentTagger(MaxentTagger.DEFAULT_JAR_PATH);
        this.bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(prop.getProperty("ne.corpus.file")), "UTF-8"));
        this.logger = Logger.getLogger(getClass());
        this.valueToPropertyMap = valueToPropertyMap(prop.getProperty("value.to.property.map.file"));
        this.propertyToLabelMap = CollectionTextReader.readMapStringString(prop.getProperty("ne.corpus.property.label.map.file"));
        this.valueMinToken = Integer.parseInt(prop.getProperty("value.token.min"));
        this.valueMaxToken = Integer.parseInt(prop.getProperty("value.token.max"));
        this.nonelabel = prop.getProperty("ne.corpus.label.none");
        this.mergeRate = Double.parseDouble(prop.getProperty("ne.corpus.merge.rate"));
        this.searchQuerySet = searchQuerySet(prop.getProperty("raw.corpus.search.query.map.file"));
        this.searchQueryToProperty = prop.getProperty("raw.corpus.search.query.to.property");
    }
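    /**
     * Processes the two raw corpora in sequence: the search corpus first, then the
     * sample corpus. Each corpus goes through its own TweetFilterPipeline (user-name,
     * cosine-similarity and domain-relevance filters for both, plus hashtag-usage for
     * the search corpus and stopword filtering for the sample corpus, all configured
     * from the properties file), and every tweet that passes is written with POS and
     * NE annotations. The number of sample-corpus tweets merged in is capped at
     * mergeRate times the number of tweets written from the search corpus.
     */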
    public void run() throws IOException {
        Map<String, Double> wrsMap = CollectionTextReader.readMapStringDouble(prop.getProperty("word.relevance.score.file"));
        Set<String> stopwords = CollectionTextReader.readSetString(prop.getProperty("stopword.set.file"));

        // search corpus
        int[] searchCorpusFields = new int[] {
                OMTwitterCorpusFile.FIELD_ID, OMTwitterCorpusFile.FIELD_AUTHOR, OMTwitterCorpusFile.FIELD_DATE,
                OMTwitterCorpusFile.FIELD_QUERY, OMTwitterCorpusFile.FIELD_TEXT };
        OMTwitterReader searchCorpusReader = new OMTwitterCorpusFileReader(prop.getProperty("raw.corpus.search.file"), searchCorpusFields);

        TweetFilterPipeline searchCorpusFilterPipe = new TweetFilterPipeline();
        searchCorpusFilterPipe.add(new FilterUserName(
                Integer.parseInt(prop.getProperty("raw.corpus.search.filter.user.name.window.size")),
                Integer.parseInt(prop.getProperty("raw.corpus.search.filter.user.name.post.limit"))));
        searchCorpusFilterPipe.add(new FilterHashtagUsage());
        searchCorpusFilterPipe.add(new FilterCosineSimilarity(
                Integer.parseInt(prop.getProperty("raw.corpus.search.filter.cosine.similarity.window.size")),
                Double.parseDouble(prop.getProperty("raw.corpus.search.filter.cosine.similarity.threshold"))));
        searchCorpusFilterPipe.add(new FilterDomainRelevance(wrsMap, stopwords,
                Double.parseDouble(prop.getProperty("raw.corpus.search.filter.domain.relevance.relevance.factor")),
                Integer.parseInt(prop.getProperty("raw.corpus.search.filter.domain.relevance.window.size")),
                Double.parseDouble(prop.getProperty("raw.corpus.search.filter.domain.relevance.start.window.score"))));
        searchCorpusFilterPipe.initialize();

        int searchCorpusWriteCnt = 0;
        while (searchCorpusReader.hasNext()) {
            OMTweet tweet = searchCorpusReader.next();
            if (searchCorpusFilterPipe.check(tweet)) {
                writeNEAnnotatedTweet(tweet);
                searchCorpusWriteCnt++;
            }
        }
        searchCorpusFilterPipe.close();
        searchCorpusReader.close();

        // sample corpus
        int[] sampleCorpusFields = new int[] {
                OMTwitterCorpusFile.FIELD_ID, OMTwitterCorpusFile.FIELD_AUTHOR, OMTwitterCorpusFile.FIELD_DATE,
                OMTwitterCorpusFile.FIELD_TEXT };
        OMTwitterReader sampleCorpusReader = new OMTwitterCorpusFileReader(prop.getProperty("raw.corpus.sample.file"), sampleCorpusFields);

        TweetFilterPipeline sampleCorpusFilterPipe = new TweetFilterPipeline();
        sampleCorpusFilterPipe.add(new FilterUserName(
                Integer.parseInt(prop.getProperty("raw.corpus.sample.filter.user.name.window.size")),
                Integer.parseInt(prop.getProperty("raw.corpus.sample.filter.user.name.post.limit"))));
        sampleCorpusFilterPipe.add(new FilterStopword(stopwords,
                Integer.parseInt(prop.getProperty("raw.corpus.sample.filter.stopword.threshold"))));
        sampleCorpusFilterPipe.add(new FilterCosineSimilarity(
                Integer.parseInt(prop.getProperty("raw.corpus.sample.filter.cosine.similarity.window.size")),
                Double.parseDouble(prop.getProperty("raw.corpus.sample.filter.cosine.similarity.threshold"))));
        sampleCorpusFilterPipe.add(new FilterDomainRelevance(wrsMap, stopwords,
                Double.parseDouble(prop.getProperty("raw.corpus.sample.filter.domain.relevance.relevance.factor")),
                Integer.parseInt(prop.getProperty("raw.corpus.sample.filter.domain.relevance.window.size")),
                Double.parseDouble(prop.getProperty("raw.corpus.sample.filter.domain.relevance.start.window.score")),
                true));
        sampleCorpusFilterPipe.initialize();

        int sampleCorpusWriteCnt = 0;
        int sampleCorpusWriteLimit = (int) (searchCorpusWriteCnt * mergeRate);
        while (sampleCorpusReader.hasNext()) {
            OMTweet tweet = sampleCorpusReader.next();
            if (sampleCorpusFilterPipe.check(tweet)) {
                writeNEAnnotatedTweet(tweet);
                if (++sampleCorpusWriteCnt > sampleCorpusWriteLimit) {
                    break;
                }
            }
        }
        sampleCorpusFilterPipe.close();
        sampleCorpusReader.close();
    }
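    /**
     * Writes one tweet as a single line of tab-separated fields:
     * id, author, date, query (or the empty-field marker), followed by the
     * space-separated tokens in the form word/POS/NE.
     *
     * POS tags come from the MaxentTagger; NE tags from tagNESentence(). An
     * illustrative line (ids, dates and labels depend on the input data and the
     * configured label map) might look like:
     *
     *   12345   someuser   2011-05-01   ipad   the/DT/NONE new/JJ/NONE ipad/NN/PRODUCT_B rocks/VBZ/NONE
     */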
    private void writeNEAnnotatedTweet(OMTweet tweet) throws IOException {
        String text = tweet.getText();
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(text));
        int wordIdx = 0;
        String query = null;

        bw.write(tweet.getId());
        bw.write('\t');
        bw.write(tweet.getAuthor());
        bw.write('\t');
        bw.write(tweet.getDateString());
        bw.write('\t');
        if ((query = tweet.getQuery()) != null) {
            bw.write(query);
            bw.write('\t');
        } else {
            bw.write(OMTwitterCorpusFile.FIELD_EMPTY_STR);
            bw.write('\t');
        }

        for (List<HasWord> sentence : sentences) {
            ArrayList<TaggedWord> taggedSentence = tagger.tagSentence(sentence);

            // Restore each token's original surface form to undo the PTBLexer's
            // character escaping; this could be improved.
            for (TaggedWord word : taggedSentence) {
                word.setWord(text.substring(word.beginPosition(), word.endPosition()));
            }

            String[] neTags = tagNESentence(taggedSentence);
            int neTagsIdx = 0;
            for (TaggedWord word : taggedSentence) {
                if (wordIdx++ > 0) {
                    bw.write(' ');
                }
                bw.write(word.word());
                bw.write('/');
                bw.write(word.tag());
                bw.write('/');
                bw.write(neTags[neTagsIdx]);
                neTagsIdx++;
            }
        }
        bw.write('\n');
        bw.flush();
    }

    /**
     * Assigns an NE tag to every token of a POS-tagged sentence by greedy
     * longest-match lookup of normalized token n-grams (between valueMinToken and
     * valueMaxToken tokens long) in the search-query set and the value-to-property
     * map. Matched spans receive the mapped label plus a _B/_M/_E position suffix;
     * all other tokens get the "none" label.
     */
    private String[] tagNESentence(List<TaggedWord> sent) {
        String[] tags = new String[sent.size()];
        String[] normWords = new String[sent.size()];

        int idx = 0;
        for (TaggedWord word : sent) {
            normWords[idx++] = WordPattern.normalize(word.word());
        }

        int start;
        int end;
        int endMin;
        int endMax;
        String key = null;
        String tag = null;

        for (int i = 0; i < tags.length; i++) {
            start = i;

            // Not enough tokens left for even the shortest value: label the rest as "none".
            if ((endMin = i + valueMinToken) > tags.length) {
                for (int k = start; k < tags.length; k++) {
                    tags[k] = nonelabel;
                }
                break;
            }
            if ((endMax = i + valueMaxToken) > tags.length) {
                endMax = tags.length;
            }

            // Try the longest candidate span first and shrink it until a match is found.
            for (end = endMax; end >= endMin; end--) {
                for (int j = start; j < end; j++) {
                    if (j == start) {
                        key = normWords[j];
                    } else {
                        key += " " + normWords[j];
                    }
                }
                if (searchQueryToProperty != null && searchQuerySet.contains(key)) {
                    tag = searchQueryToProperty;
                    logger.debug(key + " > " + searchQueryToProperty);
                    break;
                } else if ((tag = valueToPropertyMap.get(key)) != null) {
                    logger.debug(key + " > " + tag);
                    break;
                }
            }

            if (tag == null) {
                tags[i] = nonelabel;
            } else {
                tag = propertyToLabelMap.get(tag);
                int k = start;
                tags[k++] = tag + "_B";
                for (; k < end - 1; k++) {
                    tags[k] = tag + "_M";
                }
                if (k == end - 1) {
                    tags[k] = tag + "_E";
                }
                i = end - 1; // skip past the matched span
            }
        }
        return tags;
    }

    /**
     * Loads the search queries (first tab-separated column of the query map file)
     * and adds several variants of each: with double quotes removed and the first
     * '#' stripped, with Roman numerals converted to Arabic, and re-tokenized
     * forms of both.
     */
    private Set<String> searchQuerySet(String file) throws IOException {
        HashSet<String> set = new HashSet<String>();
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
        String line = null;
        while ((line = br.readLine()) != null) {
            String[] tokens = line.split("\t");
            String w = tokens[0].replaceAll("\"", "");
            String w1 = WordPattern.replaceRomanToArabic(w);
            set.add(w.replaceFirst("#", ""));
            set.add(w1.replaceFirst("#", ""));
            w = tokenizeAndConcatText(w, " ");
            w1 = tokenizeAndConcatText(w1, " ");
            set.add(w.replaceFirst("#", ""));
            set.add(w1.replaceFirst("#", ""));
        }
        br.close();

        for (String s : set) {
            System.out.println(s);
        }
        return set;
    }

    /**
     * Loads the value-to-property map from a tab-separated file (value, then
     * property). Each value is also added in its re-tokenized form so that lookups
     * match the tokenizer's output.
     */
    private static Map<String, String> valueToPropertyMap(String file) throws IOException {
        HashMap<String, String> map = new HashMap<String, String>();
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
        String line = null;
        while ((line = br.readLine()) != null) {
            String[] tokens = line.split("\t");
            map.put(tokens[0].trim(), tokens[1].trim());
            map.put(tokenizeAndConcatText(tokens[0].trim(), " "), tokens[1].trim());
            System.out.println(tokens[0].trim());
            System.out.println(tokenizeAndConcatText(tokens[0].trim(), " "));
        }
        br.close();
        return map;
    }
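    /**
     * Tokenizes a short text with the tagger's tokenizer and joins the tokens back
     * together with the given separator, so that dictionary entries line up with
     * the tokenization applied to the tweets.
     */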
    private static String tokenizeAndConcatText(String text, String s) {
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(text));
        StringBuilder sb = new StringBuilder();
        for (List<HasWord> sentence : sentences) {
            for (HasWord word : sentence) {
                if (sb.length() > 0) {
                    sb.append(s);
                }
                sb.append(word.word());
            }
        }
        // PTBLexer escapes '/' as '\/'; undo that before returning.
        return sb.toString().replaceAll("\\\\/", "/").trim();
    }

    public void close() {
        try {
            bw.close();
        } catch (IOException e) {
            logger.error(e);
        }
    }
}
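/*
 * A minimal sketch of the properties file expected as args[0], assuming plain
 * key=value java.util.Properties syntax. Only the keys are taken from the code
 * above; every path and number below is an illustrative placeholder, not a
 * recommended setting.
 *
 *   ne.corpus.file = ne-corpus.txt
 *   ne.corpus.property.label.map.file = property-label-map.txt
 *   ne.corpus.label.none = NONE
 *   ne.corpus.merge.rate = 0.1
 *   value.to.property.map.file = value-property-map.txt
 *   value.token.min = 1
 *   value.token.max = 5
 *   word.relevance.score.file = word-relevance-scores.txt
 *   stopword.set.file = stopwords.txt
 *   raw.corpus.search.file = raw-search-corpus.txt
 *   raw.corpus.search.query.map.file = search-query-map.txt
 *   raw.corpus.search.query.to.property = product
 *   raw.corpus.search.filter.user.name.window.size = 1000
 *   raw.corpus.search.filter.user.name.post.limit = 3
 *   raw.corpus.search.filter.cosine.similarity.window.size = 1000
 *   raw.corpus.search.filter.cosine.similarity.threshold = 0.8
 *   raw.corpus.search.filter.domain.relevance.relevance.factor = 1.0
 *   raw.corpus.search.filter.domain.relevance.window.size = 1000
 *   raw.corpus.search.filter.domain.relevance.start.window.score = 0.5
 *   raw.corpus.sample.file = raw-sample-corpus.txt
 *   raw.corpus.sample.filter.user.name.window.size = 1000
 *   raw.corpus.sample.filter.user.name.post.limit = 3
 *   raw.corpus.sample.filter.stopword.threshold = 2
 *   raw.corpus.sample.filter.cosine.similarity.window.size = 1000
 *   raw.corpus.sample.filter.cosine.similarity.threshold = 0.8
 *   raw.corpus.sample.filter.domain.relevance.relevance.factor = 1.0
 *   raw.corpus.sample.filter.domain.relevance.window.size = 1000
 *   raw.corpus.sample.filter.domain.relevance.start.window.score = 0.5
 */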