/**
* Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.tw;
import de.jetwick.data.JTweet;
import de.jetwick.tw.cmd.StringFreqMap;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
/**
* This class tries to detect the language and stores terms of the specified tweets.
*
* Used while indexing to store the language and the terms as additional field attributes.
*
* @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net
*/
public class TweetDetector {

    public static final String MISC_TERMS = "misc";
    public static final String NUM_TERMS = "num";
    public static final String SINGLE_CHAR_TERMS = "1";
    public static final String UNKNOWN_LANG = "unknown";
    public static final String DE = "de";
    public static final String NL = "nl";
    public static final String EN = "en";
    public static final String RU = "ru";
    public static final String ES = "es";
    public static final String FR = "fr";
    public static final String PT = "pt";
    /** All real languages this detector can report (excludes the pseudo "languages" above). */
    public static final Set<String> LANGS = new LinkedHashSet<String>(Arrays.asList(DE, NL, EN, RU, ES, FR, PT));

    private Collection<JTweet> tweets;
    private int termMaxCount = 6;
    private StringFreqMap languages = new StringFreqMap(4);
    private StringFreqMap terms = new StringFreqMap(8);

    public TweetDetector(Collection<JTweet> tweets) {
        this.tweets = tweets;
    }

    public TweetDetector() {
    }

    /**
     * Strips noise (highlight markup, URLs, punctuation, leading hash signs,
     * smiley fragments) from the specified string to create symbol-free terms.
     * Removed URLs and punctuation are replaced by spaces so that term
     * boundaries are preserved for the later whitespace split.
     *
     * TODO PERFORMANCE expensive method
     *
     * @param str the raw (tweet) text; never null
     * @return the cleaned text, possibly empty
     */
    public static String stripNoiseFromWord(String str) {
        if (str.length() < 2)
            return str;

        // remove highlighting markup. This is a literal (non-regex) replacement,
        // so use replace instead of replaceAll
        str = str.replace("<b>", "");
        str = str.replace("</b>", "");
        // removing the markup can consume the whole input (e.g. for "<b>");
        // without this guard charAt(0) below throws StringIndexOutOfBoundsException
        if (str.length() == 0)
            return str;

        // ignore urls
        // urls contain all characters except spaces: [^ ] and we need this multiple times: *
        str = str.replaceAll("http[s]?://[^ ]*", " ");
        // replace punctuation with spaces (substituting, not deleting, so the
        // string cannot become empty here and charAt(0) below stays safe)
        str = str.replaceAll("[\\\"\\:\\;\\&\\.\\!\\?\\)\\(\\[\\]\\,\\>\\<\\-\\n\\t]", " ");
        str = str.replaceAll(" #", " ");
        // or at the beginning of the line
        if (str.charAt(0) == '#')
            str = str.substring(1);
        str = str.replaceAll("^#", " ");
        // smiley fragments
        str = str.replaceAll(":-", " ");
        str = str.replaceAll(";-", " ");
        return str;
    }

    /** @return the collected terms sorted by frequency, limited to {@link #setTermMaxCount(int)} entries */
    public List<Entry<String, Integer>> getSortedTerms() {
        return terms.getSortedTermLimited(termMaxCount);
    }

    public StringFreqMap getTerms() {
        return terms;
    }

    /**
     * Sets the maximum number of terms returned by {@link #getSortedTerms()}.
     *
     * @return this detector, for chaining
     */
    public TweetDetector setTermMaxCount(int tagLimit) {
        this.termMaxCount = tagLimit;
        return this;
    }

    public StringFreqMap getLanguages() {
        return languages;
    }

    /**
     * Processes all tweets handed over to the constructor, accumulating
     * language hits and term frequencies. Previously detected languages are
     * discarded; term counts accumulate across calls.
     *
     * @return this detector, for chaining
     */
    public TweetDetector run() {
        languages.clear();
        Map<String, Integer> termMap = new LinkedHashMap<String, Integer>();
        for (JTweet tweet : tweets) {
            termMap.clear();
            oneTweet(termMap, languages, tweet);
            // if one tweet has several terms 'java' increase the term only once!
            for (Entry<String, Integer> entry : termMap.entrySet()) {
                increment(terms, entry.getKey());
            }
        }
        return this;
    }

    /**
     * Processes a single text (lower-cased internally), accumulating directly
     * into the term map — unlike {@link #run()}, duplicate terms within the
     * text are counted multiple times.
     *
     * @return this detector, for chaining
     */
    public TweetDetector runOne(String text) {
        languages.clear();
        oneTweet(terms, languages, text.toLowerCase());
        return this;
    }

    private void oneTweet(Map<String, Integer> termMap, Map<String, Integer> langMap, JTweet tweet) {
        oneTweet(termMap, langMap, tweet.getText().toLowerCase());
    }

    /**
     * Tokenizes the text and updates the term and language counters.
     * Either map may be null to skip the corresponding accumulation.
     */
    private void oneTweet(Map<String, Integer> termMap, Map<String, Integer> langMap, String text) {
        // split against white space characters
        text = stripNoiseFromWord(text);
        String tmpTerms[] = text.split("\\s");
        int counter = 0;
        for (String term : tmpTerms) {
            counter++;
            // skip too short or too long terms and user references
            if (term.length() < 2 || term.length() > 70 || term.startsWith("@"))
                continue;

            Set<String> detectedLangs = JTweet.LANG_DET_WORDS.get(term);
            if (langMap != null && detectedLangs != null) {
                // skip the last term for language detection
                if (counter < tmpTerms.length) {
                    for (String lang : detectedLangs) {
                        // ignore the pseudo languages used for bookkeeping
                        if (lang.equals(TweetDetector.NUM_TERMS)
                                || lang.equals(TweetDetector.SINGLE_CHAR_TERMS)
                                || lang.equals(TweetDetector.MISC_TERMS))
                            continue;
                        increment(langMap, lang);
                    }
                }
            }

            // count only terms which are not noise words in some language
            Set<String> noiseWordLangs = JTweet.NOISE_WORDS.get(term);
            if (termMap != null && noiseWordLangs == null)
                increment(termMap, term);
        }
    }

    /** Increases the counter stored under the specified key by one, starting at 1. */
    private static void increment(Map<String, Integer> map, String key) {
        Integer old = map.get(key);
        map.put(key, old == null ? 1 : old + 1);
    }
}