package wordcloud.nlp;
import ch.lambdaj.Lambda;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import wordcloud.WordFrequency;
import wordcloud.nlp.filter.StopWordFilter;
import wordcloud.nlp.sanitize.BasicTextSanitizer;
import wordcloud.nlp.sanitize.Sanitizer;
import wordcloud.nlp.tokenizer.WhiteSpaceWordTokenizer;
import wordcloud.nlp.tokenizer.WordTokenizer;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static ch.lambdaj.Lambda.on;
import static ch.lambdaj.Lambda.sort;
/**
* Created by kenny on 7/1/14.
*/
public class FrequencyAnalizer {
private static final int MAX_LENGTH = 32;
private StopWordFilter stopWordFilter = new StopWordFilter(Collections.EMPTY_SET);
private Sanitizer sanitizer = new BasicTextSanitizer();
private WordTokenizer wordTokenizer = new WhiteSpaceWordTokenizer();
private static final int DEFAULT_WORD_FREQUENCIES_TO_RETURN = 50;
private int wordFrequencesToReturn = DEFAULT_WORD_FREQUENCIES_TO_RETURN;
private int minWordLength = 3;
public List<WordFrequency> load(InputStream fileInputStream) throws IOException {
return load(IOUtils.readLines(fileInputStream));
}
public List<WordFrequency> load(URL url) throws IOException {
final Document doc = Jsoup.parse(url, 3 * 1000);
return load(Arrays.asList(doc.body().text()));
}
public List<WordFrequency> load(final List<String> texts) {
final List<WordFrequency> wordFrequencies = new ArrayList<>();
// generate all word counts
final Map<String, Integer> cloud = calculateCloud(texts, wordTokenizer);
for(String key : cloud.keySet()) {
if(key.length() >= minWordLength
&& key.length() < MAX_LENGTH) {
wordFrequencies.add(new WordFrequency(-1, key, "", cloud.get(key)));
}
}
return takeTopFrequencies(wordFrequencies);
}
private Map<String, Integer> calculateCloud(List<String> texts, WordTokenizer tokenizer) {
final Map<String, Integer> cloud = new HashMap<>();
for(String text : texts) {
final List<String> words = Lambda.filter(stopWordFilter, tokenizer.tokenize(sanitizer.sanitize(text)));
for(String word : words) {
word = StringUtils.trimToEmpty(word).toLowerCase();
if(StringUtils.isNotBlank(word)) {
if(cloud.containsKey(word)) {
cloud.put(word, cloud.get(word) + 1);
} else {
cloud.put(word, 1);
}
}
}
}
return cloud;
}
private List<WordFrequency> takeTopFrequencies(Collection<WordFrequency> wordCloudEntities) {
final List<WordFrequency> sorted = sort(wordCloudEntities, on(WordFrequency.class).getFrequency());
Collections.reverse(sorted);
if(sorted.isEmpty()) {
return sorted;
}
return sorted.subList(0, Math.min(sorted.size(), wordFrequencesToReturn));
}
public void setStopWords(Collection<String> stopWords) {
this.stopWordFilter = new StopWordFilter(stopWords);
}
public void setWordFrequencesToReturn(int wordFrequencesToReturn) {
this.wordFrequencesToReturn = wordFrequencesToReturn;
}
public void setMinWordLength(int minWordLength) {
this.minWordLength = minWordLength;
}
public void setSanitizer(Sanitizer sanitizer) {
this.sanitizer = sanitizer;
}
public void setWordTokenizer(WordTokenizer wordTokenizer) {
this.wordTokenizer = wordTokenizer;
}
}