package com.kennycason.kumo.nlp; import ch.lambdaj.Lambda; import com.kennycason.kumo.WordFrequency; import com.kennycason.kumo.nlp.filter.Filter; import com.kennycason.kumo.nlp.filter.StopWordFilter; import com.kennycason.kumo.nlp.normalize.CharacterStrippingNormalizer; import com.kennycason.kumo.nlp.normalize.LowerCaseNormalizer; import com.kennycason.kumo.nlp.normalize.Normalizer; import com.kennycason.kumo.nlp.tokenizer.WhiteSpaceWordTokenizer; import com.kennycason.kumo.nlp.tokenizer.WordTokenizer; import org.apache.commons.io.IOUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import com.kennycason.kumo.nlp.filter.CompositeFilter; import com.kennycason.kumo.nlp.filter.WordSizeFilter; import com.kennycason.kumo.nlp.normalize.TrimToEmptyNormalizer; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.*; import java.util.Map.Entry; import static ch.lambdaj.Lambda.on; import static ch.lambdaj.Lambda.sort; /** * Created by kenny on 7/1/14. */ public class FrequencyAnalyzer { public static final String DEFAULT_ENCODING = "UTF-8"; public static final int DEFAULT_WORD_MAX_LENGTH = 32; public static final int DEFAULT_WORD_MIN_LENGTH = 3; public static final int DEFAULT_WORD_FREQUENCIES_TO_RETURN = 50; public static final long DEFAULT_URL_LOAD_TIMEOUT = 3000; // 3 sec private final Set<String> stopWords = new HashSet<>(); private WordTokenizer wordTokenizer = new WhiteSpaceWordTokenizer(); private final List<Filter> filters = new ArrayList<>(); private final List<Normalizer> normalizers = new ArrayList<>(); private int wordFrequenciesToReturn = DEFAULT_WORD_FREQUENCIES_TO_RETURN; private int maxWordLength = DEFAULT_WORD_MAX_LENGTH; private int minWordLength = DEFAULT_WORD_MIN_LENGTH; private String characterEncoding = DEFAULT_ENCODING; private long urlLoadTimeout = DEFAULT_URL_LOAD_TIMEOUT; public FrequencyAnalyzer() { this.normalizers.add(new TrimToEmptyNormalizer()); this.normalizers.add(new CharacterStrippingNormalizer()); this.normalizers.add(new LowerCaseNormalizer()); } public List<WordFrequency> load(final InputStream fileInputStream) throws IOException { return load(IOUtils.readLines(fileInputStream, characterEncoding)); } public List<WordFrequency> load(final File file) throws IOException { return this.load(new FileInputStream(file)); } public List<WordFrequency> load(final String filePath) throws IOException { return this.load(new File(filePath)); } public List<WordFrequency> load(final URL url) throws IOException { final Document doc = Jsoup.parse(url, (int) urlLoadTimeout); return load(Collections.singletonList(doc.body().text())); } public List<WordFrequency> load(final List<String> texts) { final List<WordFrequency> wordFrequencies = new ArrayList<>(); final Map<String, Integer> cloud = buildWordFrequencies(texts, wordTokenizer); for (final Entry<String, Integer> wordCount : cloud.entrySet()) { wordFrequencies.add(new WordFrequency(wordCount.getKey(), wordCount.getValue())); } return takeTopFrequencies(wordFrequencies); } public List<WordFrequency> loadWordFrequencies(final List<WordFrequency> wflist) { return takeTopFrequencies(wflist); } private Map<String, Integer> buildWordFrequencies(final List<String> texts, final WordTokenizer tokenizer) { final Map<String, Integer> wordFrequencies = new HashMap<>(); for (final String text : texts) { final List<String> words = filter(tokenizer.tokenize(text)); for (final String word : words) { final String normalized = normalize(word); if (!wordFrequencies.containsKey(normalized)) { wordFrequencies.put(normalized, 1); } wordFrequencies.put(normalized, wordFrequencies.get(normalized) + 1); } } return wordFrequencies; } private List<String> filter(final List<String> words) { final List<Filter> allFilters = new ArrayList<>(); allFilters.add(new StopWordFilter(stopWords)); allFilters.add(new WordSizeFilter(minWordLength, maxWordLength)); allFilters.addAll(filters); final CompositeFilter compositeFilter = new CompositeFilter(allFilters); return Lambda.filter(compositeFilter, words); } private String normalize(final String word) { String normalized = word; for (Normalizer normalizer : normalizers) { normalized = normalizer.normalize(normalized); } return normalized; } private List<WordFrequency> takeTopFrequencies(final Collection<WordFrequency> wordCloudEntities) { if (wordCloudEntities.isEmpty()) { return Collections.emptyList(); } final List<WordFrequency> sorted = sort(wordCloudEntities, on(WordFrequency.class).getFrequency()); Collections.reverse(sorted); return sorted.subList(0, Math.min(sorted.size(), wordFrequenciesToReturn)); } public void setStopWords(final Collection<String> stopWords) { this.stopWords.clear(); this.stopWords.addAll(stopWords); } public void setWordFrequenciesToReturn(final int wordFrequenciesToReturn) { this.wordFrequenciesToReturn = wordFrequenciesToReturn; } public void setMinWordLength(final int minWordLength) { this.minWordLength = minWordLength; } public void setMaxWordLength(final int maxWordLength) { this.maxWordLength = maxWordLength; } public void setWordTokenizer(final WordTokenizer wordTokenizer) { this.wordTokenizer = wordTokenizer; } public void clearFilters() { this.filters.clear(); } public void addFilter(final Filter filter) { this.filters.add(filter); } public void setFilter(final Filter filter) { this.filters.clear(); this.filters.add(filter); } public void clearNormalizers() { this.normalizers.clear(); } public void addNormalizer(final Normalizer normalizer) { this.normalizers.add(normalizer); } public void setNormalizer(final Normalizer normalizer) { this.normalizers.clear(); this.normalizers.add(normalizer); } public void setCharacterEncoding(final String characterEncoding) { this.characterEncoding = characterEncoding; } public void setUrlLoadTimeout(final long urlLoadTimeout) { this.urlLoadTimeout = urlLoadTimeout; } }