package com.levelup.java.io;

import static org.junit.Assert.assertEquals;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;

import org.apache.log4j.Logger;
import org.junit.Before;
import org.junit.Test;

import com.google.common.base.CharMatcher;
import com.google.common.base.Charsets;
import com.google.common.base.Splitter;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.io.Files;

/**
 * This Java example demonstrates how to count distinct word occurrences
 * in a file, using plain Java, Java 8 streams, and Guava.
 *
 * @author Justin Musgrove
 * @see <a href='http://www.leveluplunch.com/java/examples/count-distinct-word-occurrences-in-file/'>Count words in file</a>
 */
public class CountWordOccurrencesInFile {

	private static final Logger logger = Logger.getLogger(CountWordOccurrencesInFile.class);

	/** Classpath location of the text file whose words are counted. */
	private static final String SOURCE = "com/levelup/java/io/word-occurrences-in-file.txt";

	/** URI of {@link #SOURCE}, resolved from the class loader before each test. */
	private URI sourceFileURI;

	/**
	 * Resolves the sample file on the classpath.
	 *
	 * @throws URISyntaxException if the resource URL cannot be converted to a URI
	 * @throws IllegalStateException if the sample file is not on the classpath
	 */
	@Before
	public void setUp () throws URISyntaxException {

		URL resource = this.getClass().getClassLoader().getResource(SOURCE);
		if (resource == null) {
			// Fail with a message naming the missing file instead of a bare NPE.
			throw new IllegalStateException(
					"Test resource not found on classpath: " + SOURCE);
		}
		sourceFileURI = resource.toURI();
	}

	/**
	 * Counts word occurrences with a plain map: each line is split on a
	 * single space, periods are removed from every token, and blank
	 * tokens are skipped.
	 *
	 * @throws IOException if the sample file cannot be read
	 */
	@Test
	public void distinct_words_in_file_java() throws IOException {

		List<String> lines = java.nio.file.Files.readAllLines(
				Paths.get(sourceFileURI), Charsets.UTF_8);

		Map<String, Integer> wordOccurrences = new HashMap<String, Integer>();

		// for each line in file
		for (String line : lines) {

			// for every word in the line
			for (String token : line.split(" ")) {

				String word = token.replace(".", "");

				if (!word.trim().isEmpty()) {
					// insert 1 for a new word, otherwise add 1 to the stored count
					wordOccurrences.merge(word, 1, Integer::sum);
				}
			}
		}

		logger.info(wordOccurrences);

		assertEquals(80, wordOccurrences.size());
	}

	/**
	 * Counts distinct words with a Java 8 stream, applying the same
	 * tokenization as {@link #distinct_words_in_file_java()}: split on a
	 * single space, remove periods, skip blank tokens.
	 *
	 * @throws IOException if the sample file cannot be read
	 */
	@Test
	public void count_distinct_words_java8() throws IOException {

		long uniqueWords;

		// Files.lines keeps the file open until the stream is closed.
		try (Stream<String> lines = java.nio.file.Files.lines(
				Paths.get(sourceFileURI), Charsets.UTF_8)) {

			uniqueWords = lines
					// String.split takes a regex; " " splits on the space only
					.flatMap(line -> Arrays.stream(line.split(" ")))
					.map(word -> word.replace(".", ""))
					.filter(word -> !word.trim().isEmpty())
					.distinct()
					.count();
		}

		assertEquals(80, uniqueWords);
	}

	/**
	 * Counts word occurrences with a Guava {@link Multiset}. The file
	 * content is split on whitespace, leading and trailing periods are
	 * trimmed from each token, and empty tokens are omitted.
	 *
	 * Example was modified from the guava site to remove periods.
	 *
	 * @throws IOException if the sample file cannot be read
	 */
	@Test
	public void count_distinct_words_in_file_guava () throws IOException {

		File file = new File(sourceFileURI);

		Multiset<String> wordOccurrences = HashMultiset.create(
				Splitter.on(CharMatcher.WHITESPACE)
						.trimResults(CharMatcher.is('.'))
						.omitEmptyStrings()
						.split(Files.asCharSource(file, Charsets.UTF_8).read()));

		logger.info(wordOccurrences);

		assertEquals(80, wordOccurrences.elementSet().size());
	}
}