/* * Copyright 2015 Evgeny Dolganov (evgenij.dolganov@gmail.com). * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package och.util.string; import static och.util.Util.*; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.List; import java.util.Random; import och.util.string.WordsCounter.WordStat; import org.junit.Test; import test.BaseTest; @org.junit.Ignore public class WordsCounterTest extends BaseTest { @org.junit.Ignore @Test public void test_big_counts_in_hdd() throws IOException{ WordsCounter counter = new WordsCounter(); counter.setTmpIndexDirParent(TEST_DIR); counter.setRemoveIndexesFiles(false); counter.setRamTopSize(5); String str = toWordsStr(list( new WordStat("a", 1000), new WordStat("b", 999), new WordStat("c", 999), new WordStat("d", 888), new WordStat("e", 777), new WordStat("f", 666) )); assertEquals("a-1000 b-999 c-999 d-888 e-777", toStr(counter.getTop(str, 5))); } @Test public void test_big_counts_in_ram() throws IOException{ WordsCounter counter = new WordsCounter(); counter.setTmpIndexDirParent(TEST_DIR); counter.setRemoveIndexesFiles(false); String str = toWordsStr(list( new WordStat("a", 1000), new WordStat("b", 999), new WordStat("c", 999), new WordStat("d", 888), new WordStat("e", 777), new WordStat("f", 666) )); assertEquals("a-1000", toStr(counter.getTop(str, 1))); assertEquals("a-1000 b-999", toStr(counter.getTop(str, 2))); assertEquals("a-1000 b-999 c-999", toStr(counter.getTop(str, 3))); assertEquals("a-1000 b-999 c-999 d-888", toStr(counter.getTop(str, 4))); assertEquals("a-1000 b-999 c-999 d-888 e-777", toStr(counter.getTop(str, 5))); assertEquals("a-1000 b-999 c-999 d-888 e-777 f-666", toStr(counter.getTop(str, 10))); } @Test public void test_fromExample() throws IOException{ WordsCounter counter = new WordsCounter(); counter.setTmpIndexDirParent(TEST_DIR); String words = "sergey_martynov " + "rustem_bedretdinov " + "sergey_martynov " + "guest123 " + "sergey_martynov " + "rustem_bedretdinov " + "BATMAN " + "batman "; List<WordStat> top = counter.getTop(words, 4); assertEquals("sergey_martynov-3 batman-2 rustem_bedretdinov-2 guest123-1", toStr(top)); } @Test public void test_uppercase() throws IOException{ WordsCounter counter = new WordsCounter(); counter.setTmpIndexDirParent(TEST_DIR); assertEquals("aa-3 b-2 c-1", toStr(counter.getTop("Aa b c AA aA B D", 3))); } @Test public void test_count() throws IOException{ WordsCounter counter = new WordsCounter(); counter.setTmpIndexDirParent(TEST_DIR); assertEquals("aa-3 b-2 c-1", toStr(counter.getTop("aa b c aa aa b d", 3))); } @Test public void test_top_word_max_size() throws IOException{ WordsCounter counter = new WordsCounter(); counter.setTmpIndexDirParent(TEST_DIR); assertEquals("a-1 aa-1 b-1", toStr(counter.getTop("a b c aa", 3, 2))); assertEquals("a-1 b-1 c-1", toStr(counter.getTop("a b c aa", 3, 1))); } @Test public void test_top_size() throws IOException{ WordsCounter counter = new WordsCounter(); counter.setTmpIndexDirParent(TEST_DIR); assertEquals("a-1 b-1 c-1 d-1", toStr(counter.getTop("a b c d", 5))); assertEquals("a-1 b-1 c-1 d-1", toStr(counter.getTop("a b c d", 4))); assertEquals("a-1 b-1 c-1", toStr(counter.getTop("a b c d", 3))); assertEquals("a-1 aa-1 b-1", toStr(counter.getTop("a b c aa", 3))); assertEquals("d-3 a-1 b-1", toStr(counter.getTop("a b c d d d", 3))); } @Test public void test_empty() throws IOException{ WordsCounter counter = new WordsCounter(); counter.setTmpIndexDirParent(TEST_DIR); assertEquals(0, counter.getTop((InputStream)null, 2).size()); assertEquals(0, counter.getTop(" \n \r\0 ", 2).size()); } private static String toWordsStr(List<WordStat> list){ Random r = new Random(); StringBuilder sb = new StringBuilder(); boolean isFirst = true; while(list.size() > 0){ if( ! isFirst) sb.append(' '); isFirst = false; int index = r.nextInt(list.size()); WordStat stat = list.get(index); sb.append(stat.word); stat.decCount(); if(stat.count() == 0){ list.remove(index); } } return sb.toString(); } private static String toStr(List<WordStat> top) { StringBuilder sb = new StringBuilder(); boolean isFirst = true; for (WordStat stat : top) { if( ! isFirst) sb.append(' '); isFirst = false; sb.append(stat.word).append('-').append(stat.count()); } return sb.toString(); } public static void main(String[] args) throws IOException { long start = System.currentTimeMillis(); //File bigFile = new File("../tmp/tolst-vojna-mir.txt"); //File bigFile = new File("../tmp/tolk-slovar.txt"); File bigFile = new File("../tmp/access_log.txt"); WordsCounter counter = new WordsCounter(); counter.setDebug(false); counter.setTmpIndexDirParent(new File("../tmp")); counter.setRemoveIndexesFiles(false); //counter.setRamTopSize(10_000); List<WordStat> result = counter.getTop(new FileInputStream(bigFile), 100, 50); //print results for (WordStat stat : result) { System.out.println(stat.word + " " + stat.count()); } System.out.println((System.currentTimeMillis() - start)/1000+" sec"); } }