/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.eval.tokens;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Tests for {@link TokenCounter}: checks that it reports the same
 * {@link TokenStatistics} as {@link LuceneTokenCounter} for identical input,
 * and exercises the analyzers supplied by {@link AnalyzerManager}.
 */
public class TokenCounterTest {

    /** Field name used for every document added in these tests. */
    private static final String FIELD = "f";

    /** Shared across all tests; created once in {@link #setUp()}. */
    private static AnalyzerManager analyzerManager;

    /**
     * Builds the shared {@link AnalyzerManager}.
     *
     * @throws IOException if the analyzer manager cannot be created
     */
    @BeforeClass
    public static void setUp() throws IOException {
        analyzerManager = AnalyzerManager.newInstance(100000);
    }

    /**
     * Both counters must produce equal statistics for a small, fixed string
     * containing a mix of unique and heavily repeated tokens.
     */
    @Test
    public void testBasic() throws Exception {
        String s = " bde cde def abc efg f f f f ghijklmnop a a a a a a a a a a a a a a a a a b b b b b b b b b b b b b";

        TokenCounter counter = new TokenCounter(analyzerManager.getGeneralAnalyzer());
        counter.add(FIELD, s);
        TokenStatistics simpleTokenStatistics = counter.getTokenStatistics(FIELD);

        LuceneTokenCounter tokenCounter = new LuceneTokenCounter(analyzerManager.getGeneralAnalyzer());
        tokenCounter.add(FIELD, s);

        assertEquals(simpleTokenStatistics, tokenCounter.getTokenStatistics(FIELD));
    }

    /**
     * Both counters must produce equal statistics for randomly generated input.
     * The generated string is used as the assertion message so that a failing
     * input can be reproduced.
     */
    @Test
    public void testRandom() throws Exception {
        int numberOfTests = 100;
        for (int i = 0; i < numberOfTests; i++) {
            String s = generateString();

            TokenCounter counter = new TokenCounter(analyzerManager.getGeneralAnalyzer());
            counter.add(FIELD, s);
            TokenStatistics simpleTokenStatistics = counter.getTokenStatistics(FIELD);

            LuceneTokenCounter tokenCounter = new LuceneTokenCounter(analyzerManager.getGeneralAnalyzer());
            tokenCounter.add(FIELD, s);

            assertEquals(s, simpleTokenStatistics, tokenCounter.getTokenStatistics(FIELD));
        }
    }

    /**
     * With the common-tokens analyzer, the sample text must yield two
     * {@code ___url___} tokens and one {@code ___email___} token.
     * NOTE(review): presumably the two URLs and the e-mail address in the
     * sample are normalized to these placeholders by the analyzer -- confirm
     * against the analyzer configuration.
     */
    @Test
    public void testCommonTokens() throws Exception {
        TokenCounter tokenCounter = new TokenCounter(analyzerManager.getCommonTokensAnalyzer());
        String s = "the http://www.cnn.com and blahdeblah@apache.org are in valuable www.sites.org 普林斯顿大学";
        tokenCounter.add(FIELD, s);

        Map<String, MutableInt> tokens = tokenCounter.getTokens(FIELD);
        assertEquals(new MutableInt(2), tokens.get("___url___"));
        assertEquals(new MutableInt(1), tokens.get("___email___"));
    }

    /**
     * Runs the common-tokens analyzer directly over text that mixes Latin
     * words with CJK characters and checks the number of distinct tokens and
     * the presence of the two-character token {@code 林斯}.
     */
    @Test
    public void testCJKFilter() throws Exception {
        String s = "then quickbrownfoxjumpedoverthelazy dogss dog 普林斯顿大学";
        Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
        Map<String, Integer> tokens = new HashMap<>();

        // try-with-resources guarantees close() even if tokenization throws;
        // the analyzer manager is shared by every test in this class.
        try (TokenStream ts = analyzer.tokenStream(FIELD, s)) {
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                String t = termAtt.toString();
                Integer count = tokens.get(t);
                tokens.put(t, (count == null) ? 1 : count + 1);
            }
            ts.end();
        }

        assertEquals(7, tokens.size());
        assertEquals(Integer.valueOf(1), tokens.get("林斯"));
    }

    /**
     * Generates a random test document.
     *
     * @return between 0 and 999 space-terminated numeric terms, each drawn
     *         from the range 100000 (inclusive) to 110000 (exclusive)
     */
    private String generateString() {
        Random r = new Random();
        int len = r.nextInt(1000);
        int uniqueVocabTerms = 10000;
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < len; i++) {
            // the 100000 offset keeps every term exactly six digits long
            sb.append(r.nextInt(uniqueVocabTerms) + 100000);
            sb.append(' ');
        }
        return sb.toString();
    }
}