//
// Copyright 2010 Cinch Logic Pty Ltd.
//
// http://www.chililog.com
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package org.chililog.server.common;
import static org.junit.Assert.*;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
import org.chililog.server.common.Log4JLogger;
import org.chililog.server.common.TextTokenizer;
import org.junit.Test;
public class TextTokenizerTest {
// Logger used by the tests in this class to print tokenizer output and benchmark timings.
private static Log4JLogger _logger = Log4JLogger.getLogger(TextTokenizerTest.class);
/**
 * Feeds a variety of sample inputs through {@link TextTokenizer}: a plain sentence, Apache log lines, an email
 * address with file paths, XML, JSON and a Java stack trace.
 * <p>
 * Only the plain sentence and the email/file path samples are asserted. The remaining results are simply written
 * to the log so they can be inspected.
 *
 * @throws IOException
 *             if thrown by any of the calls made here
 */
@Test
public void testBasic() throws IOException {
    // Plain sentence
    String sentence = "Hello, I am Jim.";
    List<String> sentenceTokens = TextTokenizer.getInstance().tokenize(sentence, -1);
    _logger.info(sentenceTokens.toString());
    assertEquals(4, sentenceTokens.size());
    assertEquals("[hello, i, am, jim]", sentenceTokens.toString());

    // Apache logs http://httpd.apache.org/docs/1.3/logs.html
    String apacheErrorLine = "[Wed Oct 11 14:32:52 2000] [error] [client 127.0.0.1] client denied by server configuration: /export/home/live/ap/htdocs/test";
    List<String> apacheErrorTokens = TextTokenizer.getInstance().tokenize(apacheErrorLine, -1);
    _logger.info(apacheErrorTokens.toString());

    String commonLogLine = "127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] \"GET /apache_pb.gif HTTP/1.0\" 200 2326";
    List<String> commonLogTokens = TextTokenizer.getInstance().tokenize(commonLogLine, -1);
    _logger.info(commonLogTokens.toString());

    String combinedLogLine = "127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] \"GET /apache_pb.gif HTTP/1.0\" 200 2326 \"http://www.example.com/start.html\" \"Mozilla/4.08 [en] (Win98; I ;Nav)\"";
    List<String> combinedLogTokens = TextTokenizer.getInstance().tokenize(combinedLogLine, -1);
    _logger.info(combinedLogTokens.toString());

    // Email and file path
    String emailAndPaths = "vibul@testing.com.au is the email address to parse. C:\\folder1\\folder2\\vvv.java. /tmp/test/vvv.java";
    List<String> emailTokens = TextTokenizer.getInstance().tokenize(emailAndPaths, -1);
    _logger.info(emailTokens.toString());
    assertEquals("[vibul, testing, com, au, email, address, parse, c, folder1, folder2, vvv, java, tmp, test]",
            emailTokens.toString());

    // xml
    String xmlText = "<hello><afield>b</afield></hello>";
    List<String> xmlTokens = TextTokenizer.getInstance().tokenize(xmlText, -1);
    _logger.info(xmlTokens.toString());

    // json
    String jsonText = "{ name: \"chililog\", display_name: \"ChiliLog Log\", "
            + "description: \"Log repository for ChiliLog events\", startup_status: 'ONLINE'}";
    List<String> jsonTokens = TextTokenizer.getInstance().tokenize(jsonText, -1);
    _logger.info(jsonTokens.toString());

    // stack trace
    String stackTraceText = "2011-03-26 15:32:22,376 [main] ERROR com.chililog.server.common.ChiliLogExceptionTest - "
            + "com.chililog.server.common.ChiliLogException: Test12\n"
            + "at com.chililog.server.common.ChiliLogExceptionTest.testWrapping(ChiliLogExceptionTest.java:69)\n"
            + "at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n"
            + "at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)\n"
            + "at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)\n"
            + "at java.lang.reflect.Method.invoke(Method.java:597)\n"
            + "at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:44)\n"
            + "at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:15)\n"
            + "at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:41)\n"
            + "at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:20)\n"
            + "at org.junit.runners.BlockJUnit4ClassRunner.runNotIgnored(BlockJUnit4ClassRunner.java:79)\n"
            + "at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:71)\n"
            + "at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:49)\n"
            + "at org.junit.runners.ParentRunner$3.run(ParentRunner.java:193)\n"
            + "at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:52)\n"
            + "at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:191)\n"
            + "at org.junit.runners.ParentRunner.access$000(ParentRunner.java:42)\n"
            + "at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:184)\n"
            + "at org.junit.runners.ParentRunner.run(ParentRunner.java:236)\n"
            + "at org.eclipse.jdt.internal.junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:49)\n"
            + "at org.eclipse.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)\n"
            + "at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:467)\n"
            + "at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:683)\n"
            + "at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:390)\n"
            + "at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:197)\n"
            + "Caused by: java.lang.NullPointerException: inner exception\n"
            + "at com.chililog.server.common.ChiliLogExceptionTest.testWrapping(ChiliLogExceptionTest.java:63)\n"
            + "... 23 more";
    List<String> stackTraceTokens = TextTokenizer.getInstance().tokenize(stackTraceText, -1);
    _logger.info(stackTraceTokens.toString());
}
/**
 * Times one million failed lookups against an 8-element list and against a hash map holding the same entries,
 * and logs the elapsed milliseconds of each.
 * <p>
 * On the original author's machine, a brute force list search beat the hash lookup when there were 8 items or
 * fewer. Since most text will have more than 8 terms, a hash map is used.
 *
 * @throws IOException
 *             declared for consistency with the other tests
 */
@Test
public void testBenchmarkHashMap() throws IOException {
    final int iterations = 1000000;
    final String missingWord = "faster";

    // Exactly 8 entries. Append further words to this array to re-measure with a longer list.
    String[] sample = { "the", "quick", "brown", "fox", "jumped", "over", "lazy", "dog" };

    List<String> wordList = new ArrayList<String>();
    HashMap<String, String> wordMap = new HashMap<String, String>();
    for (String word : sample) {
        wordList.add(word);
    }
    for (String word : wordList) {
        wordMap.put(word, word);
    }

    // Linear scan of the list for a word that is not present
    long begin = System.currentTimeMillis();
    for (int n = 0; n < iterations; n++) {
        wordList.contains(missingWord);
    }
    long elapsed = System.currentTimeMillis() - begin;
    _logger.info("Array search: %s", elapsed);

    // Hash lookup of the same missing word
    begin = System.currentTimeMillis();
    for (int n = 0; n < iterations; n++) {
        wordMap.containsKey(missingWord);
    }
    elapsed = System.currentTimeMillis() - begin;
    _logger.info("Hash search: %s", elapsed);
}
/**
 * Benchmark showing that regular expressions are slow.
 * <p>
 * The same sample stack trace is tokenized 10,000 times in three different ways and the elapsed milliseconds of
 * each run are logged:
 * <ol>
 * <li>{@link #basicTokenize(String)} only,</li>
 * <li>{@link TextTokenizer#tokenize} only,</li>
 * <li>{@link #basicTokenize(String)} followed by regular expression matching on every returned term.</li>
 * </ol>
 *
 * @throws IOException
 *             if thrown by the tokenizing calls
 */
@Test
public void testBenchmarkRegex() throws IOException {
    final int iterations = 10000;

    // One shared sample so that all three timed runs are guaranteed to process identical input.
    final String stackTrace = "2011-03-26 15:32:22,376 [main] ERROR com.chililog.server.common.ChiliLogExceptionTest - "
            + "com.chililog.server.common.ChiliLogException: Test12\n"
            + "at com.chililog.server.common.ChiliLogExceptionTest.testWrapping(ChiliLogExceptionTest.java:69)\n"
            + "at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n"
            + "at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)\n"
            + "at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)\n"
            + "at java.lang.reflect.Method.invoke(Method.java:597)\n"
            + "at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:44)\n"
            + "at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:15)\n"
            + "at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:41)\n"
            + "at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:20)\n"
            + "at org.junit.runners.BlockJUnit4ClassRunner.runNotIgnored(BlockJUnit4ClassRunner.java:79)\n"
            + "at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:71)\n"
            + "at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:49)\n"
            + "at org.junit.runners.ParentRunner$3.run(ParentRunner.java:193)\n"
            + "at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:52)\n"
            + "at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:191)\n"
            + "at org.junit.runners.ParentRunner.access$000(ParentRunner.java:42)\n"
            + "at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:184)\n"
            + "at org.junit.runners.ParentRunner.run(ParentRunner.java:236)\n"
            + "at org.eclipse.jdt.internal.junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:49)\n"
            + "at org.eclipse.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)\n"
            + "at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:467)\n"
            + "at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:683)\n"
            + "at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:390)\n"
            + "at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:197)\n"
            + "Caused by: java.lang.NullPointerException: inner exception\n"
            + "at com.chililog.server.common.ChiliLogExceptionTest.testWrapping(ChiliLogExceptionTest.java:63)\n"
            + "... 23 more";

    // Run 1: plain tokenizing with no special parsing
    long startTime = System.currentTimeMillis();
    for (int i = 0; i < iterations; i++) {
        basicTokenize(stackTrace);
    }
    long endTime = System.currentTimeMillis();
    _logger.info("No special parsing search: %s", endTime - startTime);

    // Run 2: the real tokenizer
    startTime = System.currentTimeMillis();
    for (int i = 0; i < iterations; i++) {
        TextTokenizer.getInstance().tokenize(stackTrace, -1);
    }
    endTime = System.currentTimeMillis();
    _logger.info("Hardcoded parsing search: %s", endTime - startTime);

    // Run 3: plain tokenizing plus regular expression matching on every term.
    // Thanks to http://fightingforalostcause.net/misc/2006/compare-email-regex.php
    Pattern emailAddressPattern = Pattern.compile("^(.)+@[.]+$");
    Pattern classNamePattern = Pattern.compile("^(\\D.*)\\.(\\D.*)*$");

    startTime = System.currentTimeMillis();
    for (int i = 0; i < iterations; i++) {
        List<String> terms = basicTokenize(stackTrace);
        for (String term : terms) {
            // Each pattern is evaluated twice per term, exactly as in the original benchmark, so that the
            // measured regex workload is unchanged.
            classNamePattern.matcher(term).matches();
            emailAddressPattern.matcher(term).matches();
            classNamePattern.matcher(term).matches();
            emailAddressPattern.matcher(term).matches();
        }
    }
    endTime = System.currentTimeMillis();
    _logger.info("With Regex search: %s", endTime - startTime);
}
/**
 * Used for benchmarking ... basic tokenizing without regular expressions.
 * <p>
 * Runs the text through a Lucene {@link StandardAnalyzer} and collects each distinct term once, in the order in
 * which it is first produced by the analyzer.
 *
 * @param text
 *            Text to tokenize. May be null or empty.
 * @return List of distinct terms in order of first appearance. An empty list is returned if <code>text</code> is
 *         null or empty.
 * @throws IOException
 *             if the token stream fails while reading, ending or closing
 */
public List<String> basicTokenize(String text) throws IOException {
    List<String> tokens = new ArrayList<String>();
    if (StringUtils.isEmpty(text)) {
        return tokens;
    }

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    try {
        TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
        try {
            TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);

            // Tracks the terms already added so that duplicates are skipped; only the keys are used.
            HashMap<String, String> lookup = new HashMap<String, String>();
            while (stream.incrementToken()) {
                String term = termAttribute.term();
                if (!lookup.containsKey(term)) {
                    tokens.add(term);
                    lookup.put(term, null);
                }
            }

            // Signal end-of-stream as required by the TokenStream consumer workflow
            stream.end();
        } finally {
            // Release the stream's resources even if tokenizing fails. This method is called thousands of times
            // by the benchmarks.
            stream.close();
        }
    } finally {
        analyzer.close();
    }
    return tokens;
}
}