package org.apache.lucene.analysis.core; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.StringReader; import java.net.URL; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttributeImpl; import org.fastcatsearch.ir.common.IRException; public class TokenizerTestBase { public void testTokenizerSpeed(Tokenizer tokenizer, boolean isDebug) throws IRException { URL url = getClass().getResource("/org/apache/lucene/analysis/korean_1000_text.txt"); File file = new File(url.getFile()); testTokenizer(tokenizer, file, isDebug); } public void testTokenizer(Tokenizer tokenizer, File file, boolean isDebug) throws IRException { int i = 0; long start = System.currentTimeMillis(); long lap = start; int COUNT = 50; for (int k = 0; k < COUNT; k++) { InputStream is = null; try { is = new FileInputStream(file); BufferedReader dr = new BufferedReader(new InputStreamReader(is, "utf-8")); String line = null; CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class); TypeAttribute typeAtt = null; try{ typeAtt = tokenizer.getAttribute(TypeAttribute.class); }catch(Exception e){ typeAtt = new TypeAttributeImpl(); } while ((line = dr.readLine()) != null) { line = line.trim(); if (line.length() == 0) continue; tokenizer.setReader(new StringReader(line)); tokenizer.reset(); if (isDebug) { System.out.println(">>>>" + line); } while (tokenizer.incrementToken()) { if (isDebug) { String str = termAttribute.toString(); System.out.println(str + " " + typeAtt.type() + " [ " + offsetAtt.startOffset() + " ~ " + offsetAtt.endOffset() + " ]"); } } i++; if ((i % 10000) == 0) { System.out.println(i + " th " + (System.currentTimeMillis() - lap) + "ms"); lap = System.currentTimeMillis(); } } } catch (Exception ignore) { ignore.printStackTrace(); return; } finally { if (is != null) { try { is.close(); } catch (IOException ignore) { } } } } long DTime = System.currentTimeMillis() - start; double lps = i / DTime * 1000; System.out.println("DONE " + i + " lines time = " + DTime + "ms, lps=" + lps); double mbps = file.length() * COUNT / (DTime / 1000.0) / 1024 / 1024; System.out.println("LineByLine index Extraction Speed : " + mbps + "MBp/s"); } }