/*
* Copyright 2012 Takao Nakaguchi
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.trie4j.bytes;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;
import org.trie4j.bytes.Node;
import org.trie4j.bytes.PatriciaTrie;
import org.trie4j.bytes.TrieVisitor;
import org.trie4j.util.CharsetUtil;
import org.trie4j.util.StringUtil;
/**
 * Manual benchmark for the byte-based {@link PatriciaTrie}.
 *
 * <p>Reads up to {@code maxCount} lines from the gzipped title list named by
 * {@code TITLES_FILE}, inserts each line (as UTF-8 bytes) into a trie while
 * printing timing and free-memory figures, then walks the trie to count nodes
 * and finally re-reads the file to check that every inserted line is reported
 * by {@code contains}.
 *
 * <p>The archive must be present in the working directory; the run fails with
 * an exception otherwise.
 */
public class TestWikipedia {
    /** Upper bound on the number of lines read from the archive. */
    private static final int maxCount = 2000000;

    /**
     * Gzipped title list that is read by both the insert and the verify pass.
     * You can download archive from http://dumps.wikimedia.org/jawiki/latest/
     * (previously used file: "jawiki-20120220-all-titles-in-ns0.gz").
     */
    private static final String TITLES_FILE = "enwiki-20120403-all-titles-in-ns0.gz";

    /**
     * Runs the insert benchmark and then hands the filled trie to
     * {@link #investigate(PatriciaTrie, int)}.
     *
     * @param args ignored
     * @throws Exception if the archive cannot be opened or read, or if the
     *         thread is interrupted while sleeping
     */
    public static void main(String[] args) throws Exception{
        System.out.println("--- recursive patricia trie ---");
        PatriciaTrie trie = new org.trie4j.bytes.PatriciaTrie();
        int c = 0;
        long sum = 0;
        int charCount = 0;
        BufferedReader r = openTitles();
        try{
            String word = null;
            // Ask for a collection and wait briefly so the baseline
            // free-memory figure is taken in a settled state.
            System.gc();
            Thread.sleep(1000);
            System.out.println(Runtime.getRuntime().freeMemory() + " bytes free.");
            long lap = System.currentTimeMillis();
            while((word = r.readLine()) != null){
                byte[] bytes = word.getBytes("UTF-8");
                // Only the insert call itself is accumulated into sum.
                long d = System.currentTimeMillis();
                trie.insert(bytes);
                sum += System.currentTimeMillis() - d;
                charCount += word.length();
                if(c % 100000 == 0){
                    // Progress line: count, free memory, max memory, millis since last lap.
                    d = System.currentTimeMillis() - lap;
                    long free = Runtime.getRuntime().freeMemory();
                    System.out.println(
                            c + "," + free + "," + Runtime.getRuntime().maxMemory() + "," + d
                            );
                    lap = System.currentTimeMillis();
                }
                c++;
                if(c == maxCount) break;
            }
        } finally{
            // Release the file handle (and the reader's buffers) before the
            // memory measurement below; the original never closed it.
            r.close();
        }
        // NOTE: the label says "ja" although TITLES_FILE currently names the
        // English dump; the text is kept as-is so the output format is unchanged.
        System.out.println(c + "entries in ja wikipedia titles.");
        System.out.println("insert time: " + sum + " millis.");

        System.out.println("-- insert done.");
        System.gc();
        Thread.sleep(1000);
        System.out.println(Runtime.getRuntime().freeMemory() + " bytes free.");
        investigate(trie, charCount);
/*
//		dump(trie);
		System.out.println("-- pack");
		lap = System.currentTimeMillis();
		if(trie instanceof MultilayerPatriciaTrie){
			MultilayerPatriciaTrie mt = (MultilayerPatriciaTrie)trie;
			mt.pack();
			System.out.println("-- pack done in " + (System.currentTimeMillis() - lap) + " millis.");
//			dump(trie);
			System.gc();
			Thread.sleep(1000);
			System.out.println(Runtime.getRuntime().freeMemory() + " bytes free.");
			investigate(mt, charCount);
		}
//*/
    }

    /**
     * Opens {@code TITLES_FILE} as a buffered, gzip-decompressing reader that
     * decodes with the decoder supplied by {@code CharsetUtil.newUTF8Decoder()}.
     * The caller is responsible for closing the returned reader.
     *
     * @return a reader positioned at the first line of the archive
     * @throws Exception if the file cannot be opened or is not readable as gzip
     */
    private static BufferedReader openTitles() throws Exception{
        FileInputStream in = new FileInputStream(TITLES_FILE);
        boolean opened = false;
        try{
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    new GZIPInputStream(in), CharsetUtil.newUTF8Decoder()));
            opened = true;
            return reader;
        } finally{
            // If wrapping failed (e.g. bad gzip header) nobody else owns the
            // raw stream, so close it here instead of leaking the handle.
            if(!opened) in.close();
        }
    }

    /**
     * Prints the trie to standard output, one node per line, indented by one
     * space per nesting level; nodes for which {@code isTerminate()} is true
     * are marked with a trailing {@code *}.
     *
     * @param trie the trie to print
     */
    private static void dump(PatriciaTrie trie){
        System.out.println("--dump--");
        trie.visit(new TrieVisitor() {
            @Override
            public void accept(Node node, int nest) {
                for(int i = 0; i < nest; i++){
                    System.out.print(" ");
                }
                byte[] letters = node.getLetters();
                if(letters != null && letters.length > 0){
                    System.out.print(StringUtil.fromUTF8(letters));
                }
                if(node.isTerminate()){
                    System.out.print("*");
                }
                System.out.println();
            }
        });
    }

    /**
     * Prints node statistics for the trie and verifies that every line of the
     * archive (up to {@code maxCount}) is reported by {@code trie.contains}.
     * Verification stops at the first line that is not found.
     *
     * @param trie the filled trie to inspect
     * @param charCount total number of characters inserted, printed for
     *        comparison with the number of letters stored in the trie
     * @throws Exception if the archive cannot be opened or read
     */
    private static void investigate(PatriciaTrie trie, int charCount)
    throws Exception{
        System.out.println("-- count elements.");
        final AtomicInteger count = new AtomicInteger();
        trie.visit(new TrieVisitor() {
            @Override
            public void accept(Node node, int nest) {
                if(node.isTerminate()) count.incrementAndGet();
            }
        });
        System.out.println(count.intValue() + " elements.");

        System.out.println("-- list elements.");
        final AtomicInteger n = new AtomicInteger();
        final AtomicInteger l = new AtomicInteger();
        // ln is never incremented; it is kept so the "label node" output line stays.
        final AtomicInteger ln = new AtomicInteger();
        final AtomicInteger chars = new AtomicInteger();
        trie.visit(new TrieVisitor() {
            @Override
            public void accept(Node node, int nest) {
                if(node.isTerminate()){
                    l.incrementAndGet();
                } else{
                    n.incrementAndGet();
                }
                chars.addAndGet(node.getLetters().length);
            }
        });
        System.out.println("node: " + n.intValue());
        System.out.println("leaf: " + l.intValue());
        System.out.println("label node: " + ln.intValue());
        System.out.println("total char count: " + charCount);
        System.out.println("total char count in trie: " + chars.intValue());

        System.out.println("verifying trie...");
        long lap;
        // long rather than int: the per-call deltas added below are long values.
        long sum = 0;
        BufferedReader r = openTitles();
        try{
            lap = System.currentTimeMillis();
            int c = 0;
            String word = null;
            while((word = r.readLine()) != null){
                if(c == maxCount) break;
                long d = System.currentTimeMillis();
                boolean found = trie.contains(word);
                sum += System.currentTimeMillis() - d;
                if(!found){
                    System.out.println("trie not contains [" + word + "]");
                    break;
                }
                if(c % 100000 == 0){
                    System.out.println(c + " elements done.");
                }
                c++;
            }
        } finally{
            // Always release the file handle, including on the early breaks above.
            r.close();
        }
        System.out.println("done in " + (System.currentTimeMillis() - lap) + " millis.");
        System.out.println("contains time: " + sum + " millis.");
        System.out.println(trie.getRoot().getChildren().length + "children in root");

        // Background thread that sleeps for 100 seconds and then touches the
        // trie once. Presumably this keeps the trie reachable (and the JVM
        // alive) long enough to inspect memory with external tools -- TODO confirm.
        final PatriciaTrie t = trie;
        new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    Thread.sleep(100000);
                    t.contains("hello");
                } catch (InterruptedException e) {
                    // Nothing left to do but exit; restore the flag rather
                    // than silently dropping the interruption.
                    Thread.currentThread().interrupt();
                }
            }
        }).start();
    }
}