// Copyright 2015 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.test.tokenizer.opennlp;
import static org.junit.Assert.*;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import marmot.tokenize.RuleBasedTokenizer;
import marmot.tokenize.Tokenizer;
import marmot.tokenize.openlp.OpenNlpTokenizerTrainer;
import marmot.tokenize.rules.RuleProvider;
import org.junit.Test;
public class RuleBasedTokenizerTest {
private Tokenizer tokenizer_;
private BufferedReader examples_;
private String getResourceFile(String name) {
String source = "src"+File.separatorChar;
Package pack = getClass().getPackage();
String path = pack.getName().replace('.', File.separatorChar)+File.separatorChar;
return String.format("%s%s%s", source, path, name);
}
public RuleBasedTokenizerTest(){
OpenNlpTokenizerTrainer trainer = new OpenNlpTokenizerTrainer();
Tokenizer tokenizer;
try {
System.out.println(getResourceFile(""));
examples_ = new BufferedReader(new FileReader(getResourceFile("RBTT_examples.txt")));
tokenizer = trainer.train("data/es/open_nlp_style.txt");
} catch (IOException e) {
throw new RuntimeException(e);
}
RuleProvider provider = RuleProvider.createRuleProvider("es");
tokenizer_ = new RuleBasedTokenizer(tokenizer, provider);
}
@Test
public void testTokenizer(){
try {
String untok_1 = examples_.readLine();
List<String> result_1 = new ArrayList<String>();
for(String token : examples_.readLine().split(" ")) {
result_1.add(token);
}
List<String> prediction_1 = tokenizer_.tokenize(untok_1);
assertEquals(prediction_1, result_1);
String untok_2 = examples_.readLine();
List<String> result_2 = new ArrayList<String>();
for(String token : examples_.readLine().split(" ")) {
result_2.add(token);
}
List<String> prediction_2 = tokenizer_.tokenize(untok_2);
assertEquals(prediction_2, result_2);
String untok_3 = examples_.readLine();
List<String> result_3 = new ArrayList<String>();
for(String token : examples_.readLine().split(" ")) {
result_3.add(token);
}
List<String> prediction_3 = tokenizer_.tokenize(untok_3);
assertEquals(prediction_3, result_3);
String untok_4 = examples_.readLine();
List<String> result_4 = new ArrayList<String>();
for(String token : examples_.readLine().split(" ")) {
result_4.add(token);
}
List<String> prediction_4 = tokenizer_.tokenize(untok_4);
assertEquals(prediction_4, result_4);
examples_.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}