/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import opennlp.tools.formats.ConllXPOSSampleStream;
import opennlp.tools.postag.POSEvaluator;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.POSTaggerFactory;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
/**
* Evaluates the POS Tagger on the CONLL-X data. The CONLL-X data includes training and evaluation data for
* Danish, Dutch, Portuguese and Swedish.
* <p>
* The following files are needed in the data directory to run this test:<br>
* conllx/data/danish/ddt/train/danish_ddt_train.conll<br>
* conllx/data/danish/ddt/test/danish_ddt_test.conll<br>
* conllx/data/dutch/alpino/train/dutch_alpino_train.conll<br>
* conllx/data/dutch/alpino/test/dutch_alpino_test.conll<br>
* conllx/data/portuguese/bosque/treebank/portuguese_bosque_train.conll<br>
* conllx/data/portuguese/bosque/test/portuguese_bosque_test.conll<br>
* conllx/data/swedish/talbanken05/train/swedish_talbanken05_train.conll<br>
* conllx/data/swedish/talbanken05/test/swedish_talbanken05_test.conll<br>
* <p>
* The structure follows the structure of the CONLL-X data distribution. There is
* one package for each language, and an extra package containing the tests for all
* languages.
*/
public class ConllXPosTaggerEval {

  /** Absolute tolerance allowed between the expected and the measured word accuracy. */
  private static final double ACCURACY_DELTA = 0.0001;

  // Locations of the CONLL-X corpora, relative to the OpenNLP evaluation data directory.
  private static final String DANISH_TRAIN =
      "conllx/data/danish/ddt/train/danish_ddt_train.conll";
  private static final String DANISH_TEST =
      "conllx/data/danish/ddt/test/danish_ddt_test.conll";
  private static final String DUTCH_TRAIN =
      "conllx/data/dutch/alpino/train/dutch_alpino_train.conll";
  private static final String DUTCH_TEST =
      "conllx/data/dutch/alpino/test/dutch_alpino_test.conll";
  private static final String PORTUGUESE_TRAIN =
      "conllx/data/portuguese/bosque/treebank/portuguese_bosque_train.conll";
  private static final String PORTUGUESE_TEST =
      "conllx/data/portuguese/bosque/test/portuguese_bosque_test.conll";
  private static final String SWEDISH_TRAIN =
      "conllx/data/swedish/talbanken05/train/swedish_talbanken05_train.conll";
  private static final String SWEDISH_TEST =
      "conllx/data/swedish/talbanken05/test/swedish_talbanken05_test.conll";

  /**
   * Resolves a corpus path against the directory returned by
   * {@link EvalUtil#getOpennlpDataDir()}.
   *
   * @param relativePath path of the corpus file inside the data directory
   * @return the resolved file
   */
  private static File dataFile(String relativePath) {
    return new File(EvalUtil.getOpennlpDataDir(), relativePath);
  }

  /**
   * Trains a POS model from a CONLL-X file read as UTF-8.
   *
   * @param trainFile the CONLL-X training file
   * @param lang the language code passed to the trainer
   * @param params the training parameters
   * @return the trained model
   * @throws IOException if the training data cannot be read or closed
   */
  private static POSModel train(File trainFile, String lang,
      TrainingParameters params) throws IOException {

    // try-with-resources guarantees the underlying file handle is released,
    // even when training fails.
    try (ObjectStream<POSSample> samples = new ConllXPOSSampleStream(
        new MarkableFileInputStreamFactory(trainFile), StandardCharsets.UTF_8)) {
      return POSTaggerME.train(lang, samples, params, new POSTaggerFactory());
    }
  }

  /**
   * Tags a CONLL-X test file (read as UTF-8) with the given model and asserts
   * that the resulting word accuracy matches the expected value within
   * {@link #ACCURACY_DELTA}.
   *
   * @param model the model to evaluate
   * @param testData the CONLL-X test file
   * @param expectedAccuracy the word accuracy the model is expected to reach
   * @throws IOException if the test data cannot be read or closed
   */
  private static void eval(POSModel model, File testData,
      double expectedAccuracy) throws IOException {

    POSEvaluator evaluator = new POSEvaluator(new POSTaggerME(model));

    // The sample stream is closed as soon as the evaluation has consumed it.
    try (ObjectStream<POSSample> samples = new ConllXPOSSampleStream(
        new MarkableFileInputStreamFactory(testData), StandardCharsets.UTF_8)) {
      evaluator.evaluate(samples);
    }

    Assert.assertEquals(expectedAccuracy, evaluator.getWordAccuracy(), ACCURACY_DELTA);
  }

  /** Danish, trained with the default training parameters. */
  @Test
  public void evalDanishMaxentGis() throws IOException {
    TrainingParameters params = ModelUtil.createDefaultTrainingParameters();

    POSModel maxentModel = train(dataFile(DANISH_TRAIN), "da", params);

    eval(maxentModel, dataFile(DANISH_TEST), 0.9504442925495558d);
  }

  /** Danish, trained with the parameters from {@link EvalUtil#createMaxentQnParams()}. */
  @Test
  public void evalDanishMaxentQn() throws IOException {
    TrainingParameters params = EvalUtil.createMaxentQnParams();

    POSModel maxentModel = train(dataFile(DANISH_TRAIN), "da", params);

    eval(maxentModel, dataFile(DANISH_TEST), 0.9564251537935748d);
  }

  /** Dutch, trained with the default training parameters. */
  @Test
  public void evalDutchMaxentGis() throws IOException {
    TrainingParameters params = ModelUtil.createDefaultTrainingParameters();

    POSModel maxentModel = train(dataFile(DUTCH_TRAIN), "nl", params);

    eval(maxentModel, dataFile(DUTCH_TEST), 0.9213965980304387d);
  }

  /** Dutch, trained with the parameters from {@link EvalUtil#createMaxentQnParams()}. */
  // NOTE(review): this test is disabled without a recorded reason; confirm why
  // before re-enabling it.
  @Test
  @Ignore
  public void evalDutchMaxentQn() throws IOException {
    TrainingParameters params = EvalUtil.createMaxentQnParams();

    POSModel maxentModel = train(dataFile(DUTCH_TRAIN), "nl", params);

    eval(maxentModel, dataFile(DUTCH_TEST), 0.9282005371530886d);
  }

  /** Portuguese, trained with the default training parameters. */
  @Test
  public void evalPortugueseMaxentGis() throws IOException {
    TrainingParameters params = ModelUtil.createDefaultTrainingParameters();

    POSModel maxentModel = train(dataFile(PORTUGUESE_TRAIN), "pt", params);

    eval(maxentModel, dataFile(PORTUGUESE_TEST), 0.9671041418101244d);
  }

  /** Portuguese, trained with the parameters from {@link EvalUtil#createMaxentQnParams()}. */
  @Test
  public void evalPortugueseMaxentQn() throws IOException {
    TrainingParameters params = EvalUtil.createMaxentQnParams();

    POSModel maxentModel = train(dataFile(PORTUGUESE_TRAIN), "pt", params);

    eval(maxentModel, dataFile(PORTUGUESE_TEST), 0.9662519175046872d);
  }

  /** Swedish, trained with the default training parameters. */
  @Test
  public void evalSwedishMaxentGis() throws IOException {
    TrainingParameters params = ModelUtil.createDefaultTrainingParameters();

    // "sv" is the ISO 639-1 code for Swedish ("se" denotes Northern Sami).
    POSModel maxentModel = train(dataFile(SWEDISH_TRAIN), "sv", params);

    eval(maxentModel, dataFile(SWEDISH_TEST), 0.9248585572842999d);
  }

  /** Swedish, trained with the parameters from {@link EvalUtil#createMaxentQnParams()}. */
  @Test
  public void evalSwedishMaxentQn() throws IOException {
    TrainingParameters params = EvalUtil.createMaxentQnParams();

    // "sv" is the ISO 639-1 code for Swedish ("se" denotes Northern Sami).
    POSModel maxentModel = train(dataFile(SWEDISH_TRAIN), "sv", params);

    eval(maxentModel, dataFile(SWEDISH_TEST), 0.9322842998585573d);
  }
}