/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package opennlp.tools.eval; import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; import org.junit.Assert; import org.junit.Test; import opennlp.tools.chunker.ChunkSample; import opennlp.tools.chunker.ChunkSampleStream; import opennlp.tools.chunker.ChunkerEvaluator; import opennlp.tools.chunker.ChunkerFactory; import opennlp.tools.chunker.ChunkerME; import opennlp.tools.chunker.ChunkerModel; import opennlp.tools.util.MarkableFileInputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; import opennlp.tools.util.model.ModelUtil; /** * Evaluates the chunker against the English CONLL2000 corpus. * <p> * Download the train and eval gz files from the CONLL2000 shared task * <a href="http://www.cnts.ua.ac.be/conll2000/chunking/"> site </a> * and decompress them into this directory: $OPENNLP_DATA_DIR/conll00. */ public class Conll00ChunkerEval { private static ChunkerModel train(File trainFile, TrainingParameters params) throws IOException { ObjectStream<ChunkSample> samples = new ChunkSampleStream( new PlainTextByLineStream( new MarkableFileInputStreamFactory(trainFile), StandardCharsets.UTF_8)); return ChunkerME.train("en", samples, params, new ChunkerFactory()); } private static void eval(ChunkerModel model, File testData, double expectedFMeasure) throws IOException { ObjectStream<ChunkSample> samples = new ChunkSampleStream( new PlainTextByLineStream(new MarkableFileInputStreamFactory(testData), StandardCharsets.UTF_8)); ChunkerEvaluator evaluator = new ChunkerEvaluator(new ChunkerME(model)); evaluator.evaluate(samples); Assert.assertEquals(expectedFMeasure, evaluator.getFMeasure().getFMeasure(), 0.0001); } @Test public void evalEnglishPerceptron() throws IOException { ChunkerModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(), "conll00/train.txt"), EvalUtil.createPerceptronParams()); eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(), "conll00/test.txt"), 0.9295018353434714d); } @Test public void evalEnglishMaxentGis() throws IOException { ChunkerModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(), "conll00/train.txt"), ModelUtil.createDefaultTrainingParameters()); eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(), "conll00/test.txt"), 0.9239687473746113d); } // Note: Don't try to run this on your MacBook @Test public void evalEnglishMaxentQn() throws IOException { TrainingParameters params = EvalUtil.createMaxentQnParams(); params.put("Threads", 4); ChunkerModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(), "conll00/train.txt"), params); eval(maxentModel, new File(EvalUtil.getOpennlpDataDir(), "conll00/test.txt"), 0.9302599230947028d); } }