ArvoresDeitadasEval.java example

Explorer
opennlp-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.eval;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import org.junit.Assert;
import org.junit.Test;

import opennlp.tools.chunker.ChunkerCrossValidator;
import opennlp.tools.chunker.ChunkerFactory;
import opennlp.tools.formats.ad.ADChunkSampleStream;
import opennlp.tools.formats.ad.ADNameSampleStream;
import opennlp.tools.formats.ad.ADSentenceSampleStream;
import opennlp.tools.formats.convert.NameToTokenSampleStream;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.sentdetect.SDCrossValidator;
import opennlp.tools.sentdetect.SentenceDetectorFactory;
import opennlp.tools.sentdetect.lang.Factory;
import opennlp.tools.tokenize.DetokenizationDictionary;
import opennlp.tools.tokenize.DictionaryDetokenizer;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenizerCrossValidator;
import opennlp.tools.tokenize.TokenizerFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;

/**
 * Cross validation of Sentence Detector, Tokenizer and Chunker against the
 * Portuguese corpus.
 * <p>
 * Download the gz files from the Floresta Sintactica project <a
 * href="http://www.linguateca.pt/floresta/corpus.html"> site </a> and
 * decompress them into this directory: $OPENNLP_DATA_DIR/ad.
 * <ul>
 * <li><a href=
 * "http://www.linguateca.pt/floresta/ficheiros/gz/FlorestaVirgem_CF_3.0_ad.txt.gz"
 * > FlorestaVirgem_CF_3.0_ad.txt.gz </a></li>
 * <li><a href=
 * "http://www.linguateca.pt/floresta/ficheiros/gz/Bosque_CF_8.0.ad.txt.gz">
 * Bosque_CF_8.0.ad.txt.gz </a></li>
 * </ul>
 */
public class ArvoresDeitadasEval {

  /** Corpus used for the chunker evaluation, relative to the OpenNLP data directory. */
  private static final String BOSQUE = "ad/Bosque_CF_8.0.ad.txt";

  /**
   * Corpus used for the sentence detector and tokenizer evaluations, relative
   * to the OpenNLP data directory.
   */
  private static final String FLORESTA_VIRGEM = "ad/FlorestaVirgem_CF_3.0_ad.txt";

  /** Character encoding used to read the corpus files. */
  private static final String ENCODING = "ISO-8859-1";

  /** Language code passed to the factories and cross validators. */
  private static final String LANG = "pt";

  /**
   * Detokenizer dictionary used to turn name samples into token samples.
   * The path is relative, so it is resolved against the working directory
   * the tests are started from.
   */
  private static final String DETOKENIZER_DICT = "lang/pt/tokenizer/pt-detokenizer.xml";

  /** Number of cross-validation folds used by every evaluation. */
  private static final int FOLDS = 10;

  /** Absolute tolerance when comparing a measured F-measure with its expected value. */
  private static final double DELTA = 0.0001d;

  /**
   * Opens a corpus file as a stream of text lines.
   *
   * @param corpus path of the corpus file, relative to
   *               {@code EvalUtil.getOpennlpDataDir()}
   * @return a line stream over the corpus decoded with {@link #ENCODING};
   *         the caller is responsible for closing it
   * @throws IOException if the stream cannot be created
   */
  private static ObjectStream<String> getLineSample(String corpus)
      throws IOException {
    return new PlainTextByLineStream(new MarkableFileInputStreamFactory(
        new File(EvalUtil.getOpennlpDataDir(), corpus)), ENCODING);
  }

  /**
   * Cross validates a sentence detector on the Floresta Virgem corpus and
   * asserts the resulting F-measure.
   *
   * @param params        training parameters selecting the trainer to evaluate
   * @param expectedScore expected F-measure, compared within {@link #DELTA}
   * @throws IOException if the corpus cannot be read
   */
  private static void sentenceCrossEval(TrainingParameters params,
                                        double expectedScore) throws IOException {

    // try-with-resources so the corpus file is released even if training fails
    try (ADSentenceSampleStream samples = new ADSentenceSampleStream(
        getLineSample(FLORESTA_VIRGEM), false)) {

      SDCrossValidator cv = new SDCrossValidator(LANG, params,
          new SentenceDetectorFactory(LANG, true, null,
              new Factory().getEOSCharacters(LANG)));

      cv.evaluate(samples, FOLDS);

      System.out.println(cv.getFMeasure());
      Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), DELTA);
    }
  }

  /**
   * Cross validates a tokenizer on the Floresta Virgem corpus and asserts the
   * resulting F-measure. Token samples are derived from the corpus name
   * samples with a dictionary detokenizer.
   *
   * @param params        training parameters selecting the trainer to evaluate
   * @param expectedScore expected F-measure, compared within {@link #DELTA}
   * @throws IOException if the corpus or the detokenizer dictionary cannot be read
   */
  private static void tokenizerCrossEval(TrainingParameters params,
                                         double expectedScore) throws IOException {

    // Load the dictionary first and close its file right away; the stream is
    // only needed while the dictionary is being constructed.
    DictionaryDetokenizer detokenizer;
    try (FileInputStream dictIn = new FileInputStream(new File(DETOKENIZER_DICT))) {
      detokenizer = new DictionaryDetokenizer(new DetokenizationDictionary(dictIn));
    }

    // Both streams are declared as resources so the corpus file is released
    // even if wrapping the name samples fails.
    try (ObjectStream<NameSample> nameSamples = new ADNameSampleStream(
             getLineSample(FLORESTA_VIRGEM), true);
         ObjectStream<TokenSample> samples = new NameToTokenSampleStream(
             detokenizer, nameSamples)) {

      TokenizerFactory tokFactory = TokenizerFactory.create(null, LANG, null,
          true, null);
      TokenizerCrossValidator validator = new TokenizerCrossValidator(params,
          tokFactory);

      validator.evaluate(samples, FOLDS);

      System.out.println(validator.getFMeasure());
      Assert.assertEquals(expectedScore, validator.getFMeasure().getFMeasure(),
          DELTA);
    }
  }

  /**
   * Cross validates a chunker on the Bosque corpus and asserts the resulting
   * F-measure.
   *
   * @param params        training parameters selecting the trainer to evaluate
   * @param expectedScore expected F-measure, compared within {@link #DELTA}
   * @throws IOException if the corpus cannot be read
   */
  private static void chunkerCrossEval(TrainingParameters params,
                                       double expectedScore) throws IOException {

    // try-with-resources so the corpus file is released even if training fails
    try (ADChunkSampleStream samples = new ADChunkSampleStream(getLineSample(BOSQUE))) {

      ChunkerCrossValidator cv = new ChunkerCrossValidator(LANG, params,
          new ChunkerFactory());

      cv.evaluate(samples, FOLDS);
      Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), DELTA);
    }
  }

  // ---- Sentence detector ----

  @Test
  public void evalPortugueseSentenceDetectorPerceptron() throws IOException {
    sentenceCrossEval(EvalUtil.createPerceptronParams(), 0.9892778840089301d);
  }

  @Test
  public void evalPortugueseSentenceDetectorGis() throws IOException {
    sentenceCrossEval(ModelUtil.createDefaultTrainingParameters(), 0.987270070655111d);
  }

  @Test
  public void evalPortugueseSentenceDetectorMaxentQn() throws IOException {
    sentenceCrossEval(EvalUtil.createMaxentQnParams(), 0.99261110833375d);
  }

  @Test
  public void evalPortugueseSentenceDetectorNaiveBayes() throws IOException {
    sentenceCrossEval(EvalUtil.createNaiveBayesParams(), 0.9672196206048099d);
  }

  // ---- Tokenizer ----

  @Test
  public void evalPortugueseTokenizerPerceptron() throws IOException {
    tokenizerCrossEval(EvalUtil.createPerceptronParams(), 0.9994887308380267d);
  }

  @Test
  public void evalPortugueseTokenizerGis() throws IOException {
    tokenizerCrossEval(ModelUtil.createDefaultTrainingParameters(), 0.9992539405481062d);
  }

  @Test
  public void evalPortugueseTokenizerMaxentQn() throws IOException {
    tokenizerCrossEval(EvalUtil.createMaxentQnParams(), 0.9996017148748251d);
  }

  @Test
  public void evalPortugueseTokenizerNaiveBayes() throws IOException {
    tokenizerCrossEval(EvalUtil.createNaiveBayesParams(), 0.9962358244502717d);
  }

  @Test
  public void evalPortugueseTokenizerMaxentQnMultipleThreads() throws IOException {
    TrainingParameters params = EvalUtil.createMaxentQnParams();
    params.put("Threads", 4);
    // Same expected score as the single-threaded MaxentQn tokenizer run.
    tokenizerCrossEval(params, 0.9996017148748251d);
  }

  // ---- Chunker ----

  @Test
  public void evalPortugueseChunkerPerceptron() throws IOException {
    chunkerCrossEval(EvalUtil.createPerceptronParams(),
        0.9638122825015589d);
  }

  @Test
  public void evalPortugueseChunkerGis() throws IOException {
    chunkerCrossEval(ModelUtil.createDefaultTrainingParameters(),
        0.9573860781121228d);
  }

  @Test
  public void evalPortugueseChunkerGisMultipleThreads() throws IOException {
    TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
    params.put("Threads", 4);
    // Same expected score as the single-threaded GIS chunker run.
    chunkerCrossEval(params, 0.9573860781121228d);
  }

  @Test
  public void evalPortugueseChunkerQn() throws IOException {
    chunkerCrossEval(EvalUtil.createMaxentQnParams(),
        0.9652111035230788d);
  }

  @Test
  public void evalPortugueseChunkerQnMultipleThreads() throws IOException {
    TrainingParameters params = EvalUtil.createMaxentQnParams();
    params.put("Threads", 4);

    // NOTE: Should be the same as without multiple threads (0.9652111035230788,
    // see evalPortugueseChunkerQn), but the pinned multi-threaded score differs.
    // The value is kept as-is until that discrepancy is resolved.
    chunkerCrossEval(params, 0.9647304571382662d);
  }

  @Test
  public void evalPortugueseChunkerNaiveBayes() throws IOException {
    chunkerCrossEval(EvalUtil.createNaiveBayesParams(), 0.9041507736043933d);
  }
}