BuildLexicalizedParserITest.java example

Explorer
CoreNLP-master
// BuildLexicalizedParserITest
// Copyright (c) 2002-2010 Leland Stanford Junior University

//This program is free software; you can redistribute it and/or
//modify it under the terms of the GNU General Public License
//as published by the Free Software Foundation; either version 2
//of the License, or (at your option) any later version.

//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU General Public License for more details.

//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

//For more information, bug reports, fixes, contact:
//Christopher Manning
//Dept of Computer Science, Gates 1A
//Stanford CA 94305-9010
//USA
//Support/Questions: java-nlp-user@lists.stanford.edu
//Licensing: java-nlp-support@lists.stanford.edu
//http://www-nlp.stanford.edu/software/lex-parser.shtml


package edu.stanford.nlp.parser.lexparser;

import junit.framework.TestCase;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Formatter;

import edu.stanford.nlp.io.TeeStream;
import edu.stanford.nlp.util.StringUtils;

/**
 * This builds and tests several English parsers on one or a few input
 * trees each.  The goal is to make sure there are no crashes when
 * executing the normally used training paths.
 *
 * @author John Bauer
 */
public class BuildLexicalizedParserITest extends TestCase {


  // This is the example command line run by the
  // makeSerializedParser.csh script:
  //
  //String commandLine = "-evals \"factDA,tsv\" -goodPCFG -saveToSerializedFile wsjPCFG.ser.gz -saveToTextFile wsjPCFG.txt -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219";

  // We build a similar command line using temporary files for the
  // parser and the output files
  /**
   * Training command-line templates for English, one treebank each.
   * Every template has four {@code %s} placeholders which
   * {@code ParserTestCase.buildOneTreebankTestCase} fills, in order,
   * with: the serialized parser file, the text parser file, the
   * training treebank, and the test treebank (the training path is
   * passed again for the last one).
   */
  public static final String[] englishCommandLines = {"-evals factDA,tsv -goodPCFG -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -testTreebank %s",
                                                      "-evals factDA,tsv -goodPCFG -noTagSplit -saveToSerializedFile %s -saveToTextFile %s -compactGrammar 0 -maxLength 40 -train %s -testTreebank %s",
                                                      "-evals factDA,tsv -ijcai03 -v -printStates -compactGrammar 0 -correctTags -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -testTreebank %s",
                                                      "-evals factDA,tsv -ijcai03 -v -printStates -compactGrammar 0 -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -testTreebank %s"};

  /**
   * Training command-line templates for English that use a second
   * treebank.  Five {@code %s} placeholders, filled by
   * {@code ParserTestCase.buildTwoTreebankTestCase} with: serialized
   * parser file, text parser file, training treebank, secondary
   * treebank, and test treebank (again the training path).
   * NOTE(review): the literal "0-9 0.5" after -train2 is presumably a
   * file range and a weight for the secondary treebank -- confirm
   * against the LexicalizedParser option documentation.
   */
  public static final String[] englishTwoTreebanks = {"-evals factDA,tsv -ijcai03 -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -train2 %s 0-9 0.5 -testTreebank %s",
                                                      "-evals factDA,tsv -goodPCFG -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -train2 %s 0-9 0.5 -testTreebank %s"};

  /**
   * Training command-line templates for Chinese; same four
   * placeholders as the English one-treebank templates.
   * NOTE(review): the first and fourth entries are identical strings,
   * so that configuration is trained twice.  Also, the second entry
   * spells the flag "-scTags" while the last spells it "-sctags" --
   * verify that the option parser treats these the same.
   */
  public static final String[] chineseCommandLines = {"-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chinesePCFG -encoding utf-8 -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -test %s",
                                                      "-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -acl03chinese -encoding utf-8 -scTags -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -test %s",
                                                      "-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -encoding utf-8 -chineseFactored -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -test %s",
                                                      "-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chinesePCFG -encoding utf-8 -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -test %s",
                                                      "-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -encoding utf-8 -segmentMarkov -saveToSerializedFile %s -saveToTextFile %s -train %s -test %s -sctags -acl03chinese"};

  /**
   * Training command-line templates for German (Negra parser params);
   * same four placeholders as the English one-treebank templates.
   */
  public static final String[] germanCommandLines = {"-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -encoding UTF-8 -hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 2 -maxLength 40 -nodeCleanup 2 -saveToSerializedFile %s -saveToTextFile %s -train %s -test %s",
                                                     "-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -encoding UTF-8 -PCFG -hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 1 -maxLength 40 -nodeCleanup 2 -saveToSerializedFile %s -saveToTextFile %s -train %s -test %s"};

  /**
   * Training command-line template for French; same four placeholders
   * as the English one-treebank templates.
   */
  public static final String[] frenchCommandLines = {"-evals factDA,tsv -maxLength 40 -tLPP edu.stanford.nlp.parser.lexparser.FrenchTreebankParserParams -encoding UTF-8 -frenchFactored -saveToSerializedFile %s -saveToTextFile %s -train %s -test %s"};

  /**
   * Training command-line template for Arabic; same four placeholders
   * as the English one-treebank templates.
   */
  public static final String[] arabicCommandLines = {"-evals factDA,tsv -maxLength 40 -tLPP edu.stanford.nlp.parser.lexparser.ArabicTreebankParserParams -encoding UTF-8 -arabicFactored -saveToSerializedFile %s -saveToTextFile %s -train %s -test %s"};

  // TODO: weird: this parser is not saved anywhere
  //LexicalizedParser invoked with arguments: -evals factDA,tsv -acl03pcfg -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219

  /**
   * Template used by buildAndTest to reload the serialized parser and
   * run it on a test treebank.  Placeholders, in order: the serialized
   * parser file and the test treebank path.
   */
  public static final String baseTestSerCommandLine = "-encoding utf-8 -loadFromSerializedFile %s -testTreebank %s 0-1";
  /**
   * Template used by buildAndTest to reload the text-format parser and
   * run it on a test treebank.  Placeholders, in order: the text
   * parser file and the test treebank path.
   */
  public static final String baseTestTextCommandLine = "-encoding utf-8 -loadFromTextFile %s -testTreebank %s 0-1";
  // public static final String PERF_EVAL = "factor LP/LR summary evalb: LP: ";


  // Treebank fixtures.  All paths are relative; presumably they are
  // resolved against the directory the tests are launched from --
  // TODO confirm against the build configuration.

  // English: training tree, secondary training tree (used only by the
  // two-treebank templates), and the file passed as the test treebank
  // when reloading the saved parsers.
  public static final String englishOneTree = "projects/core/data/edu/stanford/nlp/parser/trees/en-onetree.txt";
  public static final String englishSecondTree = "projects/core/data/edu/stanford/nlp/parser/trees/en-secondtree.txt";
  public static final String englishThreeTrees = "projects/core/data/edu/stanford/nlp/parser/trees/en-threetrees.txt";

  // Chinese: training tree and test treebank.
  public static final String chineseOneTree = "projects/core/data/edu/stanford/nlp/parser/trees/zh-onetree.txt";
  public static final String chineseThreeTrees = "projects/core/data/edu/stanford/nlp/parser/trees/zh-threetrees.txt";

  // German: training tree and test treebank.  Note the file names use
  // "sent"/"sents" rather than "tree"/"trees" like the other languages.
  public static final String germanOneTree = "projects/core/data/edu/stanford/nlp/parser/trees/de-onesent.txt";
  public static final String germanThreeTrees = "projects/core/data/edu/stanford/nlp/parser/trees/de-threesents.txt";

  // French: training tree and test treebank.
  public static final String frenchOneTree = "projects/core/data/edu/stanford/nlp/parser/trees/fr-onetree.txt";
  public static final String frenchThreeTrees = "projects/core/data/edu/stanford/nlp/parser/trees/fr-threetrees.txt";

  // Arabic: training tree and test treebank.
  public static final String arabicOneTree = "projects/core/data/edu/stanford/nlp/parser/trees/ar-onetree.txt";
  public static final String arabicThreeTrees = "projects/core/data/edu/stanford/nlp/parser/trees/ar-threetrees.txt";

  public static class ParserTestCase {
    public final String[] trainCommandLine;
    public final String testPath;
    public final File parserFile;
    public final File textFile;

    ParserTestCase(String[] trainCommandLine, String testPath,
                   File parserFile, File textFile) {
      this.trainCommandLine = trainCommandLine;
      this.testPath = testPath;
      this.parserFile = parserFile;
      this.textFile = textFile;
    }

    public static ParserTestCase
      buildOneTreebankTestCase(String baseCommandLine,
                               String trainPath,
                               String testPath)
      throws IOException
    {
      File parserFile = File.createTempFile("parser", ".ser.gz");
      File textFile = File.createTempFile("parser", ".txt");

      Formatter commandLineFormatter = new Formatter();
      // Note that we test on the train path.  The goal is we should
      // get 100% accuracy if everything worked.  We will test on
      // unknown trees in the next step.
      commandLineFormatter.format(baseCommandLine, parserFile.getPath(),
                                  textFile.getPath(), trainPath, trainPath);
      String[] trainCommandLine =
        commandLineFormatter.toString().split("\\s+");

      ParserTestCase test = new ParserTestCase(trainCommandLine, testPath,
                                               parserFile, textFile);
      return test;
    }

    public static ParserTestCase
      buildTwoTreebankTestCase(String baseCommandLine,
                               String trainPath,
                               String secondaryPath,
                               String testPath)
      throws IOException
    {
      File parserFile = File.createTempFile("parser", ".ser.gz");
      File textFile = File.createTempFile("parser", ".txt");

      Formatter commandLineFormatter = new Formatter();
      // Note that we test on the train path.  The goal is we should
      // get 100% accuracy if everything worked.  We will test on
      // unknown trees in the next step.
      commandLineFormatter.format(baseCommandLine, parserFile.getPath(),
                                  textFile.getPath(), trainPath,
                                  secondaryPath, trainPath);
      String[] trainCommandLine =
        commandLineFormatter.toString().split("\\s+");

      ParserTestCase test = new ParserTestCase(trainCommandLine, testPath,
                                               parserFile, textFile);
      return test;
    }
  }

  /**
   * Trains a parser with the test case's command line, checks the
   * reported training-set score, then reloads both the serialized and
   * the text model and runs each on the test case's test treebank.
   * <br>
   * While the parser runs, System.out and System.err are replaced with
   * streams that also copy everything into an in-memory buffer so the
   * score line can be inspected.  The original streams are always
   * restored, even if training throws or an assertion fails, so a
   * failure here does not leave later tests writing into stale tees.
   *
   * @param test the training command line, test treebank and output
   *   files to use
   * @throws IOException declared for callers; see LexicalizedParser.main
   */
  static public void buildAndTest(ParserTestCase test)
    throws IOException
  {
    PrintStream originalOut = System.out;
    PrintStream originalErr = System.err;

    System.out.println("Training:");
    System.out.println(StringUtils.join(test.trainCommandLine));

    ByteArrayOutputStream savedOutput = new ByteArrayOutputStream();
    TeeStream teeOut = new TeeStream(savedOutput, System.out);
    PrintStream teeOutPS = new PrintStream(teeOut);
    TeeStream teeErr = new TeeStream(savedOutput, System.err);
    PrintStream teeErrPS = new PrintStream(teeErr);
    System.setOut(teeOutPS);
    System.setErr(teeErrPS);

    try {
      LexicalizedParser.main(test.trainCommandLine);

      // Everything must reach savedOutput before it is inspected.
      teeOutPS.flush();
      teeErrPS.flush();
      teeOut.flush();
      teeErr.flush();

      String[] outputLines =
        savedOutput.toString().split("(?:\\n|\\r)+");
      // The score line is expected five lines from the end of the
      // captured output; fail with a readable message rather than an
      // ArrayIndexOutOfBoundsException if the output is too short.
      assertTrue("Expected at least 5 lines of training output but captured " + outputLines.length,
                 outputLines.length >= 5);
      String perfLine = outputLines[outputLines.length - 5];
      System.out.println(perfLine);
      assertEquals("factor LP/LR summary evalb: LP: 100.0 LR: 100.0 F1: 100.0 Exact: 100.0 N: 1", perfLine.trim());

      runSavedParser(baseTestSerCommandLine, test.parserFile, test.testPath);
      runSavedParser(baseTestTextCommandLine, test.textFile, test.testPath);
    } finally {
      teeOutPS.flush();
      teeErrPS.flush();
      teeOut.flush();
      teeErr.flush();

      // The tee streams are deliberately not closed here.
      // NOTE(review): closing a TeeStream presumably closes the real
      // System.out / System.err it wraps -- confirm before adding close().
      System.setOut(originalOut);
      System.setErr(originalErr);
    }
  }

  /**
   * Fills in a load-and-test command-line template with the given
   * model file and test treebank, echoes it, and runs the parser.
   */
  private static void runSavedParser(String baseCommandLine,
                                     File modelFile, String testPath)
    throws IOException
  {
    String[] testCommandLine =
      String.format(baseCommandLine, modelFile.getPath(), testPath).split("\\s+");

    System.out.println("Testing:");
    System.out.println(StringUtils.join(testCommandLine));

    LexicalizedParser.main(testCommandLine);
  }

  /**
   * Runs every English training configuration through buildAndTest:
   * first the one-treebank command lines, then the command lines that
   * also take a secondary training treebank.
   */
  public void testBuildEnglishParser()
    throws IOException
  {
    for (int i = 0; i < englishCommandLines.length; ++i) {
      buildAndTest(ParserTestCase.buildOneTreebankTestCase(englishCommandLines[i], englishOneTree, englishThreeTrees));
    }

    for (int i = 0; i < englishTwoTreebanks.length; ++i) {
      buildAndTest(ParserTestCase.buildTwoTreebankTestCase(englishTwoTreebanks[i], englishOneTree, englishSecondTree, englishThreeTrees));
    }
  }

  /**
   * Runs every Chinese training configuration through buildAndTest,
   * training on the one-tree file and reloading on the three-tree file.
   */
  public void testBuildChineseParser()
    throws IOException
  {
    for (int i = 0; i < chineseCommandLines.length; ++i) {
      buildAndTest(ParserTestCase.buildOneTreebankTestCase(chineseCommandLines[i], chineseOneTree, chineseThreeTrees));
    }
  }

  /**
   * Runs every German training configuration through buildAndTest,
   * training on the one-sentence file and reloading on the
   * three-sentence file.
   */
  public void testBuildGermanParser()
    throws IOException
  {
    for (int i = 0; i < germanCommandLines.length; ++i) {
      buildAndTest(ParserTestCase.buildOneTreebankTestCase(germanCommandLines[i], germanOneTree, germanThreeTrees));
    }
  }

  /**
   * Runs every French training configuration through buildAndTest,
   * training on the one-tree file and reloading on the three-tree file.
   */
  public void testBuildFrenchParser()
    throws IOException
  {
    for (int i = 0; i < frenchCommandLines.length; ++i) {
      buildAndTest(ParserTestCase.buildOneTreebankTestCase(frenchCommandLines[i], frenchOneTree, frenchThreeTrees));
    }
  }

  /**
   * Runs every Arabic training configuration through buildAndTest,
   * training on the one-tree file and reloading on the three-tree file.
   */
  public void testBuildArabicParser()
    throws IOException
  {
    for (int i = 0; i < arabicCommandLines.length; ++i) {
      buildAndTest(ParserTestCase.buildOneTreebankTestCase(arabicCommandLines[i], arabicOneTree, arabicThreeTrees));
    }
  }



}