// LexicalizedParserCharacterEncodingITest.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.parser.lexparser;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.PrintWriter;

import edu.stanford.nlp.io.IOUtils;
import junit.framework.TestCase;

/** Test that the parser does obey character encodings.
 *  <p>
 *  Each test writes the one-word sentence {@code café} to a temporary file in
 *  the encoding under test, runs {@link LexicalizedParser#main} on that file
 *  with a matching {@code -encoding} flag while {@code System.out} is
 *  redirected to a second temporary file, and then compares the raw bytes of
 *  that second file against the bytes expected for that encoding.
 *
 *  @author Christopher Manning
 */
public class LexicalizedParserCharacterEncodingITest extends TestCase {

  /** The sentence that is written to the parser's input file in every test. */
  private static final String input = "café";

  /** The text "(ROOT\n  (NP (NNP café)))\n" with é as the two bytes 0xC3 0xA9. */
  private static final byte[] utf8Bytes = { 0x28, 0x52, 0x4f, 0x4f, 0x54, 0x0a,
          0x20, 0x20,  0x28, 0x4e, 0x50, 0x20, 0x28, 0x4e, 0x4e, 0x50, 0x20, 0x63, 0x61, 0x66, (byte) 0xc3, (byte) 0xa9, 0x29, 0x29, 0x29, 0x0a,
  };

  /** The text "(ROOT\n  (NP (NNP café)))\n" with é as the single byte 0xE9. */
  private static final byte[] iso8859Bytes = { 0x28, 0x52, 0x4f, 0x4f, 0x54, 0x0a,
          0x20, 0x20,  0x28, 0x4e, 0x50, 0x20, 0x28, 0x4e, 0x4e, 0x50, 0x20, 0x63, 0x61, 0x66, (byte) 0xe9, 0x29, 0x29, 0x29, 0x0a,
  };

  /** The text "(ROOT\n  (NP (NNP café)))\n" with é as the two bytes 0xA8 0xA6. */
  private static final byte[] gb18030Bytes = { 0x28, 0x52, 0x4f, 0x4f, 0x54, 0x0a,
          0x20, 0x20,  0x28, 0x4e, 0x50, 0x20, 0x28, 0x4e, 0x4e, 0x50, 0x20, 0x63, 0x61, 0x66, (byte) 0xa8, (byte) 0xa6, 0x29, 0x29, 0x29, 0x0a,
  };


  public void testCharEncodingUtf8() throws IOException {
    tryCharEncoding("utf-8", utf8Bytes);
  }

  public void testCharEncodingIso8859() throws IOException {
    tryCharEncoding("iso-8859-1", iso8859Bytes);
  }

  public void testCharEncodingGB18030() throws IOException {
    tryCharEncoding("gb18030", gb18030Bytes);
  }

  /**
   * Runs the parser on {@link #input} written in {@code encoding} and asserts
   * that the bytes it prints to {@code System.out} match {@code expected}.
   * <p>
   * {@code System.out} is redirected to a temporary file only for the duration
   * of the parser run and is always restored afterwards, so later tests in the
   * same JVM keep a usable standard output.
   *
   * @param encoding charset name used both to write the input file and as the
   *                 value of the parser's {@code -encoding} option
   * @param expected the exact bytes the captured output is compared against
   * @throws IOException if a temporary file cannot be created, written or read
   */
  private static void tryCharEncoding(String encoding, byte[] expected) throws IOException {
    byte[] contents = new byte[128]; // Make big enough for something reasonable!
    File tmpInput = File.createTempFile("parser", null);
    // tmpInput.deleteOnExit();
    PrintWriter pw = IOUtils.getPrintWriter(tmpInput, encoding);
    try {
      pw.println(input);
    } finally {
      pw.close();
    }

    File tmpFile = File.createTempFile("parser", null);
    System.err.println("Sending output to " + tmpFile.getCanonicalPath());
    // tmpFile.deleteOnExit();

    // System.out is JVM-wide state: remember it so it can be put back even if the parser throws.
    PrintStream originalOut = System.out;
    PrintStream ps = new PrintStream(tmpFile);
    try {
      System.setOut(ps);
      // The encoding is passed on the command line so the parser reads and writes with it.
      LexicalizedParser.main(new String[]{"-encoding", encoding, "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz", tmpInput.getCanonicalPath()});
    } finally {
      System.setOut(originalOut);
      ps.close();
    }

    int offset = 0;
    InputStream is = new FileInputStream(tmpFile);
    try {
      int numRead;
      do {
        int length = contents.length - offset;
        numRead = is.read(contents, offset, length);
        // NOTE(review): at end of file read() returns -1 and that value is still added here,
        // so when the output fits in the buffer the final offset is one less than the number
        // of bytes read. The expected arrays may rely on this (e.g. if the parser emits one
        // extra trailing byte) - confirm against real parser output before changing it.
        offset += numRead;
      } while (numRead > 0);
    } finally {
      is.close();
    }

    int comparable = Math.min(expected.length, offset);
    for (int i = 0; i < comparable; i++) {
      assertEquals("Byte " + i + " should be " + expected[i] + " but was " + contents[i] + ".",
              expected[i], contents[i]);
    }
    // offset is -1 when the output file was empty; skip the diagnostic then so that the
    // length assertion below reports the failure instead of an index error.
    if (offset >= 0 && expected.length > offset) {
      System.err.println("First non-received byte was " + expected[offset]);
    }
    if (expected.length < offset) {
      System.err.println("First wrongly received byte was " + contents[expected.length]);
    }
    assertEquals("Was expecting " + expected.length + " bytes but got " + offset + " bytes.",
            expected.length, offset);
  }

}