package edu.stanford.nlp.parser.lexparser;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import edu.stanford.nlp.io.IOUtils;
import junit.framework.TestCase;
/** Test that the parser does obey character encodings.
 *  <p>
 *  Each test writes a one-word input file in a given encoding, runs
 *  {@code LexicalizedParser.main} on it with the matching {@code -encoding}
 *  flag while {@code System.out} is redirected to a temporary file, and then
 *  compares the bytes captured in that file with a hand-encoded expected tree.
 *
 *  @author Christopher Manning
 */
public class LexicalizedParserCharacterEncodingITest extends TestCase {

  /** The word to parse ("caf" + U+00E9). Written as a Unicode escape so that the
   *  value does not depend on the encoding used to compile this source file.
   */
  private static final String input = "caf\u00e9";

  /** The bytes of "(ROOT\n  (NP (NNP caf" + e-acute + ")))\n" with e-acute as UTF-8 (0xc3 0xa9). */
  private static final byte[] utf8Bytes = { 0x28, 0x52, 0x4f, 0x4f, 0x54, 0x0a,
    0x20, 0x20, 0x28, 0x4e, 0x50, 0x20, 0x28, 0x4e, 0x4e, 0x50, 0x20, 0x63, 0x61, 0x66, (byte) (0xc3 - 256), (byte) (0xa9 - 256), 0x29, 0x29, 0x29, 0x0a,
  };

  /** The same tree text with e-acute as the single ISO-8859-1 byte 0xe9. */
  private static final byte[] iso8859Bytes = { 0x28, 0x52, 0x4f, 0x4f, 0x54, 0x0a,
    0x20, 0x20, 0x28, 0x4e, 0x50, 0x20, 0x28, 0x4e, 0x4e, 0x50, 0x20, 0x63, 0x61, 0x66, (byte) (0xe9 - 256), 0x29, 0x29, 0x29, 0x0a,
  };

  /** The same tree text with e-acute as the two GB18030 bytes 0xa8 0xa6. */
  private static final byte[] gb18030Bytes = { 0x28, 0x52, 0x4f, 0x4f, 0x54, 0x0a,
    0x20, 0x20, 0x28, 0x4e, 0x50, 0x20, 0x28, 0x4e, 0x4e, 0x50, 0x20, 0x63, 0x61, 0x66, (byte) (0xa8 - 256), (byte) (0xa6 - 256), 0x29, 0x29, 0x29, 0x0a,
  };

  public void testCharEncodingUtf8() throws IOException {
    tryCharEncoding("utf-8", utf8Bytes);
  }

  public void testCharEncodingIso8859() throws IOException {
    tryCharEncoding("iso-8859-1", iso8859Bytes);
  }

  public void testCharEncodingGB18030() throws IOException {
    tryCharEncoding("gb18030", gb18030Bytes);
  }

  /** Runs the parser on {@link #input} written in the given encoding and checks
   *  the bytes it prints to {@code System.out}.
   *  <p>
   *  {@code System.out} is redirected to a temporary file only for the duration
   *  of the parser run and is restored afterwards, even if the parser throws.
   *
   *  @param encoding The character encoding name used both to write the input
   *                  file and as the parser's {@code -encoding} argument
   *  @param expected The bytes the captured output is compared against
   *  @throws IOException If a temporary file cannot be created, written or read
   */
  private static void tryCharEncoding(String encoding, byte[] expected) throws IOException {
    byte[] contents = new byte[128]; // Make big enough for something reasonable!

    // Write the input word to a temporary file in the encoding under test.
    File tmpInput = File.createTempFile("parser", null);
    // tmpInput.deleteOnExit();
    PrintWriter pw = IOUtils.getPrintWriter(tmpInput, encoding);
    try {
      pw.println(input);
    } finally {
      pw.close();
    }

    File tmpFile = File.createTempFile("parser", null);
    System.err.println("Sending output to " + tmpFile.getCanonicalPath());
    // tmpFile.deleteOnExit();

    // Capture everything the parser prints on System.out, then put the real
    // System.out back so that later tests do not write to a closed stream.
    PrintStream originalOut = System.out;
    PrintStream ps = new PrintStream(tmpFile);
    try {
      System.setOut(ps);
      // The encoding is passed on the command line so the parser reads and writes with it.
      LexicalizedParser.main(new String[]{"-encoding", encoding, "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz", tmpInput.getCanonicalPath()});
    } finally {
      System.setOut(originalOut);
      ps.close();
    }

    // Read back (at most contents.length bytes of) the captured output.
    int offset = 0;
    InputStream is = new FileInputStream(tmpFile);
    try {
      int numRead;
      do {
        int length = contents.length - offset;
        numRead = is.read(contents, offset, length);
        // NOTE(review): at end of stream read() returns -1 and that is added here too,
        // so offset ends up one less than the number of bytes actually read (unless the
        // buffer filled up first, in which case read() returns 0). The expected arrays
        // may be calibrated against this count - confirm before changing the arithmetic.
        offset += numRead;
      } while (numRead > 0);
    } finally {
      is.close();
    }

    for (int i = 0; i < Math.min(expected.length, offset); i++) {
      assertEquals("Byte " + i + " should be " + expected[i] + " but was " + contents[i] + ".",
              expected[i], contents[i]);
    }
    // offset is -1 when the output file is empty; skip the diagnostic then so that the
    // length assertion below reports the failure instead of an index-out-of-bounds error.
    if (offset >= 0 && expected.length > offset) {
      System.err.println("First non-received byte was " + expected[offset]);
    }
    if (expected.length < offset) {
      System.err.println("First wrongly received byte was " + contents[expected.length]);
    }
    assertEquals("Was expecting " + expected.length + " bytes but got " + offset + " bytes.",
            expected.length, offset);
  }

}