/**
*
*/
package org.archive.wayback.replay.charset;
import java.io.IOException;
import junit.framework.TestCase;
import org.archive.io.warc.TestWARCReader;
import org.archive.io.warc.TestWARCRecordInfo;
import org.archive.io.warc.WARCRecord;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.resourcestore.resourcefile.WarcResource;
/**
 * Tests for {@link RotatingCharsetDetector} and {@link ContentTypeHeaderSniffer}.
 * <p>Each test wraps a payload in a test WARC record (see
 * {@link #createResource(String, String, String)}) and checks the charset name
 * reported for it.</p>
 */
public class RotatingCharsetDetectorTest extends TestCase {

    /**
     * Creates a resource with content type {@code text/html}.
     * @param payload text of the response body
     * @param encoding name of the charset used to encode {@code payload}
     * @return resource whose headers have been parsed
     * @throws IOException if {@code encoding} is not supported, or building
     *         the resource fails
     */
    protected WarcResource createResource(String payload, String encoding) throws IOException {
        return createResource("text/html", payload, encoding);
    }

    /**
     * Creates a resource from a test WARC record holding {@code payload}.
     * @param contentType value passed to
     *        {@link TestWARCRecordInfo#createHttpResponse} as content type
     * @param payload text of the response body
     * @param encoding name of the charset used to encode {@code payload}
     * @return resource whose headers have been parsed
     * @throws IOException if {@code encoding} is not supported, or building
     *         the resource fails
     */
    protected WarcResource createResource(String contentType, String payload, String encoding) throws IOException {
        final byte[] payloadBytes = payload.getBytes(encoding);
        TestWARCRecordInfo recinfo = TestWARCRecordInfo.createHttpResponse(contentType, payloadBytes);
        TestWARCReader wr = new TestWARCReader(recinfo);
        WARCRecord rec = wr.get(0);
        WarcResource resource = new WarcResource(rec, wr);
        // headers must be parsed before the resource is handed to a sniffer.
        resource.parseHeaders();
        return resource;
    }

    /**
     * Builds the sample XHTML document shared by the detector tests.
     * @param metaCharset charset name placed in the META Content-Type tag
     * @return document text
     */
    private static String htmlWithMetaCharset(String metaCharset) {
        return "<html xmlns=\"http://www.w3.org/1999/xhtml\">" +
            "<head>" +
            " <meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + metaCharset + "\" />" +
            " <title>Test Document</title>" +
            " <link rel=\"stylesheet\" type=\"text/css\" href=\"styles.css\" />" +
            "</head>" +
            "<body>" +
            "</body>" +
            "</html>";
    }

    /**
     * Runs a fresh {@link RotatingCharsetDetector} on the sample document.
     * @param metaCharset charset name declared in the META tag
     * @param encoding charset actually used to encode the document
     * @return charset name returned by the detector
     * @throws Exception if building the resource or detection fails
     */
    private String detectCharset(String metaCharset, String encoding) throws Exception {
        WarcResource resource = createResource(htmlWithMetaCharset(metaCharset), encoding);
        WaybackRequest wbRequest = new WaybackRequest();
        RotatingCharsetDetector cut = new RotatingCharsetDetector();
        return cut.getCharset(resource, wbRequest);
    }

    /**
     * content is UTF-8 encoded, but META tag says it's UTF-16.
     * {@link PrescanMetadataSniffer} overrides UTF-16 to UTF-8.
     * @throws Exception
     */
    public void testFalseMetaUTF16() throws Exception {
        assertEquals("UTF-8", detectCharset("UTF-16", "UTF-8"));
    }

    /**
     * test of {@code x-} charset names.
     * @throws Exception
     */
    public void testXCharsetName() throws Exception {
        assertEquals("x-sjis", detectCharset("x-sjis", "x-sjis"));
    }

    /**
     * test of {@code x-user-defined} charset name.
     * mapped to {@code windows-1252}.
     * @throws Exception
     */
    public void testXUserDefined() throws Exception {
        assertEquals("windows-1252", detectCharset("x-user-defined", "windows-1252"));
    }

    /**
     * content is UTF-16 encoded, but META tag says it's UTF-8.
     * {@link PrescanMetadataSniffer} shall fail because it's UTF-16 encoded,
     * and {@link UniversalChardetSniffer} should detect UTF-16.
     * <p>Unfortunately, the intended assertion fails currently. Universal
     * Chardet returns {@code null} for sample content, even with some
     * non-ASCII chars. Hopefully UTF-16 texts have BOM, or plenty of
     * non-ASCII chars make Universal Chardet work.</p>
     * <p>Until then this test only checks that detection completes without
     * throwing.</p>
     * @throws Exception
     */
    public void testFalseMetaUTF8() throws Exception {
        // result intentionally ignored; see the method comment.
        detectCharset("UTF-8", "UTF-16BE");
        // TODO: enable once UTF-16 without BOM is detected:
        //assertEquals("UTF-16BE", charset);
    }

    /**
     * test of {@link ContentTypeHeaderSniffer}
     * @throws Exception
     */
    public void testContentTypeHeaderSniffer() throws Exception {
        ContentTypeHeaderSniffer cut = new ContentTypeHeaderSniffer();
        final String payload = "<html>" +
            "<body>" +
            "</body>" +
            "</html>";
        {
            // no charset parameter in Content-Type: nothing to sniff.
            WarcResource resource = createResource("text/html", payload, "UTF-8");
            String enc = cut.sniff(resource);
            assertNull(enc);
        }
        {
            // charset parameter without space after ";".
            WarcResource resource = createResource("text/html;charset=utf-8", payload, "UTF-8");
            String enc = cut.sniff(resource);
            assertEquals("utf-8", enc);
        }
        {
            WarcResource resource = createResource("text/html; charset=shift_jis", payload, "shift_jis");
            String enc = cut.sniff(resource);
            assertEquals("shift_jis", enc);
        }
        {
            // rescuing broken charset name ("i so-8859-1" has a stray space).
            WarcResource resource = createResource("text/html; charset=i so-8859-1", payload, "iso-8859-1");
            String enc = cut.sniff(resource);
            // expected result for the rescued "iso-8859-1" is "windows-1252".
            assertEquals("windows-1252", enc);
        }
    }
    // more tests?
}