package org.jsoup.helper;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.junit.Test;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import static org.jsoup.integration.ParseTest.getFile;
import static org.junit.Assert.*;
/**
 * Unit tests for {@link DataUtil}: charset extraction from Content-Type values,
 * byte-order-mark handling while parsing byte data and files, and MIME boundary
 * generation.
 */
public class DataUtilTest {
    /** Base URI passed to the parser by the BOM file tests. */
    private static final String EXAMPLE_BASE_URI = "http://example.com";

    /** Sample text every BOM test file is expected to contain once decoded. */
    private static final String KOREAN_SAMPLE = "가각갂갃간갅";

    /**
     * Unquoted charset values are returned with their original case; a missing
     * header, a missing charset parameter, or an unknown charset name yields null.
     */
    @Test
    public void testCharset() {
        assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html;charset=utf-8 "));
        assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset=UTF-8"));
        assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1"));
        assertEquals(null, DataUtil.getCharsetFromContentType("text/html"));
        assertEquals(null, DataUtil.getCharsetFromContentType(null));
        assertEquals(null, DataUtil.getCharsetFromContentType("text/html;charset=Unknown"));
    }

    /** Charset values wrapped in double or single quotes are returned without the quotes. */
    @Test
    public void testQuotedCharset() {
        assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html; charset=\"utf-8\""));
        assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html;charset=\"UTF-8\""));
        assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=\"ISO-8859-1\""));
        assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset=\"Unsupported\""));
        assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset='UTF-8'"));
    }

    /** A leading U+FEFF in UTF-8 input must not disturb parsing when the charset is given. */
    @Test
    public void discardsSpuriousByteOrderMark() {
        String html = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>";
        ByteBuffer buffer = Charset.forName("UTF-8").encode(html);
        Document doc = DataUtil.parseByteData(buffer, "UTF-8", "http://foo.com/", Parser.htmlParser());
        assertEquals("One", doc.head().text());
    }

    /**
     * Same input as {@link #discardsSpuriousByteOrderMark()} but with a null charset
     * argument; the resulting document's output charset is expected to be UTF-8.
     */
    @Test
    public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() {
        String html = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>";
        ByteBuffer buffer = Charset.forName("UTF-8").encode(html);
        Document doc = DataUtil.parseByteData(buffer, null, "http://foo.com/", Parser.htmlParser());
        assertEquals("One", doc.head().text());
        assertEquals("UTF-8", doc.outputSettings().charset().displayName());
    }

    /** An empty charset parameter yields null instead of raising an exception. */
    @Test
    public void shouldNotThrowExceptionOnEmptyCharset() {
        assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset="));
        assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset=;"));
    }

    /** When several charset parameters are present, the first one is returned. */
    @Test
    public void shouldSelectFirstCharsetOnWeirdMultileCharsetsInMetaTags() {
        assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1, charset=1251"));
    }

    /** A doubled "charset=" prefix is tolerated and the actual name is still extracted. */
    @Test
    public void shouldCorrectCharsetForDuplicateCharsetString() {
        assertEquals("iso-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=charset=iso-8859-1"));
    }

    /** A syntactically illegal charset name yields null. */
    @Test
    public void shouldReturnNullForIllegalCharsetNames() {
        assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset=$HJKDF§$/("));
    }

    /**
     * Generated boundaries have the configured length, and two consecutive calls
     * produce different values.
     */
    @Test
    public void generatesMimeBoundaries() {
        String m1 = DataUtil.mimeBoundary();
        String m2 = DataUtil.mimeBoundary();
        assertEquals(DataUtil.boundaryLength, m1.length());
        assertEquals(DataUtil.boundaryLength, m2.length());
        // Compare by value: a reference check would pass even for two identical boundaries.
        assertTrue("two generated boundaries should differ in content", !m1.equals(m2));
    }

    /**
     * A document whose meta tag names the charset "iso-8" must still parse and
     * serialize to the expected markup.
     *
     * @throws UnsupportedEncodingException if the JVM lacks UTF-8; propagated so the
     *         test runner reports the full stack trace
     */
    @Test
    public void wrongMetaCharsetFallback() throws UnsupportedEncodingException {
        final byte[] input = "<html><head><meta charset=iso-8></head><body></body></html>".getBytes("UTF-8");
        final ByteBuffer inBuffer = ByteBuffer.wrap(input);
        Document doc = DataUtil.parseByteData(inBuffer, null, "http://example.com", Parser.htmlParser());
        final String expected = "<html>\n" +
            " <head>\n" +
            " <meta charset=\"iso-8\">\n" +
            " </head>\n" +
            " <body></body>\n" +
            "</html>";
        assertEquals(expected, doc.toString());
    }

    /**
     * Files starting with a UTF-16 or UTF-32 byte order mark (either endianness)
     * are decoded correctly when no charset is passed to the parser.
     *
     * @throws IOException if a test file cannot be read
     */
    @Test
    public void supportsBOMinFiles() throws IOException {
        // test files from http://www.i18nl10n.com/korean/utftest/
        assertBomFileDecodes("/bomtests/bom_utf16be.html", "UTF-16BE");
        assertBomFileDecodes("/bomtests/bom_utf16le.html", "UTF-16LE");
        assertBomFileDecodes("/bomtests/bom_utf32be.html", "UTF-32BE");
        assertBomFileDecodes("/bomtests/bom_utf32le.html", "UTF-32LE");
    }

    /**
     * Parses one BOM test file with a null charset and checks that both its title
     * and its body text were decoded as expected.
     *
     * @param resourcePath path of the test file, as accepted by {@code getFile}
     * @param expectedTitlePart text the parsed document title must contain
     * @throws IOException if the file cannot be read
     */
    private static void assertBomFileDecodes(String resourcePath, String expectedTitlePart) throws IOException {
        File in = getFile(resourcePath);
        Document doc = Jsoup.parse(in, null, EXAMPLE_BASE_URI);
        assertTrue(doc.title().contains(expectedTitlePart));
        assertTrue(doc.text().contains(KOREAN_SAMPLE));
    }
}