package com.orgzly.android.util; import org.junit.Test; import java.io.File; import java.io.FileNotFoundException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import static junit.framework.Assert.assertEquals; import static junit.framework.Assert.assertFalse; import static junit.framework.Assert.assertTrue; /** * chardet * * ascii.org: ascii (confidence: 1.00) * Chinese-Lipsum.org: utf-8 (confidence: 0.99) * few_chinese_characters.org: utf-8 (confidence: 0.99) * org-blog-articles.org: ISO-8859-2 (confidence: 0.79) * org-issues.org: utf-8 (confidence: 0.99) * org-people.org: ISO-8859-2 (confidence: 0.85) */ public class EncodingDetectTest { private static final String PATH = "assets/encoding"; private InputStream getFromResource(String name) { String resourcePath = new File(PATH, name).getPath(); InputStream is = this.getClass().getClassLoader().getResourceAsStream(resourcePath); if (is == null) { throw new IllegalArgumentException("Resource " + resourcePath + " could not be loaded"); } return is; } @Test public void testOrgBlogArticles() throws FileNotFoundException { EncodingDetect detect = EncodingDetect.getInstance(getFromResource("org-blog-articles.org")); switch (EncodingDetect.USED_METHOD) { // case ICU: // assertTrue(detect.isDetected()); // assertEquals("ISO-8859-1", detect.getEncoding()); // break; // case JCHARDET: // /* This is not detected and UTF-8 is being used by default, which produces weird chars. */ // assertTrue(detect.isDetected()); // assertEquals("windows-1252", detect.getEncoding()); // break; case JUNIVERSALCHARDET: assertTrue(detect.isDetected()); assertEquals("WINDOWS-1252", detect.getEncoding()); break; } } @Test public void testPeople() throws FileNotFoundException { EncodingDetect detect = EncodingDetect.getInstance(getFromResource("org-people.org")); switch (EncodingDetect.USED_METHOD) { // case ICU: // assertTrue(detect.isDetected()); // assertEquals("ISO-8859-1", detect.getEncoding()); // break; // case JCHARDET: // assertTrue(detect.isDetected()); // assertEquals("windows-1252", detect.getEncoding()); // break; case JUNIVERSALCHARDET: assertTrue(detect.isDetected()); assertEquals("WINDOWS-1252", detect.getEncoding()); break; } } @Test public void testIssues() throws FileNotFoundException { EncodingDetect detect = EncodingDetect.getInstance(getFromResource("org-issues.org")); switch (EncodingDetect.USED_METHOD) { // case ICU: // assertTrue(detect.isDetected()); // assertEquals("ISO-8859-1", detect.getEncoding()); // break; // case JCHARDET: // assertFalse(detect.isDetected()); // break; case JUNIVERSALCHARDET: assertTrue(detect.isDetected()); assertEquals("UTF-8", detect.getEncoding()); break; } } @Test public void testAscii() throws FileNotFoundException { EncodingDetect detect = EncodingDetect.getInstance(getFromResource("ascii.org")); switch (EncodingDetect.USED_METHOD) { // case ICU: // assertTrue(detect.isDetected()); // assertEquals("ISO-8859-1", detect.getEncoding()); // break; // case JCHARDET: // assertTrue(detect.isDetected()); // assertEquals("ASCII", detect.getEncoding()); // break; case JUNIVERSALCHARDET: assertFalse(detect.isDetected()); break; } } @Test public void testChinese() throws FileNotFoundException { EncodingDetect detect = EncodingDetect.getInstance(getFromResource("few_chinese_characters.org")); assertTrue(detect.isDetected()); assertEquals("UTF-8", detect.getEncoding()); } @Test public void testChineseLipsum() throws FileNotFoundException { EncodingDetect detect = EncodingDetect.getInstance(getFromResource("Chinese-Lipsum.org")); assertTrue(detect.isDetected()); assertEquals("UTF-8", detect.getEncoding()); } // public void testAll() { // for (String s: EncodingDetect.getAll()) { // System.out.println(s); // } // } @Test public void ISO_8859_15_dos() throws UnsupportedEncodingException { EncodingDetect detect = EncodingDetect.getInstance(getFromResource("iso-8859-15-dos.org")); assertTrue(detect.isDetected()); assertEquals("WINDOWS-1252", detect.getEncoding()); } }