package hextostring.convert; import java.util.LinkedList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import hextostring.utils.Charsets; /** * Standard converter for UTF-8-encoded hexadecimal strings. * * @author Maxime PIA */ public class UTF8Converter extends AbstractConverter { public UTF8Converter() { super(Charsets.UTF8); } @Override protected List<String> extractConvertibleChunks(String hex) { List<String> results = new LinkedList<>(); Matcher m = Pattern.compile( "(" + // U+0000 to U+007F: 0xxxxxxx "[0-7][0-9a-f]|" + // U+0080 to U+07FF: 110xxxxx 10xxxxxx "(c[0-2]|d[0-9a-f])[8-9a-b][0-9a-f]|" + // U+0800 to U+0FFF: 11100000 101xxxxx 10xxxxxx "e0[a-b][0-9a-f][8-9a-b][0-9a-f]|" + // U+1000 to U+1FFF: 11100001 10xxxxxx 10xxxxxx "e1([8-9a-b][0-9a-f]){2}|" + // U+2000 to U+3FFF: 1110001x 10xxxxxx 10xxxxxx "e[2-3]([8-9a-b][0-9a-f]){2}|" + // U+4000 to U+7FFF: 111001xx 10xxxxxx 10xxxxxx "e[4-7]([8-9a-b][0-9a-f]){2}|" + // U+8000 to U+BFFF: 111010xx 10xxxxxx 10xxxxxx "e[8-9a-b]([8-9a-b][0-9a-f]){2}|" + // U+C000 to U+CFFF: 11101100 10xxxxxx 10xxxxxx "ec([8-9a-b][0-9a-f]){2}|" + // U+D000 to U+D7FF: 11101101 100xxxxx 10xxxxxx "ed[8-9][0-9a-f][8-9a-b][0-9a-f]|" + // U+E000 to U+FFFF: 1110111x 10xxxxxx 10xxxxxx "e[e-f]([8-9a-b][0-9a-f]){2}|" + // U+10000 to U+1FFFF: 11110000 10(01|10|11)xxxx 10xxxxxx 10xxxxxx "f0[9a-b][0-9a-f]([8-9a-b][0-9a-f]){2}|" + // U+40000 to U+7FFFF: 11110001 10xxxxxx 10xxxxxx 10xxxxxx "f1([8-9a-b][0-9a-f]){3}|" + // U+80000 to U+FFFFF: 1111001x 10xxxxxx 10xxxxxx 10xxxxxx "f[2-3]([8-9a-b][0-9a-f]){3}|" + // U+100000 to U+10FFFF: 11110100 1000xxxx 10xxxxxx 10xxxxxx "f48[0-9a-f]([8-9a-b][0-9a-f]){2}|" + ")+?(0000|$)" ).matcher(hex); String match; while (m.find()) { match = m.group(); if (match.endsWith("0000")) { match = match.substring(0, match.length() - 4); } if (!match.contains("ffff") && hex.indexOf(match) % 2 == 0) { results.add(match); } } return results; } }