// CharacterCoding.java
// ----------------------------------
// (C) 22.10.2008 by Michael Peter Christen; mc@yacy.net
// first published on http://yacy.net
// Frankfurt, Germany, 2008
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package net.yacy.document.parser.html;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Contains methods to convert between Unicode and XML/HTML encoding.
 *
 * <p>The class holds two static lookup tables (one for the characters that
 * must be escaped in XML, one for the additional characters that are escaped
 * for HTML) and derives forward and backward maps from them once, in a
 * static initializer. All methods are static; the class cannot be
 * instantiated.
 */
public final class CharacterCoding {

    /** Pattern matching the literal text {@code &amp;} (an HTML-encoded ampersand). */
    public static final Pattern ampPattern = Pattern.compile(Pattern.quote("&amp;"));

    /** Ampersand character in unicode encoding. */
    private static final char AMP_UNICODE = '&';

    /** Ampersand character in HTML encoding. */
    private static final String AMP_HTML = "&amp;";

    /** Space character in HTML encoding. */
    private static final String SPACE_HTML = "&nbsp;";

    /**
     * Special characters which have to be mapped for XML.
     * Layout: alternating pairs of (unicode character, encoded entity).
     */
    private static final String[] MAPPING4XML = {
        "\"",     "&quot;",   //quotation mark
        "\u003C", "&lt;",     //less than
        "\u003E", "&gt;",     //greater than
    };

    /**
     * Special characters which have to be mapped for HTML.
     * Layout: alternating pairs of (unicode character, encoded entity).
     */
    private static final String[] MAPPING4HTML = {
        "\\",     "&#092;",   // Backslash
        "\u005E", "&#094;",   // Caret

        "\u0060", "&#096;",   // Accent Grave
        "\u007B", "&#123;",   // {
        "\u007C", "&#124;",   // |
        "\u007D", "&#125;",   // }
        "\u007E", "&#126;",   // ~

        "\u0082", "&#130;",
        "\u0083", "&#131;",
        "\u0084", "&#132;",
        "\u0085", "&#133;",
        "\u0086", "&#134;",
        "\u0087", "&#135;",
        "\u0088", "&#136;",
        "\u0089", "&#137;",
        "\u008A", "&#138;",
        "\u008B", "&#139;",
        "\u008C", "&#140;",
        "\u008D", "&#141;",
        "\u008E", "&#142;",

        "\u0091", "&#145;",
        "\u0092", "&#146;",
        "\u0093", "&#147;",
        "\u0094", "&#148;",
        "\u0095", "&#149;",
        "\u0096", "&#150;",
        "\u0097", "&#151;",
        "\u0098", "&#152;",
        "\u0099", "&#153;",
        "\u009A", "&#154;",
        "\u009B", "&#155;",
        "\u009C", "&#156;",
        "\u009D", "&#157;",
        "\u009E", "&#158;",
        "\u009F", "&#159;",

        "\u00A1", "&iexcl;",  //inverted (spanish) exclamation mark
        "\u00A2", "&cent;",   //cent
        "\u00A3", "&pound;",  //pound
        "\u00A4", "&curren;", //currency
        "\u00A5", "&yen;",    //yen
        "\u00A6", "&brvbar;", //broken vertical bar
        "\u00A7", "&sect;",   //section sign
        "\u00A8", "&uml;",    //diaeresis (umlaut)
        "\u00A9", "&copy;",   //copyright sign
        "\u00AA", "&ordf;",   //feminine ordinal indicator
        "\u00AB", "&laquo;",  //left-pointing double angle quotation mark
        "\u00AC", "&not;",    //not sign
        "\u00AD", "&shy;",    //soft hyphen
        "\u00AE", "&reg;",    //registered sign
        "\u00AF", "&macr;",   //macron
        "\u00B0", "&deg;",    //degree sign
        "\u00B1", "&plusmn;", //plus-minus sign
        "\u00B2", "&sup2;",   //superscript two
        "\u00B3", "&sup3;",   //superscript three
        "\u00B4", "&acute;",  //acute accent
        "\u00B5", "&micro;",  //micro sign
        "\u00B6", "&para;",   //paragraph sign
        "\u00B7", "&middot;", //middle dot
        "\u00B8", "&cedil;",  //cedilla
        "\u00B9", "&sup1;",   //superscript one
        "\u00BA", "&ordm;",   //masculine ordinal indicator
        "\u00BB", "&raquo;",  //right-pointing double angle quotation mark
        "\u00BC", "&frac14;", //fraction 1/4
        "\u00BD", "&frac12;", //fraction 1/2
        "\u00BE", "&frac34;", //fraction 3/4
        "\u00BF", "&iquest;", //inverted (spanish) question mark
        "\u00C0", "&Agrave;",
        "\u00C1", "&Aacute;",
        "\u00C2", "&Acirc;",
        "\u00C3", "&Atilde;",
        "\u00C4", "&Auml;",
        "\u00C5", "&Aring;",
        "\u00C6", "&AElig;",
        "\u00C7", "&Ccedil;",
        "\u00C8", "&Egrave;",
        "\u00C9", "&Eacute;",
        "\u00CA", "&Ecirc;",
        "\u00CB", "&Euml;",
        "\u00CC", "&Igrave;",
        "\u00CD", "&Iacute;",
        "\u00CE", "&Icirc;",
        "\u00CF", "&Iuml;",
        "\u00D0", "&ETH;",
        "\u00D1", "&Ntilde;",
        "\u00D2", "&Ograve;",
        "\u00D3", "&Oacute;",
        "\u00D4", "&Ocirc;",
        "\u00D5", "&Otilde;",
        "\u00D6", "&Ouml;",
        "\u00D7", "&times;",
        "\u00D8", "&Oslash;",
        "\u00D9", "&Ugrave;",
        "\u00DA", "&Uacute;",
        "\u00DB", "&Ucirc;",
        "\u00DC", "&Uuml;",
        "\u00DD", "&Yacute;",
        "\u00DE", "&THORN;",
        "\u00DF", "&szlig;",
        "\u00E0", "&agrave;",
        "\u00E1", "&aacute;",
        "\u00E2", "&acirc;",
        "\u00E3", "&atilde;",
        "\u00E4", "&auml;",
        "\u00E5", "&aring;",
        "\u00E6", "&aelig;",
        "\u00E7", "&ccedil;",
        "\u00E8", "&egrave;",
        "\u00E9", "&eacute;",
        "\u00EA", "&ecirc;",
        "\u00EB", "&euml;",
        "\u00EC", "&igrave;",
        "\u00ED", "&iacute;",
        "\u00EE", "&icirc;",
        "\u00EF", "&iuml;",
        "\u00F0", "&eth;",
        "\u00F1", "&ntilde;",
        "\u00F2", "&ograve;",
        "\u00F3", "&oacute;",
        "\u00F4", "&ocirc;",
        "\u00F5", "&otilde;",
        "\u00F6", "&ouml;",
        "\u00F7", "&divide;",
        "\u00F8", "&oslash;",
        "\u00F9", "&ugrave;",
        "\u00FA", "&uacute;",
        "\u00FB", "&ucirc;",
        "\u00FC", "&uuml;",
        "\u00FD", "&yacute;",
        "\u00FE", "&thorn;",
        "\u00FF", "&yuml;"
    };

    /** Mapping for XML to unicode. */
    private static final Map<String, Character> HTML2UNICODE4XML =
            new HashMap<String, Character>(MAPPING4XML.length * 2);

    /** Mapping for HTML to unicode. */
    private static final Map<String, Character> HTML2UNICODE4HTML =
            new HashMap<String, Character>(MAPPING4HTML.length * 2);

    /** Mapping for unicode to XML. */
    private static final Map<Character, String> UNICODE2HTML4XML =
            new HashMap<Character, String>(MAPPING4XML.length * 2);

    /** Mapping for unicode to HTML. */
    private static final Map<Character, String> UNICODE2HTML4HTML =
            new HashMap<Character, String>(MAPPING4HTML.length * 2);

    static {
        // Both tables are flat (character, entity) pair lists; walk them two entries at a time.
        Character c;
        for (int i = 0; i < MAPPING4HTML.length; i += 2) {
            c = Character.valueOf(MAPPING4HTML[i].charAt(0));
            HTML2UNICODE4HTML.put(MAPPING4HTML[i + 1], c);
            UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]);
        }
        for (int i = 0; i < MAPPING4XML.length; i += 2) {
            c = Character.valueOf(MAPPING4XML[i].charAt(0));
            HTML2UNICODE4XML.put(MAPPING4XML[i + 1], c);
            UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]);
        }
    }

    /**
     * Private constructor to avoid instantiation of utility
     * class with only static methods.
     */
    private CharacterCoding() {
    }

    /**
     * Replaces characters which have special representation in XML.
     * @see #MAPPING4XML
     * @param text text with character to replace
     * @param amp true if ampersands shall be replaced, else false
     * @return text with replaced characters, or null if text is null
     */
    public static String unicode2xml(final String text, final boolean amp) {
        return unicode2html(text, amp, false);
    }

    /**
     * Replaces characters which have special representation in HTML.
     * @see #MAPPING4HTML
     * @param text text with character to replace
     * @param amp true if ampersands shall be replaced, else false
     * @return text with replaced characters, or null if text is null
     */
    public static String unicode2html(final String text, final boolean amp) {
        return unicode2html(text, amp, true);
    }

    /**
     * Replaces characters which have special representation in HTML or XML.
     * @param text text with character to replace
     * @param amp true if ampersands shall be replaced, else false
     * @param html true if characters shall be replaced for embedding in
     * HTML, false for XML (far more characters are replaced for HTML,
     * compare {@link #MAPPING4HTML} with {@link #MAPPING4XML})
     * @return text with replaced characters, or null if text is null
     */
    private static String unicode2html(
            final String text, final boolean amp, final boolean html) {
        if (text == null) {
            return null;
        }
        // Reserve some headroom because every replacement is longer than the character it replaces.
        final StringBuilder sb = new StringBuilder(text.length() * 12 / 10);
        for (int textpos = 0; textpos < text.length(); textpos++) {
            final char c = text.charAt(textpos);
            String r;
            if (amp && c == AMP_UNICODE) {
                sb.append(AMP_HTML);
            } else if ((r = UNICODE2HTML4XML.get(c)) != null) {
                // The XML set is always applied, for HTML output as well.
                sb.append(r);
            } else if (html && (r = UNICODE2HTML4HTML.get(c)) != null) {
                sb.append(r);
            } else {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * Replaces HTML-encoded characters with unicode representation.
     *
     * <p>Handles the named entities from {@link #MAPPING4XML} and
     * {@link #MAPPING4HTML}, {@code &amp;}, {@code &nbsp;} (decoded to a
     * plain space) and decimal / hexadecimal numeric character references.
     * Unknown entities and malformed or out-of-range numeric references are
     * copied to the result unchanged.
     *
     * @param text text with character to replace
     * @return text with replaced characters, or null if text is null
     */
    public static String html2unicode(String text) {
        if (text == null) {
            return null;
        }
        // Collapse encoded ampersands first: sometimes a double-replacement is necessary,
        // i.e. an entity whose own ampersand was encoded again is decoded below as well.
        text = ampPattern.matcher(text).replaceAll("&");
        final StringBuilder sb = new StringBuilder(text.length());
        int p = 0;
        while (p < text.length()) {
            final int p1 = text.indexOf('&', p);
            if (p1 < 0) {
                // no further entity candidate: copy the tail
                sb.append(text, p, text.length());
                break;
            }
            sb.append(text, p, p1);
            final int q = text.indexOf(';', p1);
            if (q < 0) {
                // if there is no semicolon now, no later ampersand can start an entity either;
                // we are finished here
                sb.append(text, p1, text.length());
                break;
            }
            String s = text.substring(p1, q + 1);
            p = q + 1;

            // If another ampersand lies between the first one and the semicolon, only the
            // last ampersand can start the entity; everything before it is plain text.
            int pp;
            while ((pp = s.indexOf('&', 1)) >= 0) { // we skip the first ampersand
                sb.append(s, 0, pp);
                s = s.substring(pp);
            }

            // s now starts with '&' and ends with ';'
            if (s.equals(AMP_HTML)) {
                sb.append(AMP_UNICODE);
                continue;
            }
            if (s.equals(SPACE_HTML)) {
                sb.append(' ');
                continue;
            }
            Character r;
            if ((r = HTML2UNICODE4XML.get(s)) != null
                    || (r = HTML2UNICODE4HTML.get(s)) != null) {
                sb.append(r.charValue());
                continue;
            }
            if (s.charAt(1) == '#') {
                final int codePoint = parseNumericReference(s);
                if (codePoint >= 0) {
                    // appendCodePoint emits a surrogate pair for values above the BMP
                    sb.appendCodePoint(codePoint);
                } else {
                    // malformed or out-of-range reference: keep the original text
                    sb.append(s);
                }
                continue;
            }
            // the entity is unknown, copy it
            sb.append(s);
        }
        return sb.toString();
    }

    /**
     * Parses a numeric character reference.
     * @param entity reference text starting with an ampersand followed by a
     * hash sign and ending with a semicolon; an 'x' or 'X' after the hash
     * sign selects hexadecimal digits
     * @return the referenced code point, or -1 if the digits cannot be parsed
     * or do not denote a valid Unicode code point
     */
    private static int parseNumericReference(final String entity) {
        final char marker = entity.charAt(2);
        final boolean hex = marker == 'x' || marker == 'X';
        final String digits = entity.substring(hex ? 3 : 2, entity.length() - 1);
        try {
            final int codePoint = Integer.parseInt(digits, hex ? 16 : 10);
            return Character.isValidCodePoint(codePoint) ? codePoint : -1;
        } catch (final NumberFormatException e) {
            return -1;
        }
    }

    /**
     * Test method. Ignore it if you don't need it.
     * @param args will be ignored
     */
    public static void main(final String[] args) {
        final String text = "Test-Text mit & um zyklische \u00FC & Ersetzungen auszuschliessen";
        final String txet = unicode2html(text, true);
        System.out.println(txet);
        System.out.println(html2unicode(txet));
        if (html2unicode(txet).equals(text)) {
            System.out.println("correct");
        }

        final String text2 = "encodeUnicode2xml: & \" < >";
        System.out.println(text2);
        System.out.println(unicode2xml(text2, true));

        final String text3 = "space&nbsp;t&auml;st";
        System.out.println(text3);
        System.out.println(html2unicode(text3));
    }
}