/* * Copyright 2005 Sun Microsystems, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.sun.syndication.io.impl; import java.io.IOException; import java.io.Reader; import java.util.HashMap; import java.util.Map; import java.util.regex.Pattern; import java.util.regex.Matcher; /** * @author Alejandro Abdelnur */ public class XmlFixerReader extends Reader { protected Reader in; public XmlFixerReader(Reader in) { super(in); this.in = in; _buffer = new StringBuffer(); _state = 0; } private boolean trimmed; private StringBuffer _buffer; private int _bufferPos; private int _state = 0; private boolean trimStream() throws IOException { boolean hasContent = true; int state = 0; boolean loop; int c; do { switch (state) { case 0: c = in.read(); if (c==-1) { loop = false; hasContent = false; } else if (c==' ' || c=='\n') { loop = true; } else if (c=='<') { state = 1; _buffer.setLength(0); _bufferPos = 0; _buffer.append((char)c); loop = true; } else { _buffer.setLength(0); _bufferPos = 0; _buffer.append((char)c); loop = false; hasContent = true; _state = 3; } break; case 1: c = in.read(); if (c==-1) { loop = false; hasContent = true; _state = 3; } else if (c!='!') { _buffer.append((char)c); _state = 3; loop = false; hasContent = true; _state = 3; } else { _buffer.append((char)c); state = 2; loop = true; } break; case 2: c = in.read(); if (c==-1) { loop = false; hasContent = true; _state = 3; } else if (c=='-') { _buffer.append((char)c); state = 3; loop = true; } else { _buffer.append((char)c); loop = false; hasContent = true; _state = 3; } break; case 3: c = in.read(); if (c==-1) { loop = false; hasContent = true; _state = 3; } else if (c=='-') { _buffer.append((char)c); state = 4; loop = true; } else { _buffer.append((char)c); loop = false; hasContent = true; _state = 3; } break; case 4: c = in.read(); if (c==-1) { loop = false; hasContent = true; _state = 3; } else if (c!='-') { _buffer.append((char)c); loop = true; } else { _buffer.append((char)c); state = 5; loop = true; } break; case 5: c = in.read(); if (c==-1) { loop = false; hasContent = true; _state = 3; } else if (c!='-') { _buffer.append((char)c); loop = true; state = 4; } else { _buffer.append((char)c); state = 6; loop = true; } break; case 6: c = in.read(); if (c==-1) { loop = false; hasContent = true; _state = 3; } else if (c!='>') { _buffer.append((char)c); loop = true; state = 4; } else { _buffer.setLength(0); state = 0; loop = true; } break; default: throw new IOException("It shouldn't happen"); } } while (loop); return hasContent; } public int read() throws IOException { boolean loop; if (!trimmed) { // trims XML stream trimmed = true; if (!trimStream()) { return -1; } } int c; do { // converts literal entities to coded entities switch (_state) { case 0: // reading chars from stream c = in.read(); if (c>-1) { if (c=='&') { _state = 1; _buffer.setLength(0); _bufferPos = 0; _buffer.append((char)c); _state = 1; loop = true; } else { loop = false; } } else { loop = false; } break; case 1: // reading entity from stream c = in.read(); if (c>-1) { if (c==';') { _buffer.append((char)c); _state = 2; loop = true; } else if ((c>='a' && c<='z') || (c>='A' && c<='Z') || (c=='#') || (c>='0' && c<='9')) { _buffer.append((char)c); loop = true; } else { // no ';' to match the '&' lets just make the '&' // a legal xml character entity '&' _buffer.insert(1, "amp;"); _buffer.append((char)c); _state = 3; loop = true; } } else { // no ';' to match the '&' lets just make the '&' // a legal xml character entity '&' _buffer.insert(1, "amp;"); _state = 3; loop = true; } break; case 2: // replacing entity c = 0; String literalEntity = _buffer.toString(); String codedEntity = (String) CODED_ENTITIES.get(literalEntity); if (codedEntity!=null) { _buffer.setLength(0); _buffer.append(codedEntity); } // else we leave what was in the stream _state = 3; loop = true; break; case 3: // consuming buffer if (_bufferPos<_buffer.length()) { c = _buffer.charAt(_bufferPos++); loop = false; } else { c = 0; _state = 0; loop = true; } break; default: throw new IOException("It shouldn't happen"); } } while (loop); return c; } public int read(char[] buffer,int offset,int len) throws IOException { int charsRead = 0; int c = read(); if (c==-1) { return -1; } buffer[offset+(charsRead++)] = (char) c; while (charsRead<len && (c=read())>-1) { buffer[offset+(charsRead++)] = (char) c; } return charsRead; } public long skip(long n) throws IOException { if (n==0) { return 0; } else if (n<0) { throw new IllegalArgumentException("'n' cannot be negative"); } int c = read(); long counter = 1; while (c>-1 && counter<n) { c = read(); counter++; } return counter; } public boolean ready() throws IOException { return (_state!=0) || in.ready(); } public boolean markSupported() { return false; } public void mark(int readAheadLimit) throws IOException { throw new IOException("Stream does not support mark"); } public void reset() throws IOException { throw new IOException("Stream does not support mark"); } public void close() throws IOException { in.close(); } private static Map CODED_ENTITIES = new HashMap(); static { // note: refer to Character entity references in HTML 4 // at http://www.w3.org/TR/REC-html40/sgml/entities.html // Character entity set. // HTMLlat1 "-//W3C//ENTITIES Latin 1//EN//HTML" CODED_ENTITIES.put(" ", " "); CODED_ENTITIES.put("¡", "¡"); CODED_ENTITIES.put("¢", "¢"); CODED_ENTITIES.put("£", "£"); CODED_ENTITIES.put("¤","¤"); CODED_ENTITIES.put("¥", "¥"); CODED_ENTITIES.put("¦","¦"); CODED_ENTITIES.put("§", "§"); CODED_ENTITIES.put("¨", "¨"); CODED_ENTITIES.put("©", "©"); CODED_ENTITIES.put("ª", "ª"); CODED_ENTITIES.put("«", "«"); CODED_ENTITIES.put("¬", "¬"); CODED_ENTITIES.put("­", "­"); CODED_ENTITIES.put("®", "®"); CODED_ENTITIES.put("¯", "¯"); CODED_ENTITIES.put("°", "°"); CODED_ENTITIES.put("±","±"); CODED_ENTITIES.put("²", "²"); CODED_ENTITIES.put("³", "³"); CODED_ENTITIES.put("´", "´"); CODED_ENTITIES.put("µ", "µ"); CODED_ENTITIES.put("¶", "¶"); CODED_ENTITIES.put("·","·"); CODED_ENTITIES.put("¸", "¸"); CODED_ENTITIES.put("¹", "¹"); CODED_ENTITIES.put("º", "º"); CODED_ENTITIES.put("»", "»"); CODED_ENTITIES.put("¼","¼"); CODED_ENTITIES.put("½","½"); CODED_ENTITIES.put("¾","¾"); CODED_ENTITIES.put("¿","¿"); CODED_ENTITIES.put("À","À"); CODED_ENTITIES.put("Á","Á"); CODED_ENTITIES.put("Â", "Â"); CODED_ENTITIES.put("Ã","Ã"); CODED_ENTITIES.put("Ä", "Ä"); CODED_ENTITIES.put("Å", "Å"); CODED_ENTITIES.put("Æ", "Æ"); CODED_ENTITIES.put("Ç","Ç"); CODED_ENTITIES.put("È","È"); CODED_ENTITIES.put("É","É"); CODED_ENTITIES.put("Ê", "Ê"); CODED_ENTITIES.put("Ë", "Ë"); CODED_ENTITIES.put("Ì","Ì"); CODED_ENTITIES.put("Í","Í"); CODED_ENTITIES.put("Î", "Î"); CODED_ENTITIES.put("Ï", "Ï"); CODED_ENTITIES.put("Ð", "Ð"); CODED_ENTITIES.put("Ñ","Ñ"); CODED_ENTITIES.put("Ò","Ò"); CODED_ENTITIES.put("Ó","Ó"); CODED_ENTITIES.put("Ô", "Ô"); CODED_ENTITIES.put("Õ","Õ"); CODED_ENTITIES.put("Ö", "Ö"); CODED_ENTITIES.put("×", "×"); CODED_ENTITIES.put("Ø","Ø"); CODED_ENTITIES.put("Ù","Ù"); CODED_ENTITIES.put("Ú","Ú"); CODED_ENTITIES.put("Û", "Û"); CODED_ENTITIES.put("Ü", "Ü"); CODED_ENTITIES.put("Ý","Ý"); CODED_ENTITIES.put("Þ", "Þ"); CODED_ENTITIES.put("ß", "ß"); CODED_ENTITIES.put("à","à"); CODED_ENTITIES.put("á","á"); CODED_ENTITIES.put("â", "â"); CODED_ENTITIES.put("ã","ã"); CODED_ENTITIES.put("ä", "ä"); CODED_ENTITIES.put("å", "å"); CODED_ENTITIES.put("æ", "æ"); CODED_ENTITIES.put("ç","ç"); CODED_ENTITIES.put("è","è"); CODED_ENTITIES.put("é","é"); CODED_ENTITIES.put("ê", "ê"); CODED_ENTITIES.put("ë", "ë"); CODED_ENTITIES.put("ì","ì"); CODED_ENTITIES.put("í","í"); CODED_ENTITIES.put("î", "î"); CODED_ENTITIES.put("ï", "ï"); CODED_ENTITIES.put("ð", "ð"); CODED_ENTITIES.put("ñ","ñ"); CODED_ENTITIES.put("ò","ò"); CODED_ENTITIES.put("ó","ó"); CODED_ENTITIES.put("ô", "ô"); CODED_ENTITIES.put("õ","õ"); CODED_ENTITIES.put("ö", "ö"); CODED_ENTITIES.put("÷","÷"); CODED_ENTITIES.put("ø","ø"); CODED_ENTITIES.put("ù","ù"); CODED_ENTITIES.put("ú","ú"); CODED_ENTITIES.put("û", "û"); CODED_ENTITIES.put("ü", "ü"); CODED_ENTITIES.put("ý","ý"); CODED_ENTITIES.put("þ", "þ"); CODED_ENTITIES.put("ÿ", "ÿ"); // Mathematical, Greek and Symbolic characters for HTML. // HTMLsymbol "-//W3C//ENTITIES Symbols//EN//HTML" CODED_ENTITIES.put("ƒ", "ƒ"); CODED_ENTITIES.put("Α", "Α"); CODED_ENTITIES.put("Β", "Β"); CODED_ENTITIES.put("Γ", "Γ"); CODED_ENTITIES.put("Δ", "Δ"); CODED_ENTITIES.put("Ε", "Ε"); CODED_ENTITIES.put("Ζ", "Ζ"); CODED_ENTITIES.put("Η", "Η"); CODED_ENTITIES.put("Θ", "Θ"); CODED_ENTITIES.put("Ι", "Ι"); CODED_ENTITIES.put("Κ", "Κ"); CODED_ENTITIES.put("Λ", "Λ"); CODED_ENTITIES.put("Μ", "Μ"); CODED_ENTITIES.put("Ν", "Ν"); CODED_ENTITIES.put("Ξ", "Ξ"); CODED_ENTITIES.put("Ο", "Ο"); CODED_ENTITIES.put("Π", "Π"); CODED_ENTITIES.put("Ρ", "Ρ"); CODED_ENTITIES.put("Σ", "Σ"); CODED_ENTITIES.put("Τ", "Τ"); CODED_ENTITIES.put("Υ", "Υ"); CODED_ENTITIES.put("Φ", "Φ"); CODED_ENTITIES.put("Χ", "Χ"); CODED_ENTITIES.put("Ψ", "Ψ"); CODED_ENTITIES.put("Ω", "Ω"); CODED_ENTITIES.put("α", "α"); CODED_ENTITIES.put("β", "β"); CODED_ENTITIES.put("γ", "γ"); CODED_ENTITIES.put("δ", "δ"); CODED_ENTITIES.put("ε", "ε"); CODED_ENTITIES.put("ζ", "ζ"); CODED_ENTITIES.put("η", "η"); CODED_ENTITIES.put("θ", "θ"); CODED_ENTITIES.put("ι", "ι"); CODED_ENTITIES.put("κ", "κ"); CODED_ENTITIES.put("λ", "λ"); CODED_ENTITIES.put("μ", "μ"); CODED_ENTITIES.put("ν", "ν"); CODED_ENTITIES.put("ξ", "ξ"); CODED_ENTITIES.put("ο", "ο"); CODED_ENTITIES.put("π", "π"); CODED_ENTITIES.put("ρ", "ρ"); CODED_ENTITIES.put("ς", "ς"); CODED_ENTITIES.put("σ", "σ"); CODED_ENTITIES.put("τ", "τ"); CODED_ENTITIES.put("υ", "υ"); CODED_ENTITIES.put("φ", "φ"); CODED_ENTITIES.put("χ", "χ"); CODED_ENTITIES.put("ψ", "ψ"); CODED_ENTITIES.put("ω", "ω"); CODED_ENTITIES.put("ϑ", "ϑ"); CODED_ENTITIES.put("ϒ", "ϒ"); CODED_ENTITIES.put("ϖ", "ϖ"); CODED_ENTITIES.put("•", "•"); CODED_ENTITIES.put("…", "…"); CODED_ENTITIES.put("′", "′"); CODED_ENTITIES.put("″", "″"); CODED_ENTITIES.put("‾", "‾"); CODED_ENTITIES.put("⁄", "⁄"); CODED_ENTITIES.put("℘", "℘"); CODED_ENTITIES.put("ℑ", "ℑ"); CODED_ENTITIES.put("ℜ", "ℜ"); CODED_ENTITIES.put("™", "™"); CODED_ENTITIES.put("ℵ", "ℵ"); CODED_ENTITIES.put("←", "←"); CODED_ENTITIES.put("↑", "↑"); CODED_ENTITIES.put("→", "→"); CODED_ENTITIES.put("↓", "↓"); CODED_ENTITIES.put("↔", "↔"); CODED_ENTITIES.put("↵", "↵"); CODED_ENTITIES.put("⇐", "⇐"); CODED_ENTITIES.put("⇑", "⇑"); CODED_ENTITIES.put("⇒", "⇒"); CODED_ENTITIES.put("⇓", "⇓"); CODED_ENTITIES.put("⇔", "⇔"); CODED_ENTITIES.put("∀", "∀"); CODED_ENTITIES.put("∂", "∂"); CODED_ENTITIES.put("∃", "∃"); CODED_ENTITIES.put("∅", "∅"); CODED_ENTITIES.put("∇", "∇"); CODED_ENTITIES.put("∈", "∈"); CODED_ENTITIES.put("∉", "∉"); CODED_ENTITIES.put("∋", "∋"); CODED_ENTITIES.put("∏", "∏"); CODED_ENTITIES.put("∑", "∑"); CODED_ENTITIES.put("−", "−"); CODED_ENTITIES.put("∗", "∗"); CODED_ENTITIES.put("√", "√"); CODED_ENTITIES.put("∝", "∝"); CODED_ENTITIES.put("∞", "∞"); CODED_ENTITIES.put("∠", "∠"); CODED_ENTITIES.put("∧", "∧"); CODED_ENTITIES.put("∨", "∨"); CODED_ENTITIES.put("∩", "∩"); CODED_ENTITIES.put("∪", "∪"); CODED_ENTITIES.put("∫", "∫"); CODED_ENTITIES.put("∴", "∴"); CODED_ENTITIES.put("∼", "∼"); CODED_ENTITIES.put("≅", "≅"); CODED_ENTITIES.put("≈", "≈"); CODED_ENTITIES.put("≠", "≠"); CODED_ENTITIES.put("≡", "≡"); CODED_ENTITIES.put("≤", "≤"); CODED_ENTITIES.put("≥", "≥"); CODED_ENTITIES.put("⊂", "⊂"); CODED_ENTITIES.put("⊃", "⊃"); CODED_ENTITIES.put("⊄", "⊄"); CODED_ENTITIES.put("⊆", "⊆"); CODED_ENTITIES.put("⊇", "⊇"); CODED_ENTITIES.put("⊕", "⊕"); CODED_ENTITIES.put("⊗", "⊗"); CODED_ENTITIES.put("⊥", "⊥"); CODED_ENTITIES.put("⋅", "⋅"); CODED_ENTITIES.put("⌈", "⌈"); CODED_ENTITIES.put("⌉", "⌉"); CODED_ENTITIES.put("⌊", "⌊"); CODED_ENTITIES.put("⌋", "⌋"); CODED_ENTITIES.put("⟨", "〈"); CODED_ENTITIES.put("⟩", "〉"); CODED_ENTITIES.put("◊", "◊"); CODED_ENTITIES.put("♠", "♠"); CODED_ENTITIES.put("♣", "♣"); CODED_ENTITIES.put("♥", "♥"); CODED_ENTITIES.put("♦", "♦"); // Special characters for HTML. // HTMLspecial "-//W3C//ENTITIES Special//EN//HTML" CODED_ENTITIES.put(""", """); CODED_ENTITIES.put("&", "&"); CODED_ENTITIES.put("<", "<"); CODED_ENTITIES.put(">", ">"); CODED_ENTITIES.put("Œ", "Œ"); CODED_ENTITIES.put("œ", "œ"); CODED_ENTITIES.put("Š", "Š"); CODED_ENTITIES.put("š", "š"); CODED_ENTITIES.put("Ÿ", "Ÿ"); CODED_ENTITIES.put("ˆ", "ˆ"); CODED_ENTITIES.put("˜", "˜"); CODED_ENTITIES.put(" ", " "); CODED_ENTITIES.put(" ", " "); CODED_ENTITIES.put(" ", " "); CODED_ENTITIES.put("‌", "‌"); CODED_ENTITIES.put("‍", "‍"); CODED_ENTITIES.put("‎", "‎"); CODED_ENTITIES.put("‏", "‏"); CODED_ENTITIES.put("–", "–"); CODED_ENTITIES.put("—", "—"); CODED_ENTITIES.put("‘", "‘"); CODED_ENTITIES.put("’", "’"); CODED_ENTITIES.put("‚", "‚"); CODED_ENTITIES.put("“", "“"); CODED_ENTITIES.put("”", "”"); CODED_ENTITIES.put("„", "„"); CODED_ENTITIES.put("†", "†"); CODED_ENTITIES.put("‡", "‡"); CODED_ENTITIES.put("‰", "‰"); CODED_ENTITIES.put("‹", "‹"); CODED_ENTITIES.put("›", "›"); CODED_ENTITIES.put("€", "€"); } // // It shouldn't be here but well, just reusing the CODED_ENTITIES Map :) // private static Pattern ENTITIES_PATTERN = Pattern.compile( "&[A-Za-z^#]+;" ); public String processHtmlEntities(String s) { if (s.indexOf('&')==-1) { return s; } StringBuffer sb = new StringBuffer(s.length()); int pos = 0; while (pos<s.length()) { String chunck = s.substring(pos); Matcher m = ENTITIES_PATTERN.matcher(chunck); if (m.find()) { int b = pos + m.start(); int e = pos + m.end(); if (b>pos) { sb.append(s.substring(pos,b)); pos = b; } chunck = s.substring(pos,e); String codedEntity = (String) CODED_ENTITIES.get(chunck); if (codedEntity==null) { codedEntity = chunck; } sb.append(codedEntity); pos = e; } else { sb.append(chunck); pos += chunck.length(); } } return sb.toString(); } }