/*
* Copyright 2005 Sun Microsystems, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.sun.syndication.io.impl;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
 * Reader filter that repairs common defects of real-world XML feeds before
 * they reach an XML parser.
 * <p>
 * Two fixes are applied to the characters of the wrapped reader:
 * <ul>
 * <li>whitespace and XML comments that precede the first real content are
 * dropped, so the stream starts with its first markup or text character;</li>
 * <li>HTML 4 named character entities (for example the non-breaking space
 * entity) are rewritten as numeric character references, which every XML
 * parser understands without a DTD. An ampersand that does not start a
 * well-formed entity reference is rewritten as the escaped ampersand.</li>
 * </ul>
 * Entity references that are not in the HTML 4 tables (including numeric
 * references) are passed through unchanged.
 * <p>
 * Instances keep parsing state between calls and perform no synchronization
 * of their own.
 *
 * @author Alejandro Abdelnur
 */
public class XmlFixerReader extends Reader {

    /** The wrapped reader that supplies the raw characters. */
    protected Reader in;

    // States of the entity-conversion machine driven by read().
    private static final int STATE_PASS_THROUGH = 0;     // copying chars from the stream
    private static final int STATE_READING_ENTITY = 1;   // collecting chars after an ampersand
    private static final int STATE_REPLACING_ENTITY = 2; // complete entity collected, look it up
    private static final int STATE_DRAINING = 3;         // handing out the pending buffer

    // States of the prolog-trimming machine driven by trimStream().
    private static final int TRIM_LEADING = 0;            // before any content
    private static final int TRIM_LT = 1;                 // seen "<"
    private static final int TRIM_LT_BANG = 2;            // seen "<!"
    private static final int TRIM_LT_BANG_DASH = 3;       // seen "<!-"
    private static final int TRIM_IN_COMMENT = 4;         // inside a comment body
    private static final int TRIM_COMMENT_DASH = 5;       // inside a comment, last char was "-"
    private static final int TRIM_COMMENT_DASH_DASH = 6;  // inside a comment, last chars were "--"

    /** Pattern for a named entity reference: ampersand, a letter, letters or digits, semicolon. */
    private static final Pattern ENTITIES_PATTERN = Pattern.compile("&[A-Za-z][A-Za-z0-9]*;");

    /** Maps a named entity reference to the equivalent numeric character reference. */
    private static final Map CODED_ENTITIES = new HashMap();

    /** True once the leading whitespace/comments have been consumed. */
    private boolean trimmed;

    /** Characters already read from the stream that still have to be returned to the caller. */
    private StringBuffer pendingOutput;

    /** Index of the next character of {@link #pendingOutput} to return. */
    private int pendingPos;

    /** Current state of the entity-conversion machine (one of the STATE_ constants). */
    private int state = STATE_PASS_THROUGH;

    /**
     * True when an ampersand was read as the terminator of the text currently
     * in {@link #pendingOutput}; once that text is drained the ampersand is
     * processed as the start of a new entity instead of being emitted raw.
     */
    private boolean pendingEntityStart;

    /**
     * Creates a fixer on top of the given reader.
     *
     * @param in reader supplying the raw XML characters; also used as the lock object.
     */
    public XmlFixerReader(Reader in) {
        super(in);
        this.in = in;
        pendingOutput = new StringBuffer();
        state = STATE_PASS_THROUGH;
    }

    /**
     * Consumes leading whitespace and XML comments from the wrapped reader.
     * <p>
     * Characters that had to be read to decide that they are not part of a
     * leading comment are stored in the pending buffer and the reader is
     * switched to the draining state so that they are returned first.
     *
     * @return false if the stream held nothing but whitespace and complete
     *         comments, true otherwise.
     * @throws IOException if the wrapped reader fails.
     */
    private boolean trimStream() throws IOException {
        int trimState = TRIM_LEADING;
        while (true) {
            int c = in.read();
            if (c == -1) {
                if (trimState == TRIM_LEADING) {
                    return false;
                }
                // EOF in the middle of a tag or an unterminated comment:
                // hand out whatever was collected so far.
                state = STATE_DRAINING;
                return true;
            }
            switch (trimState) {
                case TRIM_LEADING:
                    if (isXmlWhitespace(c)) {
                        break;
                    }
                    pendingOutput.setLength(0);
                    pendingPos = 0;
                    if (c == '<') {
                        pendingOutput.append('<');
                        trimState = TRIM_LT;
                        break;
                    }
                    return endTrim(c);
                case TRIM_LT:
                    if (c == '!') {
                        pendingOutput.append('!');
                        trimState = TRIM_LT_BANG;
                        break;
                    }
                    return endTrim(c);
                case TRIM_LT_BANG:
                    if (c == '-') {
                        pendingOutput.append('-');
                        trimState = TRIM_LT_BANG_DASH;
                        break;
                    }
                    return endTrim(c);
                case TRIM_LT_BANG_DASH:
                    if (c == '-') {
                        pendingOutput.append('-');
                        trimState = TRIM_IN_COMMENT;
                        break;
                    }
                    return endTrim(c);
                case TRIM_IN_COMMENT:
                    // the comment text is kept only so it can be emitted if
                    // the comment turns out to be unterminated
                    pendingOutput.append((char) c);
                    if (c == '-') {
                        trimState = TRIM_COMMENT_DASH;
                    }
                    break;
                case TRIM_COMMENT_DASH:
                    pendingOutput.append((char) c);
                    trimState = (c == '-') ? TRIM_COMMENT_DASH_DASH : TRIM_IN_COMMENT;
                    break;
                case TRIM_COMMENT_DASH_DASH:
                    if (c == '>') {
                        // complete comment: discard it and keep trimming
                        pendingOutput.setLength(0);
                        trimState = TRIM_LEADING;
                    }
                    else {
                        pendingOutput.append((char) c);
                        if (c != '-') {
                            trimState = TRIM_IN_COMMENT;
                        }
                        // on another '-' the last two chars are still "--",
                        // so a following '>' must still close the comment
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Finishes trimming on the first character that is real content.
     *
     * @param c the content character just read (never -1).
     * @return always true, meaning the stream has content.
     */
    private boolean endTrim(int c) {
        if (c == '&') {
            // let read() run the ampersand through the entity machine
            pendingEntityStart = true;
        }
        else {
            pendingOutput.append((char) c);
        }
        state = STATE_DRAINING;
        return true;
    }

    /** Resets the pending buffer to a lone ampersand and starts collecting an entity. */
    private void startEntity() {
        pendingOutput.setLength(0);
        pendingPos = 0;
        pendingOutput.append('&');
        state = STATE_READING_ENTITY;
    }

    /** Tells whether c is one of the four XML whitespace characters. */
    private static boolean isXmlWhitespace(int c) {
        return c == ' ' || c == '\n' || c == '\r' || c == '\t';
    }

    /** Tells whether c may appear between the ampersand and the semicolon of an entity. */
    private static boolean isEntityChar(int c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '#' || (c >= '0' && c <= '9');
    }

    /**
     * Reads a single character of the fixed stream.
     *
     * @return the character read, or -1 at the end of the stream.
     * @throws IOException if the wrapped reader fails.
     */
    public int read() throws IOException {
        if (!trimmed) { // trims XML stream
            trimmed = true;
            if (!trimStream()) {
                return -1;
            }
        }
        while (true) { // converts literal entities to coded entities
            switch (state) {
                case STATE_PASS_THROUGH: {
                    int c = in.read();
                    if (c != '&') {
                        return c; // ordinary char or -1
                    }
                    startEntity();
                    break;
                }
                case STATE_READING_ENTITY: {
                    int c = in.read();
                    if (c == ';') {
                        pendingOutput.append(';');
                        state = STATE_REPLACING_ENTITY;
                    }
                    else if (isEntityChar(c)) {
                        pendingOutput.append((char) c);
                    }
                    else {
                        // no ';' to match the ampersand (other char or EOF):
                        // turn the lone ampersand into its escaped form
                        pendingOutput.insert(1, "amp;");
                        if (c == '&') {
                            // this ampersand may itself start an entity
                            pendingEntityStart = true;
                        }
                        else if (c != -1) {
                            pendingOutput.append((char) c);
                        }
                        state = STATE_DRAINING;
                    }
                    break;
                }
                case STATE_REPLACING_ENTITY: {
                    String codedEntity = (String) CODED_ENTITIES.get(pendingOutput.toString());
                    if (codedEntity != null) {
                        pendingOutput.setLength(0);
                        pendingOutput.append(codedEntity);
                    } // else we leave what was in the stream
                    state = STATE_DRAINING;
                    break;
                }
                case STATE_DRAINING:
                    if (pendingPos < pendingOutput.length()) {
                        return pendingOutput.charAt(pendingPos++);
                    }
                    if (pendingEntityStart) {
                        pendingEntityStart = false;
                        startEntity();
                    }
                    else {
                        state = STATE_PASS_THROUGH;
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads fixed characters into a portion of an array. Keeps reading until
     * {@code len} characters were stored or the end of the stream is reached.
     *
     * @param buffer destination array.
     * @param offset index of the first array element to write.
     * @param len maximum number of characters to read.
     * @return the number of characters stored, 0 if {@code len} is 0, or -1
     *         if the end of the stream was reached before any character.
     * @throws IndexOutOfBoundsException if offset/len do not describe a valid
     *         range of the array.
     * @throws IOException if the wrapped reader fails.
     */
    public int read(char[] buffer,int offset,int len) throws IOException {
        if (offset < 0 || len < 0 || len > buffer.length - offset) {
            throw new IndexOutOfBoundsException();
        }
        if (len == 0) {
            return 0; // must not consume input when nothing was asked for
        }
        int c = read();
        if (c == -1) {
            return -1;
        }
        int charsRead = 0;
        buffer[offset + (charsRead++)] = (char) c;
        while (charsRead < len && (c = read()) > -1) {
            buffer[offset + (charsRead++)] = (char) c;
        }
        return charsRead;
    }

    /**
     * Skips characters of the fixed stream.
     *
     * @param n number of characters to skip.
     * @return the number of characters actually skipped, which is smaller
     *         than {@code n} when the end of the stream is reached first.
     * @throws IllegalArgumentException if {@code n} is negative.
     * @throws IOException if the wrapped reader fails.
     */
    public long skip(long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("'n' cannot be negative");
        }
        long skipped = 0;
        while (skipped < n && read() > -1) {
            skipped++;
        }
        return skipped;
    }

    /**
     * Tells whether a character can be returned right away: either pending
     * output is waiting, or the wrapped reader reports it is ready.
     *
     * @throws IOException if the wrapped reader fails.
     */
    public boolean ready() throws IOException {
        boolean hasPendingOutput = state == STATE_DRAINING && pendingPos < pendingOutput.length();
        return hasPendingOutput || in.ready();
    }

    /** Mark is not supported. */
    public boolean markSupported() {
        return false;
    }

    /**
     * Always fails, mark is not supported.
     *
     * @throws IOException always.
     */
    public void mark(int readAheadLimit) throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Always fails, mark is not supported.
     *
     * @throws IOException always.
     */
    public void reset() throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Closes the wrapped reader.
     *
     * @throws IOException if the wrapped reader fails to close.
     */
    public void close() throws IOException {
        in.close();
    }

    /**
     * Registers one named entity. The reference strings are assembled here
     * from the bare name and code point so that no literal entity text
     * appears in this source file.
     *
     * @param name entity name without the ampersand and the semicolon.
     * @param codePoint decimal code of the character the entity stands for.
     */
    private static void addEntity(String name, int codePoint) {
        CODED_ENTITIES.put("&" + name + ";", "&#" + codePoint + ";");
    }

    static {
        // note: refer to Character entity references in HTML 4
        // at http://www.w3.org/TR/REC-html40/sgml/entities.html

        // Character entity set.
        // HTMLlat1 "-//W3C//ENTITIES Latin 1//EN//HTML"
        addEntity("nbsp", 160);
        addEntity("iexcl", 161);
        addEntity("cent", 162);
        addEntity("pound", 163);
        addEntity("curren", 164);
        addEntity("yen", 165);
        addEntity("brvbar", 166);
        addEntity("sect", 167);
        addEntity("uml", 168);
        addEntity("copy", 169);
        addEntity("ordf", 170);
        addEntity("laquo", 171);
        addEntity("not", 172);
        addEntity("shy", 173);
        addEntity("reg", 174);
        addEntity("macr", 175);
        addEntity("deg", 176);
        addEntity("plusmn", 177);
        addEntity("sup2", 178);
        addEntity("sup3", 179);
        addEntity("acute", 180);
        addEntity("micro", 181);
        addEntity("para", 182);
        addEntity("middot", 183);
        addEntity("cedil", 184);
        addEntity("sup1", 185);
        addEntity("ordm", 186);
        addEntity("raquo", 187);
        addEntity("frac14", 188);
        addEntity("frac12", 189);
        addEntity("frac34", 190);
        addEntity("iquest", 191);
        addEntity("Agrave", 192);
        addEntity("Aacute", 193);
        addEntity("Acirc", 194);
        addEntity("Atilde", 195);
        addEntity("Auml", 196);
        addEntity("Aring", 197);
        addEntity("AElig", 198);
        addEntity("Ccedil", 199);
        addEntity("Egrave", 200);
        addEntity("Eacute", 201);
        addEntity("Ecirc", 202);
        addEntity("Euml", 203);
        addEntity("Igrave", 204);
        addEntity("Iacute", 205);
        addEntity("Icirc", 206);
        addEntity("Iuml", 207);
        addEntity("ETH", 208);
        addEntity("Ntilde", 209);
        addEntity("Ograve", 210);
        addEntity("Oacute", 211);
        addEntity("Ocirc", 212);
        addEntity("Otilde", 213);
        addEntity("Ouml", 214);
        addEntity("times", 215);
        addEntity("Oslash", 216);
        addEntity("Ugrave", 217);
        addEntity("Uacute", 218);
        addEntity("Ucirc", 219);
        addEntity("Uuml", 220);
        addEntity("Yacute", 221);
        addEntity("THORN", 222);
        addEntity("szlig", 223);
        addEntity("agrave", 224);
        addEntity("aacute", 225);
        addEntity("acirc", 226);
        addEntity("atilde", 227);
        addEntity("auml", 228);
        addEntity("aring", 229);
        addEntity("aelig", 230);
        addEntity("ccedil", 231);
        addEntity("egrave", 232);
        addEntity("eacute", 233);
        addEntity("ecirc", 234);
        addEntity("euml", 235);
        addEntity("igrave", 236);
        addEntity("iacute", 237);
        addEntity("icirc", 238);
        addEntity("iuml", 239);
        addEntity("eth", 240);
        addEntity("ntilde", 241);
        addEntity("ograve", 242);
        addEntity("oacute", 243);
        addEntity("ocirc", 244);
        addEntity("otilde", 245);
        addEntity("ouml", 246);
        addEntity("divide", 247);
        addEntity("oslash", 248);
        addEntity("ugrave", 249);
        addEntity("uacute", 250);
        addEntity("ucirc", 251);
        addEntity("uuml", 252);
        addEntity("yacute", 253);
        addEntity("thorn", 254);
        addEntity("yuml", 255);

        // Mathematical, Greek and Symbolic characters for HTML.
        // HTMLsymbol "-//W3C//ENTITIES Symbols//EN//HTML"
        addEntity("fnof", 402);
        addEntity("Alpha", 913);
        addEntity("Beta", 914);
        addEntity("Gamma", 915);
        addEntity("Delta", 916);
        addEntity("Epsilon", 917);
        addEntity("Zeta", 918);
        addEntity("Eta", 919);
        addEntity("Theta", 920);
        addEntity("Iota", 921);
        addEntity("Kappa", 922);
        addEntity("Lambda", 923);
        addEntity("Mu", 924);
        addEntity("Nu", 925);
        addEntity("Xi", 926);
        addEntity("Omicron", 927);
        addEntity("Pi", 928);
        addEntity("Rho", 929);
        addEntity("Sigma", 931);
        addEntity("Tau", 932);
        addEntity("Upsilon", 933);
        addEntity("Phi", 934);
        addEntity("Chi", 935);
        addEntity("Psi", 936);
        addEntity("Omega", 937);
        addEntity("alpha", 945);
        addEntity("beta", 946);
        addEntity("gamma", 947);
        addEntity("delta", 948);
        addEntity("epsilon", 949);
        addEntity("zeta", 950);
        addEntity("eta", 951);
        addEntity("theta", 952);
        addEntity("iota", 953);
        addEntity("kappa", 954);
        addEntity("lambda", 955);
        addEntity("mu", 956);
        addEntity("nu", 957);
        addEntity("xi", 958);
        addEntity("omicron", 959);
        addEntity("pi", 960);
        addEntity("rho", 961);
        addEntity("sigmaf", 962);
        addEntity("sigma", 963);
        addEntity("tau", 964);
        addEntity("upsilon", 965);
        addEntity("phi", 966);
        addEntity("chi", 967);
        addEntity("psi", 968);
        addEntity("omega", 969);
        addEntity("thetasym", 977);
        addEntity("upsih", 978);
        addEntity("piv", 982);
        addEntity("bull", 8226);
        addEntity("hellip", 8230);
        addEntity("prime", 8242);
        addEntity("Prime", 8243);
        addEntity("oline", 8254);
        addEntity("frasl", 8260);
        addEntity("weierp", 8472);
        addEntity("image", 8465);
        addEntity("real", 8476);
        addEntity("trade", 8482);
        addEntity("alefsym", 8501);
        addEntity("larr", 8592);
        addEntity("uarr", 8593);
        addEntity("rarr", 8594);
        addEntity("darr", 8595);
        addEntity("harr", 8596);
        addEntity("crarr", 8629);
        addEntity("lArr", 8656);
        addEntity("uArr", 8657);
        addEntity("rArr", 8658);
        addEntity("dArr", 8659);
        addEntity("hArr", 8660);
        addEntity("forall", 8704);
        addEntity("part", 8706);
        addEntity("exist", 8707);
        addEntity("empty", 8709);
        addEntity("nabla", 8711);
        addEntity("isin", 8712);
        addEntity("notin", 8713);
        addEntity("ni", 8715);
        addEntity("prod", 8719);
        addEntity("sum", 8721);
        addEntity("minus", 8722);
        addEntity("lowast", 8727);
        addEntity("radic", 8730);
        addEntity("prop", 8733);
        addEntity("infin", 8734);
        addEntity("ang", 8736);
        addEntity("and", 8743);
        addEntity("or", 8744);
        addEntity("cap", 8745);
        addEntity("cup", 8746);
        addEntity("int", 8747);
        addEntity("there4", 8756);
        addEntity("sim", 8764);
        addEntity("cong", 8773);
        addEntity("asymp", 8776);
        addEntity("ne", 8800);
        addEntity("equiv", 8801);
        addEntity("le", 8804);
        addEntity("ge", 8805);
        addEntity("sub", 8834);
        addEntity("sup", 8835);
        addEntity("nsub", 8836);
        addEntity("sube", 8838);
        addEntity("supe", 8839);
        addEntity("oplus", 8853);
        addEntity("otimes", 8855);
        addEntity("perp", 8869);
        addEntity("sdot", 8901);
        addEntity("lceil", 8968);
        addEntity("rceil", 8969);
        addEntity("lfloor", 8970);
        addEntity("rfloor", 8971);
        addEntity("lang", 9001);
        addEntity("rang", 9002);
        addEntity("loz", 9674);
        addEntity("spades", 9824);
        addEntity("clubs", 9827);
        addEntity("hearts", 9829);
        addEntity("diams", 9830);

        // Special characters for HTML.
        // HTMLspecial "-//W3C//ENTITIES Special//EN//HTML"
        addEntity("quot", 34);
        addEntity("amp", 38);
        addEntity("lt", 60);
        addEntity("gt", 62);
        addEntity("OElig", 338);
        addEntity("oelig", 339);
        addEntity("Scaron", 352);
        addEntity("scaron", 353);
        addEntity("Yuml", 376);
        addEntity("circ", 710);
        addEntity("tilde", 732);
        addEntity("ensp", 8194);
        addEntity("emsp", 8195);
        addEntity("thinsp", 8201);
        addEntity("zwnj", 8204);
        addEntity("zwj", 8205);
        addEntity("lrm", 8206);
        addEntity("rlm", 8207);
        addEntity("ndash", 8211);
        addEntity("mdash", 8212);
        addEntity("lsquo", 8216);
        addEntity("rsquo", 8217);
        addEntity("sbquo", 8218);
        addEntity("ldquo", 8220);
        addEntity("rdquo", 8221);
        addEntity("bdquo", 8222);
        addEntity("dagger", 8224);
        addEntity("Dagger", 8225);
        addEntity("permil", 8240);
        addEntity("lsaquo", 8249);
        addEntity("rsaquo", 8250);
        addEntity("euro", 8364);
    }

    //
    // It shouldn't be here but well, just reusing the CODED_ENTITIES Map :)
    //

    /**
     * Replaces every HTML 4 named entity reference in a string with the
     * equivalent numeric character reference. Text that is not a known
     * entity reference (unknown names, numeric references, lone ampersands)
     * is copied unchanged.
     *
     * @param s the text to process, must not be null.
     * @return the processed text; {@code s} itself when it holds no ampersand.
     */
    public String processHtmlEntities(String s) {
        if (s.indexOf('&') == -1) {
            return s;
        }
        StringBuffer sb = new StringBuffer(s.length());
        Matcher m = ENTITIES_PATTERN.matcher(s);
        int pos = 0;
        while (m.find()) {
            sb.append(s.substring(pos, m.start()));
            String literalEntity = m.group();
            String codedEntity = (String) CODED_ENTITIES.get(literalEntity);
            sb.append(codedEntity != null ? codedEntity : literalEntity);
            pos = m.end();
        }
        sb.append(s.substring(pos));
        return sb.toString();
    }
}