package org.orbeon.saxon.event; import org.orbeon.saxon.tinytree.CompressedWhitespace; import org.orbeon.saxon.trans.XPathException; public class HTML1252Emitter extends HTMLEmitter { public static char[] CP1252_BEST_FIT = new char[] { '\u007f', // Delete '\u20ac', // Euro Sign '\u0081', '\u201a', // Single Low-9 Quotation Mark '\u0192', // Latin Small Letter F With Hook '\u201e', // Double Low-9 Quotation Mark '\u2026', // Horizontal Ellipsis '\u2020', // Dagger '\u2021', // Double Dagger '\u02c6', // Modifier Letter Circumflex Accent '\u2030', // Per Mille Sign '\u0160', // Latin Capital Letter S With Caron '\u2039', // Single Left-Pointing Angle Quotation Mark '\u0152', // Latin Capital Ligature Oe '\u008d', '\u017d', // Latin Capital Letter Z With Caron '\u008f', '\u0090', '\u2018', // Left Single Quotation Mark '\u2019', // Right Single Quotation Mark '\u201c', // Left Double Quotation Mark '\u201d', // Right Double Quotation Mark '\u2022', // Bullet '\u2013', // En Dash '\u2014', // Em Dash '\u02dc', // Small Tilde '\u2122', // Trade Mark Sign '\u0161', // Latin Small Letter S With Caron '\u203a', // Single Right-Pointing Angle Quotation Mark '\u0153', // Latin Small Ligature Oe '\u009d', '\u017e', // Latin Small Letter Z With Caron '\u0178' // Latin Capital Letter Y With Diaeresis }; // This method overrides the Saxon method so we can do a custom fix-up of invalid CP-1252 characters. @Override protected void writeEscape(final CharSequence chars, final boolean inAttribute) throws java.io.IOException, XPathException { int segstart = 0; final boolean[] specialChars = (inAttribute ? specialInAtt : specialInText); if (chars instanceof CompressedWhitespace) { ((CompressedWhitespace)chars).writeEscape(specialChars, writer); return; } boolean disabled = false; while (segstart < chars.length()) { int i = segstart; // find a maximal sequence of "ordinary" characters if (nonASCIIRepresentation == REP_NATIVE) { char c; while (i < chars.length() && ((c = chars.charAt(i)) < 127 ? !specialChars[c] : (characterSet.inCharset(c) && c > 160) ) ) { i++; } } else { char c; while (i < chars.length() && (c = chars.charAt(i)) < 127 && !specialChars[c]) { i++; } } // if this was the whole string, output the string and quit if (i == chars.length()) { if (segstart == 0) { writeCharSequence(chars); } else { writeCharSequence(chars.subSequence(segstart, i)); } return; } // otherwise, output this sequence and continue if (i > segstart) { writeCharSequence(chars.subSequence(segstart, i)); } final char c = chars.charAt(i); if (c==0) { // used to switch escaping on and off // See https://github.com/orbeon/orbeon-forms/issues/3115 // disabled = !disabled; } else if (disabled) { writer.write(c); } else if (c<=127) { // handle a special ASCII character if (inAttribute) { if (c=='<') { writer.write('<'); // not escaped } else if (c=='>') { writer.write(">"); // recommended for older browsers } else if (c=='&') { if (i+1<chars.length() && chars.charAt(i+1)=='{') { writer.write('&'); // not escaped if followed by '{' } else { writer.write("&"); } } else if (c=='\"') { writer.write("""); } else if (c=='\n') { writer.write(" "); } else if (c=='\t') { writer.write(" "); } else if (c=='\r') { writer.write(" "); } } else { if (c=='<') { writer.write("<"); } else if (c=='>') { writer.write(">"); // changed to allow for "]]>" } else if (c=='&') { writer.write("&"); } else if (c=='\r') { writer.write(" "); } } } else if (c==160) { // always output NBSP as an entity reference writer.write(" "); } else if (c>=127 && c<160) { // these control characters are illegal in HTML // ORBEON: Handle faulty CP-1252 characters ending up as unicode code points. outputCharacterReference(CP1252_BEST_FIT[c - 127]); } else if (c>=55296 && c<=56319) { //handle surrogate pair //A surrogate pair is two consecutive Unicode characters. The first //is in the range D800 to DBFF, the second is in the range DC00 to DFFF. //To compute the numeric value of the character corresponding to a surrogate //pair, use this formula (all numbers are hex): //(FirstChar - D800) * 400 + (SecondChar - DC00) + 10000 // we'll trust the data to be sound int charval = (((int)c - 55296) * 1024) + ((int)chars.charAt(i+1) - 56320) + 65536; outputCharacterReference(charval); i++; } else if (characterSet.inCharset(c)) { switch(nonASCIIRepresentation) { case REP_NATIVE: writer.write(c); break; case REP_ENTITY: if (c>160 && c<=255) { // if chararacter in iso-8859-1, use an entity reference writer.write('&'); writer.write(latin1Entities[(int)c-160]); writer.write(';'); break; } // else fall through case REP_DECIMAL: preferHex = false; outputCharacterReference(c); break; case REP_HEX: preferHex = true; // fall through default: outputCharacterReference(c); break; } } else { // Character not present in encoding switch(excludedRepresentation) { case REP_ENTITY: if (c>160 && c<=255) { // if chararacter in iso-8859-1, use an entity reference writer.write('&'); writer.write(latin1Entities[(int)c-160]); writer.write(';'); break; } // else fall through case REP_NATIVE: case REP_DECIMAL: preferHex = false; outputCharacterReference(c); break; case REP_HEX: preferHex = true; // fall through default: outputCharacterReference(c); break; } } segstart = ++i; } } } // // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License"); // you may not use this file except in compliance with the License. You may obtain a copy of the // License at http://www.mozilla.org/MPL/ // // Software distributed under the License is distributed on an "AS IS" basis, // WITHOUT WARRANTY OF ANY KIND, either express or implied. // See the License for the specific language governing rights and limitations under the License. // // The Original Code is: all this file. // // The Initial Developer of the Original Code is Michael H. Kay. // // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved. // // Contributor(s): none. //