/*
 * =============================================================================
 *
 *   Copyright (c) 2014, The UNBESCAPE team (http://www.unbescape.org)
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 *
 * =============================================================================
 */
package org.zkoss.lang;

import java.io.IOException;
import java.io.Writer;
import java.util.Arrays;

/**
 * <p>
 *   Internal class in charge of performing the real escape/unescape operations.
 * </p>
 *
 * @author Daniel Fernández
 *
 * @since 1.0.0
 *
 */
/*package*/ final class JavaScriptEscapeUtil {


    /*
     * JAVASCRIPT ESCAPE/UNESCAPE OPERATIONS
     * -------------------------------------
     *
     *   See: http://www.ecmascript.org/docs.php
     *        http://mathiasbynens.be/notes/javascript-escapes
     *
     *   (Note that, in the following examples, and in order to avoid escape problems during the compilation
     *    of this class, the backslash symbol is replaced by '%')
     *
     *   - SINGLE ESCAPE CHARACTERS (SECs):
     *        U+0000 -> %0
     *        U+0008 -> %b
     *        U+0009 -> %t
     *        U+000A -> %n
     *        U+000B -> %v  [NOT USED IN ESCAPE - Not supported by Internet Explorer < 9]
     *        U+000C -> %f
     *        U+000D -> %r
     *        U+0022 -> %"
     *        U+0027 -> %'
     *        U+005C -> %%
     *        U+002F -> %/  [ONLY USED WHEN / APPEARS IN </, IN ORDER TO AVOID ISSUES INSIDE <script> TAGS]
     *   - HEXADECIMAL ESCAPE [XHEXA] (only for characters <= U+00FF): %x??
     *   - UNICODE ESCAPE [UHEXA] (also hexadecimal)
     *        Characters <= U+FFFF: %u????
     *        Characters > U+FFFF : %u????%u???? (surrogate character pair)
     *                              %u{?*} [NOT USED - Possible syntax for ECMAScript 6]
     *   - OCTAL ESCAPE: %377  [NOT USED IN ESCAPE - Deprecated in ECMAScript v5]
     *   - GENERAL ESCAPE: %* -> * ('%a' -> 'a')
     *                     (except the [%,%n] sequence, which is not an escape sequence but a line continuation)
     *
     */


    /*
     * Prefixes defined for use in escape and unescape operations
     */
    private static final char ESCAPE_PREFIX = '\\';
    private static final char ESCAPE_XHEXA_PREFIX2 = 'x';
    private static final char ESCAPE_UHEXA_PREFIX2 = 'u';
    private static final char[] ESCAPE_XHEXA_PREFIX = "\\x".toCharArray();
    private static final char[] ESCAPE_UHEXA_PREFIX = "\\u".toCharArray();

    /*
     * Small utility char arrays for hexadecimal conversion.
     */
    private static final char[] HEXA_CHARS_UPPER = "0123456789ABCDEF".toCharArray();
    private static final char[] HEXA_CHARS_LOWER = "0123456789abcdef".toCharArray();

    /*
     * Structures for holding the Single Escape Characters
     */
    private static final int SEC_CHARS_LEN = '\\' + 1;   // 0x5C + 1 = 0x5D
    private static final char SEC_CHARS_NO_SEC = '*';
    private static final char[] SEC_CHARS;

    /*
     * Structure for holding the 'escape level' assigned to chars (not codepoints) up to ESCAPE_LEVELS_LEN.
     * - The last position of the ESCAPE_LEVELS array will be used for determining the level of all
     *   codepoints >= (ESCAPE_LEVELS_LEN - 1)
     */
    private static final char ESCAPE_LEVELS_LEN = 0x9f + 2; // Last relevant char to be indexed is 0x9f
    private static final byte[] ESCAPE_LEVELS;


    static {

        /*
         * Initialize Single Escape Characters
         */
        SEC_CHARS = new char[SEC_CHARS_LEN];
        Arrays.fill(SEC_CHARS, SEC_CHARS_NO_SEC);
        SEC_CHARS[0x00] = '0';
        SEC_CHARS[0x08] = 'b';
        SEC_CHARS[0x09] = 't';
        SEC_CHARS[0x0A] = 'n';
        SEC_CHARS[0x0C] = 'f';
        SEC_CHARS[0x0D] = 'r';
        SEC_CHARS[0x22] = '"';
        SEC_CHARS[0x27] = '\'';
        SEC_CHARS[0x5C] = '\\';
        // slash (solidus) character: will only be escaped if in '</'
        SEC_CHARS[0x2F] = '/';

        /*
         * Initialization of escape levels.
         * Defined levels :
         *
         *    - Level 1 : Basic escape set
         *    - Level 2 : Basic escape set plus all non-ASCII
         *    - Level 3 : All non-alphanumeric characters
         *    - Level 4 : All characters
         *
         */
        ESCAPE_LEVELS = new byte[ESCAPE_LEVELS_LEN];

        /*
         * Everything is level 3 unless contrary indication.
         */
        Arrays.fill(ESCAPE_LEVELS, (byte) 3);

        /*
         * Everything non-ASCII is level 2 unless contrary indication.
         */
        for (char c = 0x80; c < ESCAPE_LEVELS_LEN; c++) {
            ESCAPE_LEVELS[c] = 2;
        }

        /*
         * Alphanumeric characters are level 4.
         */
        for (char c = 'A'; c <= 'Z'; c++) {
            ESCAPE_LEVELS[c] = 4;
        }
        for (char c = 'a'; c <= 'z'; c++) {
            ESCAPE_LEVELS[c] = 4;
        }
        for (char c = '0'; c <= '9'; c++) {
            ESCAPE_LEVELS[c] = 4;
        }

        /*
         * Simple Escape Character will be level 1 (always escaped)
         */
        ESCAPE_LEVELS[0x00] = 1;
        ESCAPE_LEVELS[0x08] = 1;
        ESCAPE_LEVELS[0x09] = 1;
        ESCAPE_LEVELS[0x0A] = 1;
        ESCAPE_LEVELS[0x0C] = 1;
        ESCAPE_LEVELS[0x0D] = 1;
        ESCAPE_LEVELS[0x22] = 1;
        ESCAPE_LEVELS[0x27] = 1;
        ESCAPE_LEVELS[0x5C] = 1;
        // slash (solidus) character: will only be escaped if in '</', but we signal it as level 1 anyway
        ESCAPE_LEVELS[0x2F] = 1;

        /*
         * JavaScript defines two ranges of non-displayable, control characters (some of which are already part of the
         * Single Escape Characters list): U+0001 to U+001F and U+007F to U+009F.
         */
        for (char c = 0x01; c <= 0x1F; c++) {
            ESCAPE_LEVELS[c] = 1;
        }
        for (char c = 0x7F; c <= 0x9F; c++) {
            ESCAPE_LEVELS[c] = 1;
        }

    }


    private JavaScriptEscapeUtil() {
        super();
    }


    /**
     * Formats the lowest 8 bits of a (non-negative) codepoint as two upper-case hexadecimal digits.
     *
     * @param codepoint the codepoint to format; must be non-negative.
     * @return a new two-position char array containing the hexadecimal digits.
     */
    static char[] toXHexa(final int codepoint) {
        final char[] result = new char[2];
        result[1] = HEXA_CHARS_UPPER[codepoint % 0x10];
        result[0] = HEXA_CHARS_UPPER[(codepoint >>> 4) % 0x10];
        return result;
    }


    /**
     * Formats the lowest 16 bits of a (non-negative) codepoint as four upper-case hexadecimal digits.
     *
     * @param codepoint the codepoint (or UTF-16 code unit) to format; must be non-negative.
     * @return a new four-position char array containing the hexadecimal digits.
     */
    static char[] toUHexa(final int codepoint) {
        final char[] result = new char[4];
        result[3] = HEXA_CHARS_UPPER[codepoint % 0x10];
        result[2] = HEXA_CHARS_UPPER[(codepoint >>> 4) % 0x10];
        result[1] = HEXA_CHARS_UPPER[(codepoint >>> 8) % 0x10];
        result[0] = HEXA_CHARS_UPPER[(codepoint >>> 12) % 0x10];
        return result;
    }


    /**
     * Tells whether a char is an ASCII hexadecimal digit (0-9, A-F, a-f).
     */
    private static boolean isHexaChar(final char c) {
        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
    }


    /**
     * Performs an escape operation, based on String, according to the specified level and type.
     *
     * @param text the text to be escaped; may be null.
     * @param escapeType determines whether SECs and/or XHEXA escapes may be used.
     * @param escapeLevel determines which characters are escaped.
     * @return null if the input is null, the same String instance if nothing needed escaping,
     *         or a new String with the escaped result otherwise.
     */
    static String escape(final String text,
                         final JavaScriptEscapeType escapeType, final JavaScriptEscapeLevel escapeLevel) {

        if (text == null) {
            return null;
        }

        final int level = escapeLevel.getEscapeLevel();
        final boolean useSECs = escapeType.getUseSECs();
        final boolean useXHexa = escapeType.getUseXHexa();

        StringBuilder strBuilder = null;

        final int offset = 0;
        final int max = text.length();

        int readOffset = offset;

        for (int i = offset; i < max; i++) {

            final int codepoint = Character.codePointAt(text, i);

            /*
             * Shortcut: most characters will be ASCII/Alphanumeric, and we won't need to do anything at
             * all for them
             */
            if (codepoint <= (ESCAPE_LEVELS_LEN - 2) && level < ESCAPE_LEVELS[codepoint]) {
                continue;
            }

            /*
             * Check whether the character is a slash (solidus). In such case, only escape if it
             * appears after a '<' ('</') or level >= 3 (non alphanumeric)
             */
            if (codepoint == '/' && level < 3 && (i == 0 || text.charAt(i - 1) != '<')) {
                continue;
            }

            /*
             * Shortcut: we might not want to escape non-ASCII chars at all either.
             * We also check we are not dealing with U+2028 or U+2029, which the JavaScript spec considers
             * LineTerminators and therefore should be escaped always.
             */
            if (codepoint > (ESCAPE_LEVELS_LEN - 2)
                    && level < ESCAPE_LEVELS[ESCAPE_LEVELS_LEN - 1]
                    && codepoint != '\u2028' && codepoint != '\u2029') {

                if (Character.charCount(codepoint) > 1) {
                    // This is to compensate that we are actually escaping two char[] positions with a single codepoint.
                    i++;
                }

                continue;

            }

            /*
             * At this point we know for sure we will need some kind of escape, so we
             * can increase the offset and initialize the string builder if needed, along with
             * copying to it all the contents pending up to this point.
             */
            if (strBuilder == null) {
                strBuilder = new StringBuilder(max + 20);
            }

            if (i - readOffset > 0) {
                strBuilder.append(text, readOffset, i);
            }

            if (Character.charCount(codepoint) > 1) {
                // This is to compensate that we are actually reading two char[] positions with a single codepoint.
                i++;
            }

            readOffset = i + 1;

            /*
             * -----------------------------------------------------------------------------------------
             *
             * Perform the real escape, attending the different combinations of SECs, XHEXA and UHEXA
             *
             * -----------------------------------------------------------------------------------------
             */

            if (useSECs && codepoint < SEC_CHARS_LEN) {
                // We will try to use a SEC

                final char sec = SEC_CHARS[codepoint];

                if (sec != SEC_CHARS_NO_SEC) {
                    // SEC found! just write it and go for the next char
                    strBuilder.append(ESCAPE_PREFIX);
                    strBuilder.append(sec);
                    continue;
                }

            }

            /*
             * No SEC-escape was possible, so we need xhexa/uhexa escape.
             */

            if (useXHexa && codepoint <= 0xFF) {
                // Codepoint is <= 0xFF, so we can use XHEXA escapes
                strBuilder.append(ESCAPE_XHEXA_PREFIX);
                strBuilder.append(toXHexa(codepoint));
                continue;
            }

            if (Character.charCount(codepoint) > 1) {
                // Supplementary codepoint: emit one UHEXA escape per surrogate
                final char[] codepointChars = Character.toChars(codepoint);
                strBuilder.append(ESCAPE_UHEXA_PREFIX);
                strBuilder.append(toUHexa(codepointChars[0]));
                strBuilder.append(ESCAPE_UHEXA_PREFIX);
                strBuilder.append(toUHexa(codepointChars[1]));
                continue;
            }

            strBuilder.append(ESCAPE_UHEXA_PREFIX);
            strBuilder.append(toUHexa(codepoint));

        }

        /*
         * -----------------------------------------------------------------------------------------------
         * Final cleaning: return the original String object if no escape was actually needed. Otherwise
         *                 append the remaining unescaped text to the string builder and return.
         * -----------------------------------------------------------------------------------------------
         */

        if (strBuilder == null) {
            return text;
        }

        if (max - readOffset > 0) {
            strBuilder.append(text, readOffset, max);
        }

        return strBuilder.toString();

    }


    /**
     * Performs an escape operation, based on char[], according to the specified level and type,
     * writing the result to the specified writer.
     *
     * @param text the char array containing the text to be escaped; nothing is written if null or empty.
     * @param offset the position in the array at which escaping starts.
     * @param len the number of chars to process, starting at offset.
     * @param writer the writer to which the (escaped and unescaped) output is written.
     * @param escapeType determines whether SECs and/or XHEXA escapes may be used.
     * @param escapeLevel determines which characters are escaped.
     * @throws IOException if the writer fails.
     */
    static void escape(final char[] text, final int offset, final int len, final Writer writer,
                       final JavaScriptEscapeType escapeType, final JavaScriptEscapeLevel escapeLevel)
                       throws IOException {

        if (text == null || text.length == 0) {
            return;
        }

        final int level = escapeLevel.getEscapeLevel();
        final boolean useSECs = escapeType.getUseSECs();
        final boolean useXHexa = escapeType.getUseXHexa();

        final int max = (offset + len);

        int readOffset = offset;

        for (int i = offset; i < max; i++) {

            // The limit-aware variant is used so that a high surrogate sitting at the last position of the
            // requested range is never combined with a low surrogate that lies outside [offset, offset + len).
            final int codepoint = Character.codePointAt(text, i, max);

            /*
             * Shortcut: most characters will be ASCII/Alphanumeric, and we won't need to do anything at
             * all for them
             */
            if (codepoint <= (ESCAPE_LEVELS_LEN - 2) && level < ESCAPE_LEVELS[codepoint]) {
                continue;
            }

            /*
             * Check whether the character is a slash (solidus). In such case, only escape if it
             * appears after a '<' ('</') or level >= 3 (non alphanumeric)
             */
            if (codepoint == '/' && level < 3 && (i == 0 || text[i - 1] != '<')) {
                continue;
            }

            /*
             * Shortcut: we might not want to escape non-ASCII chars at all either.
             * We also check we are not dealing with U+2028 or U+2029, which the JavaScript spec considers
             * LineTerminators and therefore should be escaped always.
             */
            if (codepoint > (ESCAPE_LEVELS_LEN - 2)
                    && level < ESCAPE_LEVELS[ESCAPE_LEVELS_LEN - 1]
                    && codepoint != '\u2028' && codepoint != '\u2029') {

                if (Character.charCount(codepoint) > 1) {
                    // This is to compensate that we are actually escaping two char[] positions with a single codepoint.
                    i++;
                }

                continue;

            }

            /*
             * At this point we know for sure we will need some kind of escape, so we
             * can write all the contents pending up to this point.
             */
            if (i - readOffset > 0) {
                writer.write(text, readOffset, (i - readOffset));
            }

            if (Character.charCount(codepoint) > 1) {
                // This is to compensate that we are actually reading two char[] positions with a single codepoint.
                i++;
            }

            readOffset = i + 1;

            /*
             * -----------------------------------------------------------------------------------------
             *
             * Perform the real escape, attending the different combinations of SECs, XHEXA and UHEXA
             *
             * -----------------------------------------------------------------------------------------
             */

            if (useSECs && codepoint < SEC_CHARS_LEN) {
                // We will try to use a SEC

                final char sec = SEC_CHARS[codepoint];

                if (sec != SEC_CHARS_NO_SEC) {
                    // SEC found! just write it and go for the next char
                    writer.write(ESCAPE_PREFIX);
                    writer.write(sec);
                    continue;
                }

            }

            /*
             * No SEC-escape was possible, so we need xhexa/uhexa escape.
             */

            if (useXHexa && codepoint <= 0xFF) {
                // Codepoint is <= 0xFF, so we can use XHEXA escapes
                writer.write(ESCAPE_XHEXA_PREFIX);
                writer.write(toXHexa(codepoint));
                continue;
            }

            if (Character.charCount(codepoint) > 1) {
                // Supplementary codepoint: emit one UHEXA escape per surrogate
                final char[] codepointChars = Character.toChars(codepoint);
                writer.write(ESCAPE_UHEXA_PREFIX);
                writer.write(toUHexa(codepointChars[0]));
                writer.write(ESCAPE_UHEXA_PREFIX);
                writer.write(toUHexa(codepointChars[1]));
                continue;
            }

            writer.write(ESCAPE_UHEXA_PREFIX);
            writer.write(toUHexa(codepoint));

        }

        /*
         * -----------------------------------------------------------------------------------------------
         * Final cleaning: write the remaining unescaped text (if any) to the writer.
         * -----------------------------------------------------------------------------------------------
         */

        if (max - readOffset > 0) {
            writer.write(text, readOffset, (max - readOffset));
        }

    }


    /*
     * These methods (the two versions) are used instead of Integer.parseInt(str,radix) in order to avoid the need
     * to create substrings of the text being unescaped to feed such method.
     *   - No need to check all chars are within the radix limits - reference parsing code will already have done so.
     */

    /**
     * Parses the digits in text[start, end) as a non-negative integer in the given radix.
     * Digits are looked up (case-insensitively) in the hexadecimal digit tables; callers are expected to
     * have validated the digits beforehand, as no validation is performed here.
     */
    static int parseIntFromReference(final String text, final int start, final int end, final int radix) {
        int result = 0;
        for (int i = start; i < end; i++) {
            final char c = text.charAt(i);
            int n = -1;
            for (int j = 0; j < HEXA_CHARS_UPPER.length; j++) {
                if (c == HEXA_CHARS_UPPER[j] || c == HEXA_CHARS_LOWER[j]) {
                    n = j;
                    break;
                }
            }
            result = (radix * result) + n;
        }
        return result;
    }


    /**
     * Parses the digits in text[start, end) as a non-negative integer in the given radix.
     * Digits are looked up (case-insensitively) in the hexadecimal digit tables; callers are expected to
     * have validated the digits beforehand, as no validation is performed here.
     */
    static int parseIntFromReference(final char[] text, final int start, final int end, final int radix) {
        int result = 0;
        for (int i = start; i < end; i++) {
            final char c = text[i];
            int n = -1;
            for (int j = 0; j < HEXA_CHARS_UPPER.length; j++) {
                if (c == HEXA_CHARS_UPPER[j] || c == HEXA_CHARS_LOWER[j]) {
                    n = j;
                    break;
                }
            }
            result = (radix * result) + n;
        }
        return result;
    }


    /**
     * Determines whether the (up to three) chars starting at 'start' form an octal escape, as opposed to
     * the U+0000 escape sequence (a '0') possibly followed by literal digits.
     *
     * @param text the text being unescaped.
     * @param start the position of the first char after the escape prefix.
     * @param end the (exclusive) limit of the text to be examined.
     * @return true if the sequence should be treated as an octal escape, false if not.
     */
    static boolean isOctalEscape(final String text, final int start, final int end) {

        if (start >= end) {
            return false;
        }

        final char c1 = text.charAt(start);
        if (c1 < '0' || c1 > '7') {
            return false;
        }

        if (start + 1 >= end) {
            return (c1 != '0'); // It would not be an octal escape, but the U+0000 escape sequence.
        }

        final char c2 = text.charAt(start + 1);
        if (c2 < '0' || c2 > '7') {
            return (c1 != '0'); // It would not be an octal escape, but the U+0000 escape sequence.
        }

        if (start + 2 >= end) {
            return (c1 != '0' || c2 != '0'); // It would not be an octal escape, but the U+0000 escape sequence + '0'.
        }

        final char c3 = text.charAt(start + 2);
        if (c3 < '0' || c3 > '7') {
            return (c1 != '0' || c2 != '0'); // It would not be an octal escape, but the U+0000 escape sequence + '0'.
        }

        return (c1 != '0' || c2 != '0' || c3 != '0'); // Check it's not U+0000 (escaped) + '00'

    }


    /**
     * Determines whether the (up to three) chars starting at 'start' form an octal escape, as opposed to
     * the U+0000 escape sequence (a '0') possibly followed by literal digits.
     *
     * @param text the text being unescaped.
     * @param start the position of the first char after the escape prefix.
     * @param end the (exclusive) limit of the text to be examined.
     * @return true if the sequence should be treated as an octal escape, false if not.
     */
    static boolean isOctalEscape(final char[] text, final int start, final int end) {

        if (start >= end) {
            return false;
        }

        final char c1 = text[start];
        if (c1 < '0' || c1 > '7') {
            return false;
        }

        if (start + 1 >= end) {
            return (c1 != '0'); // It would not be an octal escape, but the U+0000 escape sequence.
        }

        final char c2 = text[start + 1];
        if (c2 < '0' || c2 > '7') {
            return (c1 != '0'); // It would not be an octal escape, but the U+0000 escape sequence.
        }

        if (start + 2 >= end) {
            return (c1 != '0' || c2 != '0'); // It would not be an octal escape, but the U+0000 escape sequence + '0'.
        }

        final char c3 = text[start + 2];
        if (c3 < '0' || c3 > '7') {
            return (c1 != '0' || c2 != '0'); // It would not be an octal escape, but the U+0000 escape sequence + '0'.
        }

        return (c1 != '0' || c2 != '0' || c3 != '0'); // Check it's not U+0000 (escaped) + '00'

    }


    /**
     * Performs an unescape operation based on String.
     *
     * @param text the text to be unescaped; may be null.
     * @return null if the input is null, the same String instance if nothing was unescaped,
     *         or a new String with the unescaped result otherwise.
     */
    static String unescape(final String text) {

        if (text == null) {
            return null;
        }

        StringBuilder strBuilder = null;

        final int offset = 0;
        final int max = text.length();

        int readOffset = offset;
        int referenceOffset = offset;

        for (int i = offset; i < max; i++) {

            final char c = text.charAt(i);

            /*
             * Check the need for an unescape operation at this point
             */

            if (c != ESCAPE_PREFIX || (i + 1) >= max) {
                continue;
            }

            // From here on, c is the escape prefix and there is at least one more char available.

            int codepoint = -1;

            final char c1 = text.charAt(i + 1);

            switch (c1) {
                case '0':
                    if (!isOctalEscape(text, i + 1, max)) {
                        codepoint = 0x00;
                        referenceOffset = i + 1;
                    }
                    break;
                case 'b': codepoint = 0x08; referenceOffset = i + 1; break;
                case 't': codepoint = 0x09; referenceOffset = i + 1; break;
                case 'n': codepoint = 0x0A; referenceOffset = i + 1; break;
                case 'v': codepoint = 0x0B; referenceOffset = i + 1; break;
                case 'f': codepoint = 0x0C; referenceOffset = i + 1; break;
                case 'r': codepoint = 0x0D; referenceOffset = i + 1; break;
                case '"': codepoint = 0x22; referenceOffset = i + 1; break;
                case '\'': codepoint = 0x27; referenceOffset = i + 1; break;
                case '\\': codepoint = 0x5C; referenceOffset = i + 1; break;
                case '/': codepoint = 0x2F; referenceOffset = i + 1; break;
                default: break;
            }

            if (codepoint == -1) {

                if (c1 == ESCAPE_XHEXA_PREFIX2) {
                    // This can be a xhexa escape, we need exactly two more characters

                    int f = i + 2;
                    while (f < (i + 4) && f < max) {
                        if (!isHexaChar(text.charAt(f))) {
                            break;
                        }
                        f++;
                    }

                    if ((f - (i + 2)) < 2) {
                        // We weren't able to consume the required two hexa chars, leave it as slash+'x', which
                        // is invalid, and let the corresponding JavaScript engine fail.
                        i++;
                        continue;
                    }

                    codepoint = parseIntFromReference(text, i + 2, f, 16);

                    // Fast-forward to the first char after the parsed codepoint
                    referenceOffset = f - 1;

                    // Don't continue here, just let the unescape code below do its job

                } else if (c1 == ESCAPE_UHEXA_PREFIX2) {
                    // This can be a uhexa escape, we need exactly four more characters

                    int f = i + 2;
                    while (f < (i + 6) && f < max) {
                        if (!isHexaChar(text.charAt(f))) {
                            break;
                        }
                        f++;
                    }

                    if ((f - (i + 2)) < 4) {
                        // We weren't able to consume the required four hexa chars, leave it as slash+'u', which
                        // is invalid, and let the corresponding JavaScript engine fail.
                        i++;
                        continue;
                    }

                    codepoint = parseIntFromReference(text, i + 2, f, 16);

                    // Fast-forward to the first char after the parsed codepoint
                    referenceOffset = f - 1;

                    // Don't continue here, just let the unescape code below do its job

                } else if (c1 >= '0' && c1 <= '7') {
                    // This can be a octal escape, we need at least 1 more char, and up to 3 more.

                    int f = i + 2;
                    while (f < (i + 4) && f < max) { // We need only a max of two more chars
                        final char cf = text.charAt(f);
                        if (!(cf >= '0' && cf <= '7')) {
                            break;
                        }
                        f++;
                    }

                    codepoint = parseIntFromReference(text, i + 1, f, 8);

                    if (codepoint > 0xFF) {
                        // Maximum octal escape char is FF. Ignore the last digit
                        codepoint = parseIntFromReference(text, i + 1, f - 1, 8);
                        referenceOffset = f - 2;
                    } else {
                        referenceOffset = f - 1;
                    }

                    // Don't continue here, just let the unescape code below do its job

                } else if (c1 == '8' || c1 == '9' || c1 == '\n' || c1 == '\r' || c1 == '\u2028' || c1 == '\u2029') {
                    // '8' and '9' are not valid octal escape sequences, and the other four characters
                    // are LineTerminators, which are not allowed as escape sequences. So we leave it as is
                    // and expect the corresponding JavaScript engine to fail (except in the case of slash + '\n',
                    // which is considered a LineContinuator).
                    i++;
                    continue;

                } else {
                    // We weren't able to consume any valid escape chars, just consider it a normal char,
                    // which is allowed by the JavaScript specification (NonEscapeCharacter)

                    codepoint = (int) c1;
                    referenceOffset = i + 1;

                }

            }

            /*
             * At this point we know for sure we will need some kind of unescape, so we
             * can increase the offset and initialize the string builder if needed, along with
             * copying to it all the contents pending up to this point.
             */

            if (strBuilder == null) {
                strBuilder = new StringBuilder(max + 5);
            }

            if (i - readOffset > 0) {
                strBuilder.append(text, readOffset, i);
            }

            i = referenceOffset;
            readOffset = i + 1;

            /*
             * --------------------------
             *
             * Perform the real unescape
             *
             * --------------------------
             */

            if (codepoint > '\uFFFF') {
                strBuilder.append(Character.toChars(codepoint));
            } else {
                strBuilder.append((char) codepoint);
            }

        }

        /*
         * -----------------------------------------------------------------------------------------------
         * Final cleaning: return the original String object if no unescape was actually needed. Otherwise
         *                 append the remaining escaped text to the string builder and return.
         * -----------------------------------------------------------------------------------------------
         */

        if (strBuilder == null) {
            return text;
        }

        if (max - readOffset > 0) {
            strBuilder.append(text, readOffset, max);
        }

        return strBuilder.toString();

    }


    /**
     * Performs an unescape operation based on char[], writing the result to the specified writer.
     *
     * @param text the char array containing the text to be unescaped; nothing is written if null.
     * @param offset the position in the array at which unescaping starts.
     * @param len the number of chars to process, starting at offset.
     * @param writer the writer to which the output is written.
     * @throws IOException if the writer fails.
     */
    static void unescape(final char[] text, final int offset, final int len, final Writer writer)
                         throws IOException {

        if (text == null) {
            return;
        }

        final int max = (offset + len);

        int readOffset = offset;
        int referenceOffset = offset;

        for (int i = offset; i < max; i++) {

            final char c = text[i];

            /*
             * Check the need for an unescape operation at this point
             */

            if (c != ESCAPE_PREFIX || (i + 1) >= max) {
                continue;
            }

            // From here on, c is the escape prefix and there is at least one more char available.

            int codepoint = -1;

            final char c1 = text[i + 1];

            switch (c1) {
                case '0':
                    if (!isOctalEscape(text, i + 1, max)) {
                        codepoint = 0x00;
                        referenceOffset = i + 1;
                    }
                    break;
                case 'b': codepoint = 0x08; referenceOffset = i + 1; break;
                case 't': codepoint = 0x09; referenceOffset = i + 1; break;
                case 'n': codepoint = 0x0A; referenceOffset = i + 1; break;
                case 'v': codepoint = 0x0B; referenceOffset = i + 1; break;
                case 'f': codepoint = 0x0C; referenceOffset = i + 1; break;
                case 'r': codepoint = 0x0D; referenceOffset = i + 1; break;
                case '"': codepoint = 0x22; referenceOffset = i + 1; break;
                case '\'': codepoint = 0x27; referenceOffset = i + 1; break;
                case '\\': codepoint = 0x5C; referenceOffset = i + 1; break;
                case '/': codepoint = 0x2F; referenceOffset = i + 1; break;
                default: break;
            }

            if (codepoint == -1) {

                if (c1 == ESCAPE_XHEXA_PREFIX2) {
                    // This can be a xhexa escape, we need exactly two more characters

                    int f = i + 2;
                    while (f < (i + 4) && f < max) {
                        if (!isHexaChar(text[f])) {
                            break;
                        }
                        f++;
                    }

                    if ((f - (i + 2)) < 2) {
                        // We weren't able to consume the required two hexa chars, leave it as slash+'x', which
                        // is invalid, and let the corresponding JavaScript engine fail.
                        i++;
                        continue;
                    }

                    codepoint = parseIntFromReference(text, i + 2, f, 16);

                    // Fast-forward to the first char after the parsed codepoint
                    referenceOffset = f - 1;

                    // Don't continue here, just let the unescape code below do its job

                } else if (c1 == ESCAPE_UHEXA_PREFIX2) {
                    // This can be a uhexa escape, we need exactly four more characters

                    int f = i + 2;
                    while (f < (i + 6) && f < max) {
                        if (!isHexaChar(text[f])) {
                            break;
                        }
                        f++;
                    }

                    if ((f - (i + 2)) < 4) {
                        // We weren't able to consume the required four hexa chars, leave it as slash+'u', which
                        // is invalid, and let the corresponding JavaScript engine fail.
                        i++;
                        continue;
                    }

                    codepoint = parseIntFromReference(text, i + 2, f, 16);

                    // Fast-forward to the first char after the parsed codepoint
                    referenceOffset = f - 1;

                    // Don't continue here, just let the unescape code below do its job

                } else if (c1 >= '0' && c1 <= '7') {
                    // This can be a octal escape, we need at least 1 more char, and up to 3 more.

                    int f = i + 2;
                    while (f < (i + 4) && f < max) { // We need only a max of two more chars
                        final char cf = text[f];
                        if (!(cf >= '0' && cf <= '7')) {
                            break;
                        }
                        f++;
                    }

                    codepoint = parseIntFromReference(text, i + 1, f, 8);

                    if (codepoint > 0xFF) {
                        // Maximum octal escape char is FF. Ignore the last digit
                        codepoint = parseIntFromReference(text, i + 1, f - 1, 8);
                        referenceOffset = f - 2;
                    } else {
                        referenceOffset = f - 1;
                    }

                    // Don't continue here, just let the unescape code below do its job

                } else if (c1 == '8' || c1 == '9' || c1 == '\n' || c1 == '\r' || c1 == '\u2028' || c1 == '\u2029') {
                    // '8' and '9' are not valid octal escape sequences, and the other four characters
                    // are LineTerminators, which are not allowed as escape sequences. So we leave it as is
                    // and expect the corresponding JavaScript engine to fail (except in the case of slash + '\n',
                    // which is considered a LineContinuator).
                    i++;
                    continue;

                } else {
                    // We weren't able to consume any valid escape chars, just consider it a normal char,
                    // which is allowed by the JavaScript specification (NonEscapeCharacter)

                    codepoint = (int) c1;
                    referenceOffset = i + 1;

                }

            }

            /*
             * At this point we know for sure we will need some kind of unescape, so we
             * write all the contents pending up to this point.
             */

            if (i - readOffset > 0) {
                writer.write(text, readOffset, (i - readOffset));
            }

            i = referenceOffset;
            readOffset = i + 1;

            /*
             * --------------------------
             *
             * Perform the real unescape
             *
             * --------------------------
             */

            if (codepoint > '\uFFFF') {
                writer.write(Character.toChars(codepoint));
            } else {
                writer.write((char) codepoint);
            }

        }

        /*
         * -----------------------------------------------------------------------------------------------
         * Final cleaning: write the remaining escaped text and return.
         * -----------------------------------------------------------------------------------------------
         */

        if (max - readOffset > 0) {
            writer.write(text, readOffset, (max - readOffset));
        }

    }


}