/* * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package org.apache.flex.compiler.internal.mxml; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.flex.compiler.common.ISourceLocation; import org.apache.flex.compiler.internal.parsing.ISourceFragment; import org.apache.flex.compiler.internal.parsing.SourceFragment; import org.apache.flex.compiler.problems.ICompilerProblem; import org.apache.flex.compiler.problems.MXMLInvalidEntityProblem; import org.apache.flex.compiler.problems.MXMLUnterminatedEntityProblem; /** * {@code EntityProcessor} is an all-static utility class used in parsing MXML. * It handles replacing character entity references * (such as <code><</code>, <code>™</code>, and <code>™</code>) * in XML text, and in attribute values, with the actual Unicode characters * that they represent. * <p> * The only named character entities that are supported are * <code>amp</code>, <code>apos</code>, <code>gt</code>, <code>lt</code>, * and <code>quot</code>. * <p> * Non-character entity references are not supported. * <p> * An {@code EntityProblem} is reported for each entity that cannot be replaced. * If any problems are reported, the output text is <code>null</code>. */ // TODO We probably want to do something smarter than call // EntityProcessor.replaceEntities() on every MXML text unit. // Maybe the lexer/parser should mark text units that contain entities. // Maybe it should make every entity a separate text unit. // We also have to deal with the issue of reporting offsets after an entity. public class EntityProcessor { private static final char AMPERSAND = '&'; private static final char SEMICOLON = ';'; private static final Pattern DECIMAL_PATTERN = Pattern.compile("#(\\d+)"); private static final Pattern HEX_PATTERN = Pattern.compile("#x([A-Fa-f\\d]+)"); /** * Replaces all character entity references in a string. * <p> * @param s The input string. * @param problems A collection of problems, to which {@code EntityProblem}s * are added if the entities in the input string are not recognized. * @return The output string, or <code>null</code> if there were entity problems. */ public static String parseAsString(String s, ISourceLocation location, MXMLDialect mxmlDialect, Collection<ICompilerProblem> problems) { StringBuilder sb = new StringBuilder(); // If the input string doesn't contain an ampersand character, // then there are no entities to replace; just return a single // fragment corresponding to the input string. int ampersandIndex = s.indexOf(AMPERSAND); if (ampersandIndex == -1) { return s; } else { // This variable will keep track of where we find the semicolon // that ends the entity. By initializing it to the position // before the beginning of the string, we can avoid having // special logic to pick up the part of the string before // the first entity; it becomes the same logic as picking up // the part of the string between the first semicolon and the // second ampersand. int semicolonIndex = -1; while (true) { // We've found an ampersand. // Build a fragment containing the text from the previous // semicolon (or the beginning) to the this ampersand. if (ampersandIndex > semicolonIndex + 1) { String text = s.substring(semicolonIndex + 1, ampersandIndex); sb.append(text); } // Since we found an ampersand that starts an entity, // look for a subsequent semicolon that ends it. // If it doesn't exist, report a problem. semicolonIndex = s.indexOf(SEMICOLON, ampersandIndex + 1); if (semicolonIndex == -1) { ICompilerProblem problem = new MXMLUnterminatedEntityProblem(location); problems.add(problem); break; // we can't do any further processing } // Extract and convert the entity between the ampersand and the semicolon. String physicalText = s.substring(ampersandIndex, semicolonIndex + 1); String entityName = s.substring(ampersandIndex + 1, semicolonIndex); int c = convertEntity(entityName, mxmlDialect); if (c == -1) { // If it doesn't convert to a character, create a problem and return null. ICompilerProblem problem = new MXMLInvalidEntityProblem(location, physicalText); problems.add(problem); } else { // If it does convert, add a fragment for the entity. String logicalText = String.copyValueOf(new char[] { (char)c }); sb.append(logicalText); } // Find the next ampersand after the semicolon. ampersandIndex = s.indexOf(AMPERSAND, semicolonIndex + 1); // If there isn't one, we're done. // Add a final fragment for the text after the last semicolon. if (ampersandIndex == -1) { if (semicolonIndex + 1 < s.length()) { String text = s.substring(semicolonIndex + 1); sb.append(text); } break; } } } return sb.toString(); } /** * Replaces all character entity references in a string. * <p> * @param s The input string. * @param problems A collection of problems, to which {@code EntityProblem}s * are added if the entities in the input string are not recognized. * @return The output string, or <code>null</code> if there were entity problems. */ public static ISourceFragment[] parse(String s, ISourceLocation location, MXMLDialect mxmlDialect, Collection<ICompilerProblem> problems) { List<ISourceFragment> fragmentList = new ArrayList<ISourceFragment>(); ISourceFragment fragment; // If the input string doesn't contain an ampersand character, // then there are no entities to replace; just return a single // fragment corresponding to the input string. int ampersandIndex = s.indexOf(AMPERSAND); if (ampersandIndex == -1) { if (s.length() > 0) { fragment = new SourceFragment(s, location); fragmentList.add(fragment); } } else { // This variable will keep track of where we find the semicolon // that ends the entity. By initializing it to the position // before the beginning of the string, we can avoid having // special logic to pick up the part of the string before // the first entity; it becomes the same logic as picking up // the part of the string between the first semicolon and the // second ampersand. int semicolonIndex = -1; int start = location.getStart(); int line = location.getLine(); int column = location.getColumn(); while (true) { // We've found an ampersand. // Build a fragment containing the text from the previous // semicolon (or the beginning) to the this ampersand. if (ampersandIndex > semicolonIndex + 1) { String text = s.substring(semicolonIndex + 1, ampersandIndex); fragment = new SourceFragment(text, text, start + semicolonIndex + 1, line, column + semicolonIndex + 1); fragmentList.add(fragment); } // Since we found an ampersand that starts an entity, // look for a subsequent semicolon that ends it. // If it doesn't exist, report a problem. semicolonIndex = s.indexOf(SEMICOLON, ampersandIndex + 1); if (semicolonIndex == -1) { ICompilerProblem problem = new MXMLUnterminatedEntityProblem(location); problems.add(problem); break; // we can't do any further processing } // Extract and convert the entity between the ampersand and the semicolon. String physicalText = s.substring(ampersandIndex, semicolonIndex + 1); String entityName = s.substring(ampersandIndex + 1, semicolonIndex); int c = convertEntity(entityName, mxmlDialect); if (c == -1) { // If it doesn't convert to a character, create a problem and return null. ICompilerProblem problem = new MXMLInvalidEntityProblem(location, physicalText); problems.add(problem); } else { // If it does convert, add a fragment for the entity. String logicalText = String.copyValueOf(new char[] { (char)c }); fragment = new SourceFragment(physicalText, logicalText, start + ampersandIndex, line, column + ampersandIndex); fragmentList.add(fragment); } // Find the next ampersand after the semicolon. ampersandIndex = s.indexOf(AMPERSAND, semicolonIndex + 1); // If there isn't one, we're done. // Add a final fragment for the text after the last semicolon. if (ampersandIndex == -1) { if (semicolonIndex + 1 < s.length()) { String text = s.substring(semicolonIndex + 1); fragment = new SourceFragment(text, text, start + semicolonIndex + 1, line, column + semicolonIndex + 1); fragmentList.add(fragment); } break; } } } return fragmentList.toArray(new ISourceFragment[0]); } private static int convertEntity(String entityName, MXMLDialect mxmlDialect) { Character ch = mxmlDialect.getNamedEntity(entityName); if (ch != null) return ch.charValue(); // TODO What happens with Unicode characters // outside the BMP, such as 𒍅 ? Matcher m = HEX_PATTERN.matcher(entityName); if (m.matches()) return Integer.parseInt(m.group(1), 16); m = DECIMAL_PATTERN.matcher(entityName); if (m.matches()) return Integer.parseInt(m.group(1)); return -1; } }