EntityProcessor.java example

/*
 *
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */

package org.apache.flex.compiler.internal.mxml;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.flex.compiler.common.ISourceLocation;
import org.apache.flex.compiler.internal.parsing.ISourceFragment;
import org.apache.flex.compiler.internal.parsing.SourceFragment;
import org.apache.flex.compiler.problems.ICompilerProblem;
import org.apache.flex.compiler.problems.MXMLInvalidEntityProblem;
import org.apache.flex.compiler.problems.MXMLUnterminatedEntityProblem;

/**
 * {@code EntityProcessor} is an all-static utility class used in parsing MXML.
 * It handles replacing character entity references
 * (such as <code><</code>, <code>™</code>, and <code>™</code>)
 * in XML text, and in attribute values, with the actual Unicode characters
 * that they represent.
 * <p>
 * The only named character entities that are supported are
 * <code>amp</code>, <code>apos</code>, <code>gt</code>, <code>lt</code>,
 * and <code>quot</code>.
 * <p>
 * Non-character entity references are not supported.
 * <p>
 * An {@code EntityProblem} is reported for each entity that cannot be replaced.
 * If any problems are reported, the output text is <code>null</code>.
 */
// TODO We probably want to do something smarter than call
// EntityProcessor.replaceEntities() on every MXML text unit.
// Maybe the lexer/parser should mark text units that contain entities.
// Maybe it should make every entity a separate text unit.
// We also have to deal with the issue of reporting offsets after an entity.
public class EntityProcessor
{
    private static final char AMPERSAND = '&';
    
    private static final char SEMICOLON = ';';
    
    private static final Pattern DECIMAL_PATTERN = Pattern.compile("#(\\d+)");
        
    private static final Pattern HEX_PATTERN = Pattern.compile("#x([A-Fa-f\\d]+)");
    
    /**
     * Replaces all character entity references in a string.
     * <p>
     * @param s The input string.
     * @param problems A collection of problems, to which {@code EntityProblem}s
     * are added if the entities in the input string are not recognized.
     * @return The output string, or <code>null</code> if there were entity problems.
     */
    public static String parseAsString(String s, ISourceLocation location,
                                          MXMLDialect mxmlDialect,
                                          Collection<ICompilerProblem> problems)
    {
        StringBuilder sb = new StringBuilder();
        
        // If the input string doesn't contain an ampersand character,
        // then there are no entities to replace; just return a single
        // fragment corresponding to the input string.
        int ampersandIndex = s.indexOf(AMPERSAND);
        if (ampersandIndex == -1)
        {
            return s;
        }
        else
        {
            // This variable will keep track of where we find the semicolon
            // that ends the entity. By initializing it to the position
            // before the beginning of the string, we can avoid having
            // special logic to pick up the part of the string before
            // the first entity; it becomes the same logic as picking up
            // the part of the string between the first semicolon and the
            // second ampersand.
            int semicolonIndex = -1;
            
            while (true)
            {
                // We've found an ampersand.
                // Build a fragment containing the text from the previous
                // semicolon (or the beginning) to the this ampersand.
                if (ampersandIndex > semicolonIndex + 1)
                {
                    String text = s.substring(semicolonIndex + 1, ampersandIndex);
                    sb.append(text);
                }

                // Since we found an ampersand that starts an entity,
                // look for a subsequent semicolon that ends it.
                // If it doesn't exist, report a problem.
                semicolonIndex = s.indexOf(SEMICOLON, ampersandIndex + 1);
                if (semicolonIndex == -1)
                {
                    ICompilerProblem problem = new MXMLUnterminatedEntityProblem(location);
                    problems.add(problem);
                    break; // we can't do any further processing
                }
    
                // Extract and convert the entity between the ampersand and the semicolon.
                String physicalText = s.substring(ampersandIndex, semicolonIndex + 1);
                String entityName = s.substring(ampersandIndex + 1, semicolonIndex);
                int c = convertEntity(entityName, mxmlDialect);
                if (c == -1)
                {
                    // If it doesn't convert to a character, create a problem and return null.
                    ICompilerProblem problem = new MXMLInvalidEntityProblem(location, physicalText);
                    problems.add(problem);
                }
                else
                {
                    // If it does convert, add a fragment for the entity.
                    String logicalText = String.copyValueOf(new char[] { (char)c });
                    sb.append(logicalText);
                }
                
                // Find the next ampersand after the semicolon.
                ampersandIndex = s.indexOf(AMPERSAND, semicolonIndex + 1);
                
                // If there isn't one, we're done.
                // Add a final fragment for the text after the last semicolon.
                if (ampersandIndex == -1)
                {
                    if (semicolonIndex + 1 < s.length())
                    {
                        String text = s.substring(semicolonIndex + 1);
                        sb.append(text);
                    }
                    break;
                }
            }
        }
        
        return sb.toString();
    }

    /**
     * Replaces all character entity references in a string.
     * <p>
     * @param s The input string.
     * @param problems A collection of problems, to which {@code EntityProblem}s
     * are added if the entities in the input string are not recognized.
     * @return The output string, or <code>null</code> if there were entity problems.
     */
    public static ISourceFragment[] parse(String s, ISourceLocation location,
                                          MXMLDialect mxmlDialect,
                                          Collection<ICompilerProblem> problems)
    {
        List<ISourceFragment> fragmentList = new ArrayList<ISourceFragment>();
        
        ISourceFragment fragment;
        
        // If the input string doesn't contain an ampersand character,
        // then there are no entities to replace; just return a single
        // fragment corresponding to the input string.
        int ampersandIndex = s.indexOf(AMPERSAND);
        if (ampersandIndex == -1)
        {
            if (s.length() > 0)
            {
                fragment = new SourceFragment(s, location);
                fragmentList.add(fragment);
            }
        }
        else
        {
            // This variable will keep track of where we find the semicolon
            // that ends the entity. By initializing it to the position
            // before the beginning of the string, we can avoid having
            // special logic to pick up the part of the string before
            // the first entity; it becomes the same logic as picking up
            // the part of the string between the first semicolon and the
            // second ampersand.
            int semicolonIndex = -1;
            
            int start = location.getStart();
            int line = location.getLine();
            int column = location.getColumn();
            
            while (true)
            {
                // We've found an ampersand.
                // Build a fragment containing the text from the previous
                // semicolon (or the beginning) to the this ampersand.
                if (ampersandIndex > semicolonIndex + 1)
                {
                    String text = s.substring(semicolonIndex + 1, ampersandIndex);
                    fragment = new SourceFragment(text, text,
                        start + semicolonIndex + 1, line, column + semicolonIndex + 1);
                    fragmentList.add(fragment);
                }

                // Since we found an ampersand that starts an entity,
                // look for a subsequent semicolon that ends it.
                // If it doesn't exist, report a problem.
                semicolonIndex = s.indexOf(SEMICOLON, ampersandIndex + 1);
                if (semicolonIndex == -1)
                {
                    ICompilerProblem problem = new MXMLUnterminatedEntityProblem(location);
                    problems.add(problem);
                    break; // we can't do any further processing
                }
    
                // Extract and convert the entity between the ampersand and the semicolon.
                String physicalText = s.substring(ampersandIndex, semicolonIndex + 1);
                String entityName = s.substring(ampersandIndex + 1, semicolonIndex);
                int c = convertEntity(entityName, mxmlDialect);
                if (c == -1)
                {
                    // If it doesn't convert to a character, create a problem and return null.
                    ICompilerProblem problem = new MXMLInvalidEntityProblem(location, physicalText);
                    problems.add(problem);
                }
                else
                {
                    // If it does convert, add a fragment for the entity.
                    String logicalText = String.copyValueOf(new char[] { (char)c });
                    fragment = new SourceFragment(physicalText, logicalText,
                        start + ampersandIndex, line, column + ampersandIndex);
                    fragmentList.add(fragment);
                }
                
                // Find the next ampersand after the semicolon.
                ampersandIndex = s.indexOf(AMPERSAND, semicolonIndex + 1);
                
                // If there isn't one, we're done.
                // Add a final fragment for the text after the last semicolon.
                if (ampersandIndex == -1)
                {
                    if (semicolonIndex + 1 < s.length())
                    {
                        String text = s.substring(semicolonIndex + 1);
                        fragment = new SourceFragment(text, text,
                            start + semicolonIndex + 1, line, column + semicolonIndex + 1);
                        fragmentList.add(fragment);
                    }
                    break;
                }
            }
        }
        
        return fragmentList.toArray(new ISourceFragment[0]);
    }
    
    private static int convertEntity(String entityName, MXMLDialect mxmlDialect)
    {
        Character ch = mxmlDialect.getNamedEntity(entityName);
        if (ch != null)
            return ch.charValue();
        
        // TODO What happens with Unicode characters
        // outside the BMP, such as 𒍅 ?
        
        Matcher m = HEX_PATTERN.matcher(entityName);
        if (m.matches())
            return Integer.parseInt(m.group(1), 16);        
        
        m = DECIMAL_PATTERN.matcher(entityName);
        if (m.matches())
            return Integer.parseInt(m.group(1));
        
        return -1;
    }
}