/*
 *
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */

package org.apache.flex.compiler.internal.parsing.mxml;

import java.io.Closeable;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.flex.compiler.common.MutablePrefixMap;
import org.apache.flex.compiler.common.PrefixMap;
import org.apache.flex.compiler.filespecs.FileSpecification;
import org.apache.flex.compiler.filespecs.IFileSpecification;
import org.apache.flex.compiler.internal.parsing.as.ASTokenTypes;
import org.apache.flex.compiler.parsing.IMXMLToken;
import org.apache.flex.compiler.parsing.IMXMLTokenizer;
import org.apache.flex.compiler.parsing.MXMLTokenTypes;
import org.apache.flex.compiler.problems.ICompilerProblem;
import org.apache.flex.compiler.problems.InternalCompilerProblem2;
import org.apache.flex.utils.NonLockingStringReader;

/**
 * Tokenizes MXML files. Uses RawMXMLTokenizer to get basic tokens. Ignores
 * comments (<!--...-->), processing instructions (<?...?>), and whitespace.
 * Replaces CDATA tokens with text tokens (strips out the CDATA delimiters).
 */
public class MXMLTokenizer implements IMXMLTokenizer, Closeable
{
    /**
     * Start offset (for when you're parsing a section of the document that
     * doesn't start at the beginning).
     */
    protected int startOffset;

    private int tagDepth = -1;

    /**
     * Specifies that we are within a tag's content, i.e. inside < and >.
     */
    private boolean inTagContent = false;

    private RawMXMLTokenizer tokenizer;

    protected MXMLToken xmlNSToken = null;

    protected MutablePrefixMap rootPrefixMap;

    private MXMLToken postRepairToken = null;

    private boolean isRepairing = true;

    private boolean wasRepaired = false;

    private static final int SIZE = 100;

    private List<ICompilerProblem> problems;

    private String path;

    private MXMLToken lastToken = null;

    private static final String SUB_SYSTEM = "MXMLTokenizer";

    /**
     * Constructor.
     */
    public MXMLTokenizer(String path)
    {
        tokenizer = new RawMXMLTokenizer();
        tokenizer.setSourcePath(path);
        problems = new ArrayList<ICompilerProblem>();
        rootPrefixMap = new MutablePrefixMap();
        this.path = path;
    }

    public MXMLTokenizer()
    {
        this("");
    }

    public MXMLTokenizer(IFileSpecification specification)
    {
        this(specification.getPath());
    }

    /**
     * Reparse constructor. Allows you to start the tokenizer with a start
     * offset (for when you're parsing a section of the document that doesn't
     * start at the beginning).
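     * <p>
     * A minimal usage sketch (the offset and tag text below are hypothetical):
     * </p>
     * <pre>{@code
     * // Tokenize a fragment that begins 250 characters into the document;
     * // token offsets are adjusted by that start offset.
     * MXMLTokenizer tokenizer = new MXMLTokenizer(250);
     * try
     * {
     *     for (IMXMLToken token : tokenizer.getTokens("<s:Button label=\"OK\"/>"))
     *         System.out.println(token);
     * }
     * finally
     * {
     *     IOUtils.closeQuietly(tokenizer);
     * }
     * }</pre>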
     *
     * @param startOffset Start offset
     */
    public MXMLTokenizer(int startOffset)
    {
        this("");
        this.startOffset = startOffset;
    }

    public void setPath(String path)
    {
        this.path = path;
        tokenizer.setSourcePath(path);
    }

    public void setReader(Reader reader)
    {
        tokenizer.reset();
        tokenizer.yyreset(reader);
    }

    @Override
    public void close() throws IOException
    {
        if (tokenizer != null)
        {
            tokenizer.reset();
            tokenizer.yyclose(); // close the reader
        }
    }

    /**
     * If it exists, return the PrefixMap from the last parse.
     * @return a {@link PrefixMap} or null
     */
    public PrefixMap getPrefixMap()
    {
        return rootPrefixMap;
    }

    /**
     * Sets a flag to indicate whether this tokenizer should try to repair its
     * token stream.
     * @param isRepairing <code>true</code> to repair, <code>false</code> to not repair
     */
    @Override
    public void setIsRepairing(boolean isRepairing)
    {
        this.isRepairing = isRepairing;
    }

    @Override
    public IMXMLToken[] getTokens(Reader reader)
    {
        List<MXMLToken> parseTokens = parseTokens(reader);
        return parseTokens.toArray(new IMXMLToken[0]);
    }

    @Override
    public IMXMLToken[] getTokens(String range)
    {
        List<MXMLToken> parseTokens = parseTokens(new NonLockingStringReader(range));
        return parseTokens.toArray(new IMXMLToken[0]);
    }

    /**
     * Determines if the tokenizer encountered any problems as it lexed the
     * given input.
     * @return true if we have encountered any problems
     */
    public boolean hasTokenizationProblems()
    {
        return tokenizer.hasProblems() || problems.size() > 0;
    }

    /**
     * Processes the given input and builds a {@link PrefixMap} for the root
     * tag found within this document.
     */
    public PrefixMap getRootTagPrefixMap()
    {
        boolean cont = true;
        do
        {
            MXMLToken token = nextToken();
            if (token == null || token.isTagEnd())
            {
                cont = false;
            }
        } while (cont);
        return rootPrefixMap;
    }

    /**
     * Returns a collection of problems encountered while processing the given
     * input.
     * @return a {@link Collection} of {@link ICompilerProblem} objects, or an
     * empty {@link Collection}
     */
    public List<ICompilerProblem> getTokenizationProblems()
    {
        ArrayList<ICompilerProblem> problems = new ArrayList<ICompilerProblem>(this.problems);
        problems.addAll(tokenizer.getProblems());
        return problems;
    }

    /**
     * Returns the next token that can be produced from the given input,
     * without performing any repair.
     * @return an {@link MXMLToken} or null when no more tokens can be produced
     */
    private final MXMLToken nextTokenInternal()
    {
        MXMLToken retVal = null;
        boolean cont = true;
        while (cont)
        {
            try
            {
                MXMLToken token = tokenizer.hasBufferToken() ?
                        (MXMLToken)tokenizer.getBufferToken() :
                        (MXMLToken)tokenizer.nextToken();
                if (token == null)
                    return null;
                MXMLToken mxmlToken = processToken(token);
                if (mxmlToken != null)
                {
                    retVal = mxmlToken;
                    return retVal;
                }
            }
            catch (Exception e)
            {
                ICompilerProblem problem = new InternalCompilerProblem2(path, e, SUB_SYSTEM);
                problems.add(problem);
                return null;
            }
        }
        return null;
    }

    /**
     * Returns the next token that can be produced from the given input.
     * @return an {@link MXMLToken} or null when no more tokens can be produced
     */
    public MXMLToken nextToken()
    {
        if (isRepairing)
        {
            if (postRepairToken != null)
            {
                MXMLToken retVal = postRepairToken;
                postRepairToken = null;
                return retVal;
            }
            MXMLToken mxmlToken = nextTokenInternal();
            MXMLToken addedToken = analyzeForEndTagProblems(mxmlToken);
            if (addedToken != null)
            {
                postRepairToken = mxmlToken;
                wasRepaired = true;
                return addedToken;
            }
            return mxmlToken;
        }
        return nextTokenInternal();
    }

    /**
     * Parse the contents of input.
     * @param input Reader containing file to be parsed
     * @return List of MXMLTokens
     */
    public List<MXMLToken> parseTokens(Reader input)
    {
        // Add fake characters onto the end of the stream to make it easier to handle
        // unclosed constructs like <![CDATA[ and <!--.
        wasRepaired = false;
        setReader(input);

        // Set the start offset in the tokenizer.
        // This is done after setReader() as setReader() resets the tokenizer, setting yychar to 0.
        tokenizer.setOffset(startOffset);

        MXMLToken token = null;
        List<MXMLToken> list = new ArrayList<MXMLToken>(SIZE);
        try
        {
            do
            {
                token = nextToken();
                if (token != null)
                    buildTokenList((MXMLToken)token.clone(), list);
            } while (token != null);
            lastToken = null;
            return list;
        }
        finally
        {
            try
            {
                tokenizer.yyclose();
            }
            catch (IOException e)
            {
                ICompilerProblem problem = new InternalCompilerProblem2(path, e, SUB_SYSTEM);
                problems.add(problem);
            }
        }
    }

    // TODO: remove this. It now does nothing. See note below.
    private MXMLToken analyzeForEndTagProblems(MXMLToken currentToken)
    {
        if (currentToken == null)
            return null;
        try
        {
            if (currentToken.isTagStart() && lastToken != null)
            {
                switch (lastToken.getType())
                {
                    case MXMLTokenTypes.TOKEN_WHITESPACE:
                    case MXMLTokenTypes.TOKEN_PROCESSING_INSTRUCTION:
                    case MXMLTokenTypes.TOKEN_COMMENT:
                    case MXMLTokenTypes.TOKEN_ASDOC_COMMENT:
                    case MXMLTokenTypes.TOKEN_STRING:
                    case MXMLTokenTypes.TOKEN_TEXT:
                    case MXMLTokenTypes.TOKEN_CDATA:
                    case MXMLTokenTypes.TOKEN_TAG_END:
                    case MXMLTokenTypes.TOKEN_EMPTY_TAG_END:
                    case -1:
                        return null; // all legal to come before an open tag start
                    default:
                        // Turn off this logic that makes up a fake token. The MXMLData
                        // already knows how to do this, and if we do it here, we lose the
                        // information that the repair was done. Since we actually care
                        // about that, this caused bugs.
                        return null;
                }
            }
            return null;
        }
        finally
        {
            lastToken = currentToken;
        }
    }

    /**
     * Determines if any tokens were added as a side effect of repair. This can
     * only be called after a tokenize call.
     * @return true if the token stream was modified
     */
    public boolean tokensWereRepaired()
    {
        return wasRepaired;
    }

    /**
     * Processes tokens, performing various transforms on the tokens that we
     * return, such as:
     * <ul>
     * <li>transform XMLNS-style tokens to name tokens for easier consumption by clients</li>
     * <li>filter out state combiner tokens</li>
     * <li>track xmlns string values</li>
     * </ul>
     * Note that we don't modify/merge whitespace and text tokens here, as
     * there are a number of tests which are sensitive to whitespace, e.g.
     * MetaMXMLSuite.
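     * <p>
     * A minimal sketch of the xmlns tracking on the root tag (the tag text
     * below is illustrative):
     * </p>
     * <pre>{@code
     * MXMLTokenizer tokenizer = new MXMLTokenizer();
     * tokenizer.getTokens("<s:Application xmlns:s=\"library://ns.adobe.com/flex/spark\"/>");
     * PrefixMap map = tokenizer.getPrefixMap();
     * // map now associates the prefix "s" with "library://ns.adobe.com/flex/spark"
     * }</pre>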
     * @param token
     * @return an {@link MXMLToken} or null if it was not accepted
     */
    private MXMLToken processToken(final MXMLToken token)
    {
        // TODO find xmlns uri values in the lexer instead of here
        switch (token.getType())
        {
            // tags (and also DTD directives)
            case MXMLTokenTypes.TOKEN_OPEN_TAG_START:
                tagDepth++;
                inTagContent = true;
                return token;
            case MXMLTokenTypes.TOKEN_CLOSE_TAG_START:
                tagDepth--;
                inTagContent = true;
                return token;
            case MXMLTokenTypes.TOKEN_TAG_END:
            case MXMLTokenTypes.TOKEN_EMPTY_TAG_END:
                inTagContent = false;
                return token;
            // stuff inside tags
            case MXMLTokenTypes.TOKEN_EQUALS:
            // outside tags
            case MXMLTokenTypes.TOKEN_CDATA:
                return token;
            case MXMLTokenTypes.TOKEN_NAME:
                xmlNSToken = null;
                return token;
            case MXMLTokenTypes.TOKEN_XMLNS:
                token.setType(MXMLTokenTypes.TOKEN_NAME);
                xmlNSToken = token;
                return token;
            case MXMLTokenTypes.TOKEN_STRING:
                // If the current namespace we are tracking is not null, then this string
                // should yield the namespace URI. Only track the namespace of the root document.
                if (xmlNSToken != null && tagDepth == 0)
                {
                    String prefix = "";
                    String text = xmlNSToken.getText();
                    if (text.length() > 5)
                    {
                        // has prefix
                        prefix = text.substring(6);
                    }
                    String nsText = token.getText();
                    String ns = nsText.length() > 1 ? nsText.substring(1, nsText.length() - 1) : "";
                    rootPrefixMap.add(prefix, ns);
                }
                return token;
            // stuff outside tags
            default:
            {
                if (tagDepth != 0 && !tokenizer.isInE4XDatabinding() && !inTagContent)
                {
                    // Probably mixed content. Allow it and let it fail downstream if we're wrong.
                    if (token.isLiteral() || token.getType() == ASTokenTypes.TOKEN_IDENTIFIER)
                    {
                        token.setType(MXMLTokenTypes.TOKEN_TEXT);
                    }
                }
                return token;
            }
        }
    }

    /**
     * Handles the addition of tokens to the internal token list. Subclasses
     * should override this method to handle different tokenizing strategies.
     * @param token The current token.
     * @param list The list of tokens being built.
     */
    protected void buildTokenList(MXMLToken token, List<MXMLToken> list)
    {
        if (token != null)
        {
            list.add(token);
        }
    }

    public static void main(String[] args)
    {
        final FileSpecification fileSpec = new FileSpecification(args[0]);
        final MXMLTokenizer tokenizer = new MXMLTokenizer(fileSpec.getPath());
        try
        {
            List<MXMLToken> tokens = tokenizer.parseTokens(fileSpec.createReader());
            for (MXMLToken token : tokens)
            {
                System.out.println(token.toDumpString());
            }
        }
        catch (FileNotFoundException e)
        {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        finally
        {
            IOUtils.closeQuietly(tokenizer);
        }
    }
}