Type1Lexer.java example

Explorer
pdfbox-master
- pdfbox-trunk
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.fontbox.type1;

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Lexer for the ASCII portions of an Adobe Type 1 font.
 *
 * @see Type1Parser
 *
 * The PostScript language, of which Type 1 fonts are a subset, has a
 * somewhat awkward lexical structure. It is neither regular nor
 * context-free, and the execution of the program can modify the
 * the behaviour of the lexer/parser.
 *
 * Nevertheless, this class represents an attempt to artificially seperate
 * the PostScript parsing process into separate lexing and parsing phases
 * in order to reduce the complexity of the parsing phase.
 *
 * @see "PostScript Language Reference 3rd ed, Adobe Systems (1999)"
 *
 * @author John Hewson
 */
class Type1Lexer
{
    /**
     * Log instance.
     */
    private static final Log LOG = LogFactory.getLog(Type1Lexer.class);
    
    private final ByteBuffer buffer;
    private Token aheadToken;
    private int openParens = 0;

    /**
     * Constructs a new Type1Lexer given a header-less .pfb segment.
     * @param bytes Header-less .pfb segment
     * @throws IOException
     */
    Type1Lexer(byte[] bytes) throws IOException
    {
        buffer = ByteBuffer.wrap(bytes);
        aheadToken = readToken(null);
    }

    /**
     * Returns the next token and consumes it.
     * @return The next token.
     */
    public Token nextToken() throws IOException
    {
        Token curToken = aheadToken;
        //System.out.println(curToken); // for debugging
        aheadToken = readToken(curToken);
        return curToken;
    }

    /**
     * Returns the next token without consuming it.
     * @return The next token
     */
    public Token peekToken()
    {
        return aheadToken;
    }

    /**
     * Reads an ASCII char from the buffer.
     */
    private char getChar()
    {
        return (char) buffer.get();
    }

    /**
     * Reads a single token.
     * @param prevToken the previous token
     */
    private Token readToken(Token prevToken) throws IOException
    {
        boolean skip;
        do
        {
            skip = false;
            while (buffer.hasRemaining())
            {
                char c = getChar();

                // delimiters
                if (c == '%')
                {
                    // comment
                    readComment();
                }
                else if (c == '(')
                {
                    return readString();
                }
                else if (c == ')')
                {
                    // not allowed outside a string context
                    throw new IOException("unexpected closing parenthesis");
                }
                else if (c == '[')
                {
                    return new Token(c, Token.START_ARRAY);
                }
                else if (c == '{')
                {
                    return new Token(c, Token.START_PROC);
                }
                else if (c == ']')
                {
                    return new Token(c, Token.END_ARRAY);
                }
                else if (c == '}')
                {
                    return new Token(c, Token.END_PROC);
                }
                else if (c == '/')
                {
                    return new Token(readRegular(), Token.LITERAL);
                }
                else if (c == '<')
                {
                    char c2 = getChar();
                    if (c2 == c)
                    {
                        return new Token("<<", Token.START_DICT);
                    }
                    else
                    {
                        // code may have to be changed in something better, maybe new token type
                        buffer.position(buffer.position() - 1);
                        return new Token(c, Token.NAME);
                    }
                }
                else if (c == '>')
                {
                    char c2 = getChar();
                    if (c2 == c)
                    {
                        return new Token(">>", Token.END_DICT);
                    }
                    else
                    {
                        // code may have to be changed in something better, maybe new token type
                        buffer.position(buffer.position() - 1);
                        return new Token(c, Token.NAME);
                    }
                }
                else if (Character.isWhitespace(c))
                {
                    skip = true;
                }
                else if (c == 0)
                {
                    LOG.warn("NULL byte in font, skipped");
                    skip = true;
                }
                else
                {
                    buffer.position(buffer.position() -1);

                    // regular character: try parse as number
                    Token number = tryReadNumber();
                    if (number != null)
                    {
                        return number;
                    }
                    else
                    {
                        // otherwise this must be a name
                        String name = readRegular();
                        if (name == null)
                        {
                            // the stream is corrupt
                            throw new DamagedFontException("Could not read token at position " +
                                                           buffer.position());
                        }

                        if (name.equals("RD") || name.equals("-|"))
                        {
                            // return the next CharString instead
                            if (prevToken.getKind() == Token.INTEGER)
                            {
                                return readCharString(prevToken.intValue());
                            }
                            else
                            {
                                throw new IOException("expected INTEGER before -| or RD");
                            }
                        }
                        else
                        {
                            return new Token(name, Token.NAME);
                        }
                    }
                }
            }
        }
        while (skip);
        return null;
    }

    /**
     * Reads a number or returns null.
     */
    private Token tryReadNumber()
    {
        buffer.mark();

        StringBuilder sb = new StringBuilder();
        StringBuilder radix = null;
        char c = getChar();
        boolean hasDigit = false;

        // optional + or -
        if (c == '+' || c == '-')
        {
            sb.append(c);
            c = getChar();
        }

        // optional digits
        while (Character.isDigit(c))
        {
            sb.append(c);
            c = getChar();
            hasDigit = true;
        }

        // optional .
        if (c == '.')
        {
            sb.append(c);
            c = getChar();
        }
        else if (c == '#')
        {
            // PostScript radix number takes the form base#number
            radix = sb;
            sb = new StringBuilder();
            c = getChar();
        }
        else if (sb.length() == 0 || !hasDigit)
        {
            // failure
            buffer.reset();
            return null;
        }
        else
        {
            // integer
            buffer.position(buffer.position() -1);
            return new Token(sb.toString(), Token.INTEGER);
        }

        // required digit
        if (Character.isDigit(c))
        {
            sb.append(c);
            c = getChar();
        }
        else
        {
            // failure
            buffer.reset();
            return null;
        }

        // optional digits
        while (Character.isDigit(c))
        {
            sb.append(c);
            c = getChar();
        }

        // optional E
        if (c == 'E')
        {
            sb.append(c);
            c = getChar();
            
            // optional minus
            if (c == '-')
            {
                sb.append(c);
                c = getChar();
            }

            // required digit
            if (Character.isDigit(c))
            {
                sb.append(c);
                c = getChar();
            }
            else
            {
                // failure
                buffer.reset();
                return null;
            }

            // optional digits
            while (Character.isDigit(c))
            {
                sb.append(c);
                c = getChar();
            }
        }
        
        buffer.position(buffer.position() - 1);
        if (radix != null)
        {
            Integer val = Integer.parseInt(sb.toString(), Integer.parseInt(radix.toString()));
            return new Token(val.toString(), Token.INTEGER);
        }
        return new Token(sb.toString(), Token.REAL);
    }

    /**
     * Reads a sequence of regular characters, i.e. not delimiters
     * or whitespace
     */
    private String readRegular()
    {
        StringBuilder sb = new StringBuilder();
        while (buffer.hasRemaining())
        {
            buffer.mark();
            char c = getChar();
            if (Character.isWhitespace(c) ||
                c == '(' || c == ')' ||
                c == '<' || c == '>' ||
                c == '[' || c == ']' ||
                c == '{' || c == '}' ||
                c == '/' || c == '%' )
            {
                buffer.reset();
                break;
            }
            else
            {
                sb.append(c);
            }
        }
        String regular = sb.toString();
        if (regular.length() == 0)
        {
            return null;
        }
        return regular;
    }

    /**
     * Reads a line comment.
     */
    private String readComment()
    {
        StringBuilder sb = new StringBuilder();
        while (buffer.hasRemaining())
        {
            char c = getChar();
            if (c == '\r' || c == '\n')
            {
                break;
            }
            else
            {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * Reads a (string).
     */
    private Token readString()
    {
        StringBuilder sb = new StringBuilder();

        while (buffer.hasRemaining())
        {
            char c = getChar();

            // string context
            switch (c)
            {
                case '(':
                    openParens++;
                    sb.append('(');
                    break;
                case ')':
                    if (openParens == 0)
                    {
                        // end of string
                        return new Token(sb.toString(), Token.STRING);
                    }
                    sb.append(')');
                    openParens--;
                    break;
                case '\\':
                    // escapes: \n \r \t \b \f \\ \( \)
                    char c1 = getChar();
                    switch (c1)
                    {
                        case 'n':
                        case 'r': sb.append("\n"); break;
                        case 't': sb.append('\t'); break;
                        case 'b': sb.append('\b'); break;
                        case 'f': sb.append('\f'); break;
                        case '\\': sb.append('\\'); break;
                        case '(': sb.append('('); break;
                        case ')': sb.append(')'); break;
                    }   
                    // octal \ddd
                    if (Character.isDigit(c1))
                    {
                        String num = String.valueOf(new char[] { c1, getChar(), getChar() });
                        Integer code = Integer.parseInt(num, 8);
                        sb.append((char)(int)code);
                    }
                    break;
                case '\r':
                case '\n':
                    sb.append("\n");
                    break;
                default:
                    sb.append(c);
                    break;
            }
        }
        return null;
    }

    /**
     * Reads a binary CharString.
     */
    private Token readCharString(int length)
    {
        buffer.get(); // space
        byte[] data = new byte[length];
        buffer.get(data);
        return new Token(data, Token.CHARSTRING);
    }
}