/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.pdmodel.common.function.type4;
/**
* Parser for PDF Type 4 functions. This implements a small subset of the PostScript
* language but is no full PostScript interpreter.
*
* @version $Revision$
*/
public class Parser
{
/** Used to indicate the parsers current state. */
private static enum State
{
NEWLINE, WHITESPACE, COMMENT, TOKEN
}
private Parser()
{
//nop
}
/**
* Parses a Type 4 function and sends the syntactic elements to the given
* syntax handler.
* @param input the text source
* @param handler the syntax handler
*/
public static void parse(CharSequence input, SyntaxHandler handler)
{
Tokenizer tokenizer = new Tokenizer(input, handler);
tokenizer.tokenize();
}
/**
* This interface defines all possible syntactic elements of a Type 4 function.
* It is called by the parser as the function is interpreted.
*/
public interface SyntaxHandler
{
/**
* Indicates that a new line starts.
* @param text the new line character (CR, LF, CR/LF or FF)
*/
void newLine(CharSequence text);
/**
* Called when whitespace characters are encountered.
* @param text the whitespace text
*/
void whitespace(CharSequence text);
/**
* Called when a token is encountered. No distinction between operators and values
* is done here.
* @param text the token text
*/
void token(CharSequence text);
/**
* Called for a comment.
* @param text the comment
*/
void comment(CharSequence text);
}
/**
* Abstract base class for a {@link SyntaxHandler}.
*/
public abstract static class AbstractSyntaxHandler implements SyntaxHandler
{
/** {@inheritDoc} */
public void comment(CharSequence text)
{
//nop
}
/** {@inheritDoc} */
public void newLine(CharSequence text)
{
//nop
}
/** {@inheritDoc} */
public void whitespace(CharSequence text)
{
//nop
}
}
/**
* Tokenizer for Type 4 functions.
*/
private static class Tokenizer
{
private static final char NUL = '\u0000'; //NUL
private static final char EOT = '\u0004'; //END OF TRANSMISSION
private static final char TAB = '\u0009'; //TAB CHARACTER
private static final char FF = '\u000C'; //FORM FEED
private static final char CR = '\r'; //CARRIAGE RETURN
private static final char LF = '\n'; //LINE FEED
private static final char SPACE = '\u0020'; //SPACE
private CharSequence input;
private int index;
private SyntaxHandler handler;
private State state = State.WHITESPACE;
private StringBuilder buffer = new StringBuilder();
private Tokenizer(CharSequence text, SyntaxHandler syntaxHandler)
{
this.input = text;
this.handler = syntaxHandler;
}
private boolean hasMore()
{
return index < input.length();
}
private char currentChar()
{
return input.charAt(index);
}
private char nextChar()
{
index++;
if (!hasMore())
{
return EOT;
}
else
{
return currentChar();
}
}
private char peek()
{
if (index < input.length() - 1)
{
return input.charAt(index + 1);
}
else
{
return EOT;
}
}
private State nextState()
{
char ch = currentChar();
switch (ch)
{
case CR:
case LF:
case FF: //FF
state = State.NEWLINE;
break;
case NUL:
case TAB:
case SPACE:
state = State.WHITESPACE;
break;
case '%':
state = State.COMMENT;
break;
default:
state = State.TOKEN;
}
return state;
}
private void tokenize()
{
while (hasMore())
{
buffer.setLength(0);
nextState();
switch (state)
{
case NEWLINE:
scanNewLine();
break;
case WHITESPACE:
scanWhitespace();
break;
case COMMENT:
scanComment();
break;
default:
scanToken();
}
}
}
private void scanNewLine()
{
assert state == State.NEWLINE;
char ch = currentChar();
buffer.append(ch);
if (ch == CR)
{
if (peek() == LF)
{
//CRLF is treated as one newline
buffer.append(nextChar());
}
}
handler.newLine(buffer);
nextChar();
}
private void scanWhitespace()
{
assert state == State.WHITESPACE;
buffer.append(currentChar());
loop:
while (hasMore())
{
char ch = nextChar();
switch (ch)
{
case NUL:
case TAB:
case SPACE:
buffer.append(ch);
break;
default:
break loop;
}
}
handler.whitespace(buffer);
}
private void scanComment()
{
assert state == State.COMMENT;
buffer.append(currentChar());
loop:
while (hasMore())
{
char ch = nextChar();
switch (ch)
{
case CR:
case LF:
case FF:
break loop;
default:
buffer.append(ch);
}
}
//EOF reached
handler.comment(buffer);
}
private void scanToken()
{
assert state == State.TOKEN;
char ch = currentChar();
buffer.append(ch);
switch (ch)
{
case '{':
case '}':
handler.token(buffer);
nextChar();
return;
default:
//continue
}
loop:
while (hasMore())
{
ch = nextChar();
switch (ch)
{
case NUL:
case TAB:
case SPACE:
case CR:
case LF:
case FF:
case EOT:
case '{':
case '}':
break loop;
default:
buffer.append(ch);
}
}
//EOF reached
handler.token(buffer);
}
}
}