/**
* $Id: RtfTokeniser.java 2429 2006-10-06 14:58:54Z psoares33 $
* $Name$
*
* Copyright 2006 by Mark Hall
*
* The contents of this file are subject to the Mozilla Public License Version 1.1
* (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the License.
*
* The Original Code is 'iText, a free JAVA-PDF library'.
*
* The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
* the Initial Developer are Copyright (C) 1999-2006 by Bruno Lowagie.
* All Rights Reserved.
* Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
* are Copyright (C) 2000-2006 by Paulo Soares. All Rights Reserved.
*
* Contributor(s): all the names of the contributors are added in the source code
* where applicable.
*
* Alternatively, the contents of this file may be used under the terms of the
* LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
* provisions of LGPL are applicable instead of those above. If you wish to
* allow use of your version of this file only under the terms of the LGPL
* License and not to allow others to use your version of this file under
* the MPL, indicate your decision by deleting the provisions above and
* replace them with the notice and other provisions required by the LGPL.
* If you do not delete the provisions above, a recipient may use your version
* of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the MPL as stated above or under the terms of the GNU
* Library General Public License as published by the Free Software Foundation;
* either version 2 of the License, or any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
* details.
*
* If you didn't download this code from the following link, you should check if
* you aren't using an obsolete version:
* http://www.lowagie.com/iText/
*/
package com.lowagie.text.rtf.direct;
import java.io.IOException;
import java.io.Reader;
/**
* The RtfTokeniser takes an RTF document stream and
* turns it into a set of RTF tokens. Five groups of
* tokens are differentiated:
*
* <ul>
* <li>Group opening: {</li>
* <li>Group closing: }</li>
* <li>Control characters</li>
* <li>Control words</li>
* <li>Text</li>
* </ul>
*
* @version $Revision: 2429 $
* @author Mark Hall (mhall@edu.uni-klu.ac.at)
* @author Bullo (bullo70@users.sourceforge.net)
*/
public class RtfTokeniser {
    /**
     * The RtfTokeniser is in its ground state. Any token may follow.
     */
    private static final int TOKENISER_STATE_READY = 0;
    /**
     * The last character read was a backslash. This is a bit flag: it may be
     * combined with {@link #TOKENISER_STATE_IN_TEXT} when the backslash was
     * found while a text token was being collected.
     */
    private static final int TOKENISER_STATE_SLASH = 1;
    /**
     * The RtfTokeniser is currently tokenising a control word.
     */
    private static final int TOKENISER_STATE_IN_CTRL_WORD = 2;
    /**
     * The RtfTokeniser is currently tokenising a text.
     */
    private static final int TOKENISER_STATE_IN_TEXT = 4;
    /**
     * The characters that, directly after a backslash, are reported as a
     * control character token instead of starting a control word.
     */
    private static final String CTRL_CHARACTERS = "|~-_:*";

    /**
     * The current state of this RtfTokeniser.
     */
    private int state = TOKENISER_STATE_READY;
    /**
     * The current group nesting level.
     */
    private int groupLevel = 0;
    /**
     * The group nesting level every call to {@link #tokenise(Reader)} starts from.
     */
    private final int startGroupLevel;
    /**
     * The RtfParser to send tokens to.
     */
    private final RtfParser rtfParser;
    /**
     * The characters of the token that is currently being collected.
     */
    private final StringBuffer token = new StringBuffer();

    /**
     * Constructs a new RtfTokeniser. The startGroupLevel is required when parsing
     * RTF fragments, since they are missing the opening group and closing group
     * and thus this has to be set at the beginning.
     *
     * @param rtfParser The RtfParser to send tokens to.
     * @param startGroupLevel The starting group nesting level. 0 for full documents, 1 for fragments.
     */
    public RtfTokeniser(RtfParser rtfParser, int startGroupLevel) {
        this.rtfParser = rtfParser;
        this.startGroupLevel = startGroupLevel;
        this.groupLevel = startGroupLevel;
    }

    /**
     * The main tokenisation method. Reads the input one character at a time and
     * reports group openings, group closings, control characters, control words
     * and text to the RtfParser, together with the group nesting level at which
     * each token was found.
     * <p>
     * Every call starts in the ground state and at the group nesting level that
     * was passed to the constructor. When the input ends while a text token or a
     * control word is still being collected, that token is reported before the
     * method returns.
     *
     * @param reader The Reader to read the RTF document from.
     * @throws IOException On I/O errors.
     */
    public void tokenise(Reader reader) throws IOException {
        this.token.setLength(0);
        this.state = TOKENISER_STATE_READY;
        // Start from the configured level, not from 0, so that fragments keep
        // the nesting level they were announced with.
        this.groupLevel = this.startGroupLevel;

        int read;
        while ((read = reader.read()) != -1) {
            char c = (char) read;
            if (this.state == TOKENISER_STATE_READY) {
                handleReady(c);
            } else if ((this.state & TOKENISER_STATE_SLASH) == TOKENISER_STATE_SLASH) {
                // Matches both SLASH and IN_TEXT | SLASH.
                handleAfterSlash(c);
            } else if (this.state == TOKENISER_STATE_IN_CTRL_WORD) {
                handleInCtrlWord(c);
            } else if (this.state == TOKENISER_STATE_IN_TEXT) {
                handleInText(c);
            }
        }

        // End of input: emit whatever token was still being collected.
        // This is required for RTF fragments, which do not end with a closing group.
        if ((this.state & TOKENISER_STATE_IN_TEXT) == TOKENISER_STATE_IN_TEXT) {
            if (this.token.length() > 0) {
                flushText();
            }
        } else if (this.state == TOKENISER_STATE_IN_CTRL_WORD) {
            flushCtrlWord();
        }
    }

    /**
     * Handles a character read in the ground state, where no earlier character
     * influences the interpretation.
     *
     * @param c The character that was read.
     */
    private void handleReady(char c) {
        switch (c) {
        case '{':
            openGroup();
            break;
        case '}':
            closeGroup();
            break;
        case '\\':
            this.state = TOKENISER_STATE_SLASH;
            this.token.setLength(0);
            break;
        default:
            this.state = TOKENISER_STATE_IN_TEXT;
            this.token.append(c);
            break;
        }
    }

    /**
     * Handles the character directly following a backslash. It is either an
     * escaped brace or backslash (kept, with its backslash, as part of the text),
     * a control character, or the first character of a control word.
     *
     * @param c The character that was read.
     */
    private void handleAfterSlash(char c) {
        if (c == '{' || c == '}' || c == '\\') {
            // Escaped literal: it stays inside the text token, escape included.
            this.state = TOKENISER_STATE_IN_TEXT;
            this.token.append('\\').append(c);
            return;
        }
        if ((this.state & TOKENISER_STATE_IN_TEXT) == TOKENISER_STATE_IN_TEXT) {
            // A control word or character closes the previous text token.
            flushText();
        }
        if (CTRL_CHARACTERS.indexOf(c) >= 0) {
            this.state = TOKENISER_STATE_READY;
            this.rtfParser.handleCtrlCharacter("\\" + c, this.groupLevel);
        } else {
            this.state = TOKENISER_STATE_IN_CTRL_WORD;
            this.token.setLength(0);
            this.token.append('\\').append(c);
        }
    }

    /**
     * Handles a character read while a control word is being collected. Control
     * words run until a space, a line break, a semicolon, an opening or closing
     * group, or a backslash is found.
     *
     * @param c The character that was read.
     */
    private void handleInCtrlWord(char c) {
        if (c == '\n' || c == '\r') {
            // Line breaks terminate a control word exactly like a space does.
            c = ' ';
        }
        switch (c) {
        case '{':
            flushCtrlWord();
            openGroup();
            this.state = TOKENISER_STATE_READY;
            break;
        case '}':
            flushCtrlWord();
            closeGroup();
            this.state = TOKENISER_STATE_READY;
            break;
        case '\\':
            flushCtrlWord();
            this.state = TOKENISER_STATE_SLASH;
            break;
        case ' ':
        case ';':
            // The terminating character is reported as a text token of its own.
            flushCtrlWord();
            this.rtfParser.handleText(String.valueOf(c), this.groupLevel);
            this.state = TOKENISER_STATE_READY;
            break;
        default:
            this.token.append(c);
            break;
        }
    }

    /**
     * Handles a character read while a text token is being collected. Text
     * tokens are closed by opening and closing groups; a backslash is remembered
     * and resolved by the next character.
     *
     * @param c The character that was read.
     */
    private void handleInText(char c) {
        switch (c) {
        case '{':
            flushText();
            openGroup();
            this.state = TOKENISER_STATE_READY;
            break;
        case '}':
            flushText();
            closeGroup();
            this.state = TOKENISER_STATE_READY;
            break;
        case '\\':
            // Keep the IN_TEXT flag: the backslash may turn out to be an escape
            // that belongs to the current text token.
            this.state = TOKENISER_STATE_IN_TEXT | TOKENISER_STATE_SLASH;
            break;
        default:
            this.token.append(c);
            break;
        }
    }

    /**
     * Reports an opening group at the current nesting level, then increases the level.
     */
    private void openGroup() {
        this.rtfParser.handleOpenGroup(this.groupLevel);
        this.groupLevel++;
    }

    /**
     * Reports a closing group at the current nesting level, then decreases the level.
     */
    private void closeGroup() {
        this.rtfParser.handleCloseGroup(this.groupLevel);
        this.groupLevel--;
    }

    /**
     * Reports the collected characters as a text token and clears the collection buffer.
     */
    private void flushText() {
        this.rtfParser.handleText(this.token.toString(), this.groupLevel);
        this.token.setLength(0);
    }

    /**
     * Reports the collected characters as a control word and clears the collection buffer.
     */
    private void flushCtrlWord() {
        this.rtfParser.handleCtrlWord(this.token.toString(), this.groupLevel);
        this.token.setLength(0);
    }
}