/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package javax.mail.internet; /** * @version $Rev$ $Date$ */ public class HeaderTokenizer { public static class Token { // Constant values from J2SE 1.4 API Docs (Constant values) public static final int ATOM = -1; public static final int COMMENT = -3; public static final int EOF = -4; public static final int QUOTEDSTRING = -2; private final int _type; private final String _value; public Token(final int type, final String value) { _type = type; _value = value; } public int getType() { return _type; } public String getValue() { return _value; } } private static final char NUL = '\0'; private static final Token EOF = new Token(Token.EOF, null); // characters not allowed in MIME public static final String MIME = "()<>@,;:\\\"\t []/?="; // characters not allowed in RFC822 public static final String RFC822 = "()<>@,;:\\\"\t .[]"; private static final String WHITE = " \t\n\r"; private final String _delimiters; private final String _header; private final int _headerLength; private final boolean _skip; private int pos; public HeaderTokenizer(final String header) { this(header, RFC822); } public HeaderTokenizer(final String header, final String delimiters) { this(header, delimiters, true); } public HeaderTokenizer(final String header, final String delimiters, final boolean skipComments) { _skip = skipComments; _header = header; _delimiters = delimiters; _headerLength=header.length(); } //Return the rest of the Header. //null is returned if we are already at end of header public String getRemainder() { if(pos > _headerLength) { return null; } return _header.substring(pos); } public Token next() throws ParseException { return readToken(NUL, false); } /** * Parses the next token from this String. * If endOfAtom is not NUL, the token extends until the * endOfAtom character is seen, or to the end of the header. * This method is useful when parsing headers that don't * obey the MIME specification, e.g., by failing to quote * parameter values that contain spaces. * * @param endOfAtom if not NUL, character marking end of token * @return the next Token * @exception ParseException if the parse fails * @since JavaMail 1.5 */ public Token next(final char endOfAtom) throws ParseException { return next(endOfAtom, false); } /** * Parses the next token from this String. * endOfAtom is handled as above. If keepEscapes is true, * any backslash escapes are preserved in the returned string. * This method is useful when parsing headers that don't * obey the MIME specification, e.g., by failing to escape * backslashes in the filename parameter. * * @param endOfAtom if not NUL, character marking end of token * @param keepEscapes keep all backslashes in returned string? * @return the next Token * @exception ParseException if the parse fails * @since JavaMail 1.5 */ public Token next(final char endOfAtom, final boolean keepEscapes) throws ParseException { return readToken(endOfAtom, keepEscapes); } public Token peek() throws ParseException { final int start = pos; try { return readToken(NUL, false); } finally { pos = start; } } /** * Read an ATOM token from the parsed header. * * @return A token containing the value of the atom token. */ private Token readAtomicToken() { // skip to next delimiter final int start = pos; final StringBuilder sb = new StringBuilder(); sb.append(_header.charAt(pos)); while (++pos < _headerLength) { // break on the first non-atom character. final char ch = _header.charAt(pos); if ((_delimiters.indexOf(_header.charAt(pos)) != -1 || ch < 32 || ch >= 127)) { break; } } return new Token(Token.ATOM, _header.substring(start, pos)); } /** * Read the next token from the header. * * @return The next token from the header. White space is skipped, and comment * tokens are also skipped if indicated. * @exception ParseException */ private Token readToken(final char endOfAtom, final boolean keepEscapes) throws ParseException { if (pos >= _headerLength) { return EOF; } else { final char c = _header.charAt(pos); // comment token...read and skip over this if (c == '(') { final Token comment = readComment(keepEscapes); if (_skip) { return readToken(endOfAtom, keepEscapes); } else { return comment; } // quoted literal } else if (c == '\"') { return readQuotedString('"', keepEscapes, 1); // white space, eat this and find a real token. } else if (WHITE.indexOf(c) != -1) { eatWhiteSpace(); return readToken(endOfAtom, keepEscapes); // either a CTL or special. These characters have a self-defining token type. } else if (c < 32 || c >= 127 || _delimiters.indexOf(c) != -1) { if (endOfAtom != NUL && c != endOfAtom) { return readQuotedString(endOfAtom, keepEscapes, 0); } pos++; return new Token(c, String.valueOf(c)); } else { // start of an atom, parse it off. if (endOfAtom != NUL && c != endOfAtom) { return readQuotedString(endOfAtom, keepEscapes, 0); } return readAtomicToken(); } } } /** * Extract a substring from the header string and apply any * escaping/folding rules to the string. * * @param start The starting offset in the header. * @param end The header end offset + 1. * * @return The processed string value. * @exception ParseException */ private String getEscapedValue(final int start, final int end, final boolean keepEscapes) throws ParseException { final StringBuffer value = new StringBuffer(); for (int i = start; i < end; i++) { final char ch = _header.charAt(i); // is this an escape character? if (ch == '\\') { i++; if (i == end) { throw new ParseException("Invalid escape character"); } if(keepEscapes) { value.append("\\"); } value.append(_header.charAt(i)); } // line breaks are ignored, except for naked '\n' characters, which are consider // parts of linear whitespace. else if (ch == '\r') { // see if this is a CRLF sequence, and skip the second if it is. if (i < end - 1 && _header.charAt(i + 1) == '\n') { i++; } } else { // just append the ch value. value.append(ch); } } return value.toString(); } /** * Read a comment from the header, applying nesting and escape * rules to the content. * * @return A comment token with the token value. * @exception ParseException */ private Token readComment(final boolean keepEscapes) throws ParseException { final int start = pos + 1; int nesting = 1; boolean requiresEscaping = false; // skip to end of comment/string while (++pos < _headerLength) { final char ch = _header.charAt(pos); if (ch == ')') { nesting--; if (nesting == 0) { break; } } else if (ch == '(') { nesting++; } else if (ch == '\\') { pos++; requiresEscaping = true; } // we need to process line breaks also else if (ch == '\r') { requiresEscaping = true; } } if (nesting != 0) { throw new ParseException("Unbalanced comments"); } String value; if (requiresEscaping) { value = getEscapedValue(start, pos, keepEscapes); } else { value = _header.substring(start, pos++); } return new Token(Token.COMMENT, value); } /** * Parse out a quoted string from the header, applying escaping * rules to the value. * * @return The QUOTEDSTRING token with the value. * @exception ParseException */ private Token readQuotedString(final char endChar, final boolean keepEscapes, final int offset) throws ParseException { final int start = pos+offset; boolean requiresEscaping = false; // skip to end of comment/string while (++pos < _headerLength) { final char ch = _header.charAt(pos); if (ch == endChar) { String value; if (requiresEscaping) { value = getEscapedValue(start, pos++, keepEscapes); } else { value = _header.substring(start, pos++); } return new Token(Token.QUOTEDSTRING, value); } else if (ch == '\\') { pos++; requiresEscaping = true; } // we need to process line breaks also else if (ch == '\r') { requiresEscaping = true; } } throw new ParseException("Missing '\"'"); } /** * Skip white space in the token string. */ private void eatWhiteSpace() { // skip to end of whitespace while (++pos < _headerLength && WHITE.indexOf(_header.charAt(pos)) != -1) { ; } } }