/* * Copyright (C) 2002-2011 XimpleWare, info@ximpleware.com * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package com.ximpleware; //import com.ximpleware.parser.ISO8859_11; /** * VTD Generator implementation. * Current support built-in entities only * It parses DTD, but doesn't resolve declared entities */ public class VTDGenEx { class CharReader implements IReader { public CharReader() { } @Override final public int getChar() throws EOFException, ParseException, EncodingException { if (offset >= endOffset) { throw e; } return XMLDoc[offset++]; } @Override final public boolean skipChar(final int ch) throws ParseException, EOFException, EncodingException { if (ch == XMLDoc[offset]) { offset++; return true; } else { return false; } } @Override final public long _getChar(final int offset) { final int c = XMLDoc[offset]; if (c == '\r' && XMLDoc[offset + 1] == '\n') { return (2L << 32) | '\n'; } return (1L << 32) | c; } @Override final public char decode(final int offset) { return XMLDoc[offset]; } } // attr_name_array size private final static int ATTR_NAME_ARRAY_SIZE = 16; public final static int FORMAT_ASCII = 0; public final static int FORMAT_ISO_8859_1 = 1; public final static int FORMAT_ISO_8859_10 = 11; public final static int FORMAT_ISO_8859_11 = 12; public final static int FORMAT_ISO_8859_12 = 13; public final 
static int FORMAT_ISO_8859_13 = 14; public final static int FORMAT_ISO_8859_14 = 15; public final static int FORMAT_ISO_8859_15 = 16; public final static int FORMAT_ISO_8859_16 = 17; public final static int FORMAT_ISO_8859_2 = 3; public final static int FORMAT_ISO_8859_3 = 4; public final static int FORMAT_ISO_8859_4 = 5; public final static int FORMAT_ISO_8859_5 = 6; public final static int FORMAT_ISO_8859_6 = 7; public final static int FORMAT_ISO_8859_7 = 8; public final static int FORMAT_ISO_8859_8 = 9; public final static int FORMAT_ISO_8859_9 = 10; public final static int FORMAT_UTF_16BE = 63; public final static int FORMAT_UTF_16LE = 64; // encoding format public final static int FORMAT_UTF8 = 2; public final static int FORMAT_WIN_1250 = 18; public final static int FORMAT_WIN_1251 = 19; public final static int FORMAT_WIN_1252 = 20; public final static int FORMAT_WIN_1253 = 21; public final static int FORMAT_WIN_1254 = 22; public final static int FORMAT_WIN_1255 = 23; public final static int FORMAT_WIN_1256 = 24; public final static int FORMAT_WIN_1257 = 25; public final static int FORMAT_WIN_1258 = 26; public final static int MAX_DEPTH = 254; // maximum depth value // max prefix length public final static int MAX_PREFIX_LENGTH = (1 << 9) - 1; // max Qname length public final static int MAX_QNAME_LENGTH = (1 << 11) - 1; // max Token length public final static int MAX_TOKEN_LENGTH = (1 << 20) - 1; private final static int STATE_ATTR_NAME = 3; private final static int STATE_ATTR_VAL = 4; private final static int STATE_CDATA = 12; private final static int STATE_COMMENT = 11; private final static int STATE_DEC_ATTR_NAME = 10; private final static int STATE_DOC_END = 7; // end of document private final static int STATE_DOC_START = 6; // beginning of document private final static int STATE_DOCTYPE = 13; private final static int STATE_END_COMMENT = 14; // comment appear after the last ending tag private final static int STATE_END_PI = 15; private final static int 
STATE_END_TAG = 2; // internal parser state private final static int STATE_LT_SEEN = 0; // encounter the first < private final static int STATE_PI_TAG = 8; private final static int STATE_PI_VAL = 9; private final static int STATE_START_TAG = 1; private final static int STATE_TEXT = 5; // tag_stack size private final static int TAG_STACK_SIZE = 256; public final static int TOKEN_ATTR_NAME = 2; public final static int TOKEN_ATTR_NS = 3; public final static int TOKEN_ATTR_VAL = 4; public final static int TOKEN_CDATA_VAL = 11; public final static int TOKEN_CHARACTER_DATA = 5; public final static int TOKEN_COMMENT = 6; public final static int TOKEN_DEC_ATTR_NAME = 9; public final static int TOKEN_DEC_ATTR_VAL = 10; public final static int TOKEN_DOCUMENT = 13; public final static int TOKEN_DTD_VAL = 12; public final static int TOKEN_ENDING_TAG = 1; public final static int TOKEN_PI_NAME = 7; public final static int TOKEN_PI_VAL = 8; // private final static int STATE_END_PI_VAL = 17; // token type public final static int TOKEN_STARTING_TAG = 0; private long[] attr_name_array; private int attr_count; private long[] prefixed_attr_name_array; private int[] prefix_URL_array; private int prefixed_attr_count; protected boolean br; // buffer reuse private int ch; private int ch_temp; private int length1, length2; protected int depth; // protected int offset_adj; // determine the byte length for ':' for various encoding types protected int docLen; protected int docOffset; protected int encoding; // again, in terms of byte, not char as encoded in VTD protected int endOffset; private int increment; protected FastLongBuffer l1Buffer; protected FastLongBuffer l2Buffer; protected FastIntBuffer l3Buffer; protected FastLongBuffer _l3Buffer; protected FastLongBuffer _l4Buffer; protected FastIntBuffer _l5Buffer; protected FastIntBuffer nsBuffer1; protected FastLongBuffer nsBuffer2; protected FastLongBuffer nsBuffer3; protected long currentElementRecord; private int last_depth; private int 
last_l1_index; private int last_l2_index; // namespace aware flag protected boolean ns, is_ns; protected int offset; // this is byte offset, not char offset as encoded in VTD protected boolean ws; // to prserve whitespace or not, default to false protected int prev_offset; protected IReader r; protected int rootIndex; protected long[] tag_stack; private int temp_offset; protected FastLongBuffer VTDBuffer; protected int VTDDepth; // Maximum Depth of VTDs protected char[] XMLDoc; protected EOFException e; protected short LcDepth; protected boolean singleByteEncoding; /** * VTDGen constructor method. */ public VTDGenEx() { attr_name_array = new long[ATTR_NAME_ARRAY_SIZE]; prefixed_attr_name_array = new long[ATTR_NAME_ARRAY_SIZE]; prefix_URL_array = new int[ATTR_NAME_ARRAY_SIZE]; tag_stack = new long[TAG_STACK_SIZE]; VTDDepth = 0; LcDepth = 3; br = false; e = new EOFException("permature EOF reached, XML document incomplete"); ws = false; nsBuffer1 = new FastIntBuffer(4); nsBuffer2 = new FastLongBuffer(4); nsBuffer3 = new FastLongBuffer(4); currentElementRecord = 0; singleByteEncoding = true; } /** * Clear internal states so VTDGEn can process the next file. */ public void clear() { if (br == false) { VTDBuffer = null; l1Buffer = null; l2Buffer = null; l3Buffer = null; _l3Buffer = null; _l4Buffer = null; _l5Buffer = null; } XMLDoc = null; offset = temp_offset = 0; last_depth = last_l1_index = last_l2_index = 0; rootIndex = 0; depth = -1; increment = 1; ch = ch_temp = 0; nsBuffer1.size = 0; nsBuffer2.size = 0; nsBuffer3.size = 0; currentElementRecord = 0; } /** * Enable the parser to collect all white spaces, including the trivial white spaces * By default, trivial white spaces are ignored * * @param b */ public void enableIgnoredWhiteSpace(final boolean b) { ws = b; } /** * Set the XMLDoc container. Also set the offset and len of the document * with respect to the container. 
* * @param ba * byte[] * @param os * int (in byte) * @param len * int (in byte) */ public void setDoc(final char[] ba, final int os, final int len) { if (ba == null || os < 0 || len == 0 || ba.length < os + len) { throw new IllegalArgumentException("Illegal argument for setDoc"); } int a; br = false; depth = -1; increment = 1; ch = ch_temp = 0; temp_offset = 0; XMLDoc = ba; docOffset = offset = os; docLen = len; endOffset = os + len; last_l1_index = last_l2_index = last_depth = 0; currentElementRecord = 0; nsBuffer1.size = 0; nsBuffer2.size = 0; nsBuffer3.size = 0; r = new CharReader(); /* if (shallowDepth) */{ int i1 = 8, i2 = 9, i3 = 11; if (docLen <= 1024) { // a = 1024; //set the floor a = 6; i1 = 5; i2 = 5; i3 = 5; } else if (docLen <= 4096) { a = 7; i1 = 6; i2 = 6; i3 = 6; } else if (docLen <= 1024 * 16) { a = 8; i1 = 7; i2 = 7; i3 = 7; } else if (docLen <= 1024 * 16 * 4) { // a = 2048; a = 11; } else if (docLen <= 1024 * 256) { // a = 1024 * 4; a = 12; } else { // a = 1 << 15; a = 15; } VTDBuffer = new FastLongBuffer(a, len >> (a + 1)); l1Buffer = new FastLongBuffer(i1); l2Buffer = new FastLongBuffer(i2); l3Buffer = new FastIntBuffer(i3); } } /** * Generating VTD tokens and Location cache info. When set to true, * VTDGen conforms to XML namespace 1.0 spec * * @param NS * boolean Enable namespace or not * @throws ParseException * Super class for any exceptions during parsing. * @throws EOFException * End of file exception. * @throws EntityException * Entity resolution exception. * @throws EncodingException * UTF/native encoding exception. 
*/ public void parse(final boolean NS) throws EncodingException, EOFException, EntityException, ParseException { /* Tokenizer main loop: parser_state selects the case run on each pass of the while loop and every case assigns the state for the next pass. A TOKEN_DOCUMENT record is written before the loop starts. */ // define internal variables ns = NS; length1 = length2 = 0; attr_count = prefixed_attr_count = 0 /* , ch = 0, ch_temp = 0 */; int parser_state = STATE_DOC_START; // boolean has_amp = false; is_ns = false; encoding = FORMAT_UTF8; boolean helper = false; boolean default_ns = false; // true xmlns='abc' boolean isXML = false; // true only for xmlns:xml singleByteEncoding = true; // enter the main finite state machine try { _writeVTD(0, 0, TOKEN_DOCUMENT, depth); while (true) { switch (parser_state) { /* STATE_LT_SEEN: a '<' has been consumed; a name start char opens a starting tag (depth is incremented), '/' leads to STATE_END_TAG, '?' and '!' are delegated to process_qm_seen() and process_ex_seen(). */ case STATE_LT_SEEN: // if (depth < -1) // throw new ParseException("Other Errors: Invalid depth"); temp_offset = offset; ch = r.getChar(); if (XMLChar.isNameStartChar(ch)) { depth++; parser_state = STATE_START_TAG; } else { switch (ch) { case '/': parser_state = STATE_END_TAG; break; case '?': parser_state = process_qm_seen(); break; case '!': // three possibility (comment, CDATA, DOCTYPE) parser_state = process_ex_seen(); break; default: throw new ParseException("Other Error: Invalid char after <" + formatLineNumber()); } } break; /* STATE_START_TAG: consumes name chars (the loop body is written out twice per iteration). length2 is set to the prefix length when a ':' is read, length1 to the length of the whole name. */ case STATE_START_TAG: // name space is handled by do { ch = r.getChar(); if (XMLChar.isNameChar(ch)) { if (ch == ':') { length2 = offset - temp_offset - increment; if (ns && checkPrefix2(temp_offset, length2)) { throw new ParseException("xmlns can't be an element prefix " + formatLineNumber(offset)); } } } else { break; } ch = r.getChar(); if (XMLChar.isNameChar(ch)) { if (ch == ':') { length2 = offset - temp_offset - increment; if (ns && checkPrefix2(temp_offset, length2)) { throw new ParseException("xmlns can't be an element prefix " + formatLineNumber(offset)); } } } else { break; } } while (true); length1 = offset - temp_offset - increment; if (depth > MAX_DEPTH) { throw new ParseException("Other Error: Depth exceeds MAX_DEPTH" + formatLineNumber()); } // writeVTD(offset, TOKEN_STARTING_TAG, length2:length1, depth) final long x = 
((long) length1 << 32) + temp_offset; tag_stack[depth] = x; /* name length in the upper 32 bits, name offset in the lower 32 bits; STATE_END_TAG reads both back */ // System.out.println( // " " + (temp_offset) + " " + length2 + ":" + length1 + " startingTag " + depth); if (depth > VTDDepth) { VTDDepth = depth; } // if (encoding < FORMAT_UTF_16BE){ if (singleByteEncoding) { if (length2 > MAX_PREFIX_LENGTH || length1 > MAX_QNAME_LENGTH) { throw new ParseException( "Token Length Error: Starting tag prefix or qname length too long" + formatLineNumber()); } writeVTD((temp_offset), (length2 << 11) | length1, TOKEN_STARTING_TAG, depth); } else { if (length2 > (MAX_PREFIX_LENGTH << 1) || length1 > (MAX_QNAME_LENGTH << 1)) { throw new ParseException( "Token Length Error: Starting tag prefix or qname length too long" + formatLineNumber()); } writeVTD((temp_offset) >> 1, (length2 << 10) | (length1 >> 1), TOKEN_STARTING_TAG, depth); } if (ns) { if (length2 != 0) { length2 += increment; currentElementRecord = (((long) ((length2 << 16) | length1)) << 32) | temp_offset; } else { currentElementRecord = 0; } if (depth <= nsBuffer1.size - 1) { nsBuffer1.size = depth; final int t = nsBuffer1.intAt(depth - 1) + 1; nsBuffer2.size = t; nsBuffer3.size = t; } } // offset += length1; length2 = 0; if (XMLChar.isSpaceChar(ch)) { ch = getCharAfterS(); if (XMLChar.isNameStartChar(ch)) { // seen an attribute here temp_offset = getPrevOffset(); parser_state = STATE_ATTR_NAME; break; } } helper = true; if (ch == '/') { depth--; helper = false; ch = r.getChar(); } if (ch == '>') { if (ns) { nsBuffer1.append(nsBuffer3.size - 1); if (currentElementRecord != 0) { qualifyElement(); } } // parser_state = processElementTail(helper); if (depth != -1) { temp_offset = offset; // ch = getCharAfterSe(); // consume WSs ch = getCharAfterS(); // consume WSs if (ch == '<') { if (ws) { addWhiteSpaceRecord(); } parser_state = STATE_LT_SEEN; if (r.skipChar('/')) { if (helper) { length1 = offset - temp_offset - (increment << 1); // if (length1 > 0) { // if (encoding < FORMAT_UTF_16BE) if (singleByteEncoding) { 
writeVTDText((temp_offset), length1, TOKEN_CHARACTER_DATA, depth); } else { writeVTDText((temp_offset) >> 1, (length1 >> 1), TOKEN_CHARACTER_DATA, depth); // } } } parser_state = STATE_END_TAG; break; } } else if (XMLChar.isContentChar(ch)) { // temp_offset = offset; parser_state = STATE_TEXT; } else { parser_state = STATE_TEXT; handleOtherTextChar2(ch); } } else { parser_state = STATE_DOC_END; } break; } throw new ParseException("Starting tag Error: Invalid char in starting tag" + formatLineNumber()); /* STATE_END_TAG: the chars at the current offset must equal the starting tag name remembered in tag_stack[depth]. */ case STATE_END_TAG: temp_offset = offset; final int sos = (int) tag_stack[depth]; final int sl = (int) (tag_stack[depth] >> 32); offset = temp_offset + sl; if (offset >= endOffset) { throw new EOFException("permature EOF reached, XML document incomplete"); } for (int i = 0; i < sl; i++) { if (XMLDoc[sos + i] != XMLDoc[temp_offset + i]) { throw new ParseException("Ending tag error: Start/ending tag mismatch" + formatLineNumber()); } } depth--; ch = getCharAfterS(); if (ch != '>') { throw new ParseException("Ending tag error: Invalid char in ending tag " + formatLineNumber()); } if (depth != -1) { temp_offset = offset; ch = getCharAfterS(); if (ch == '<') { if (ws) { addWhiteSpaceRecord(); } parser_state = STATE_LT_SEEN; } else if (XMLChar.isContentChar(ch)) { parser_state = STATE_TEXT; } else { handleOtherTextChar2(ch); parser_state = STATE_TEXT; } } else { parser_state = STATE_DOC_END; } break; /* STATE_ATTR_NAME: a name beginning with "xmlns" that is followed by '=', white space or ':' sets is_ns; default_ns is true for the first two and false for ':'. */ case STATE_ATTR_NAME: if (ch == 'x') { if (r.skipChar('m') && r.skipChar('l') && r.skipChar('n') && r.skipChar('s')) { ch = r.getChar(); if (ch == '=' || XMLChar.isSpaceChar(ch)) { is_ns = true; default_ns = true; } else if (ch == ':') { is_ns = true; // break; default_ns = false; } } } do { if (XMLChar.isNameChar(ch)) { if (ch == ':') { length2 = offset - temp_offset - increment; } } else { break; } ch = r.getChar(); } while (true); length1 = getPrevOffset() - temp_offset; if (is_ns && ns) { // make sure postfix isn't xmlns if (!default_ns) { if (increment == 1 && (length1 - 
length2 - 1 == 5) || (increment == 2 && (length1 - length2 - 2 == 10))) { disallow_xmlns(temp_offset + length2 + increment); } // if the post fix is xml, signal it if (increment == 1 && (length1 - length2 - 1 == 3) || (increment == 2 && (length1 - length2 - 2 == 6))) { isXML = matchXML(temp_offset + length2 + increment); } } } // check for uniqueness here checkAttributeUniqueness(); // after checking, write VTD if (is_ns) { // if the prefix is xmlns: or xmlns // if (encoding < FORMAT_UTF_16BE){ if (singleByteEncoding) { if (length2 > MAX_PREFIX_LENGTH || length1 > MAX_QNAME_LENGTH) { throw new ParseException( "Token length overflow error: Attr NS tag prefix or qname length too long" + formatLineNumber()); } _writeVTD(temp_offset, (length2 << 11) | length1, TOKEN_ATTR_NS, depth); } else { if (length2 > (MAX_PREFIX_LENGTH << 1) || length1 > (MAX_QNAME_LENGTH << 1)) { throw new ParseException( "Token length overflow error: Attr NS prefix or qname length too long" + formatLineNumber()); } _writeVTD(temp_offset >> 1, (length2 << 10) | (length1 >> 1), TOKEN_ATTR_NS, depth); } // append to nsBuffer2 if (ns) { // unprefixed xmlns are not recorded if (length2 != 0 && !isXML) { // nsBuffer2.append(VTDBuffer.size() - 1); final long l = ((long) ((length2 << 16) | length1)) << 32 | temp_offset; nsBuffer3.append(l); // byte offset and byte // length } } } else { // if (encoding < FORMAT_UTF_16BE){ if (singleByteEncoding) { if (length2 > MAX_PREFIX_LENGTH || length1 > MAX_QNAME_LENGTH) { throw new ParseException( "Token Length Error: Attr name prefix or qname length too long" + formatLineNumber()); } _writeVTD(temp_offset, (length2 << 11) | length1, TOKEN_ATTR_NAME, depth); } else { if (length2 > (MAX_PREFIX_LENGTH << 1) || length1 > (MAX_QNAME_LENGTH << 1)) { throw new ParseException( "Token Length overflow error: Attr name prefix or qname length too long" + formatLineNumber()); } _writeVTD(temp_offset >> 1, (length2 << 10) | (length1 >> 1), TOKEN_ATTR_NAME, depth); } } /* * 
System.out.println( * " " + temp_offset + " " + length2 + ":" + length1 + " attr name " + depth); */ length2 = 0; if (XMLChar.isSpaceChar(ch)) { ch = getCharAfterS(); } if (ch != '=') { throw new ParseException("Error in attr: invalid char" + formatLineNumber()); } ch_temp = getCharAfterS(); if (ch_temp != '"' && ch_temp != '\'') { throw new ParseException("Error in attr: invalid char (should be ' or \" )" + formatLineNumber()); } temp_offset = offset; parser_state = STATE_ATTR_VAL; break; /* STATE_ATTR_VAL: reads up to the quote char saved in ch_temp. On '&' the value returned by entityIdentifier() is stored over the '&' and the rest of the reference is overwritten with 0. NOTE(review): the (char) cast keeps only the low 16 bits of the value - confirm how references above 0xFFFF are meant to be handled. */ case STATE_ATTR_VAL: do { ch = r.getChar(); if (XMLChar.isValidChar(ch) && ch != '<') { if (ch == ch_temp) { break; } if (ch == '&') { final int startOfEntityBody = offset; final int entity = entityIdentifier(); if (!XMLChar.isValidChar(entity)) { throw new ParseException("Error in attr: Invalid XML char" + formatLineNumber()); } XMLDoc[startOfEntityBody - 1] = (char) entity; for (int i = startOfEntityBody; i < offset; i++) { XMLDoc[i] = 0; } // as in vtd spec, we mark attr val with entities } } else { throw new ParseException("Error in attr: Invalid XML char" + formatLineNumber()); } } while (true); length1 = offset - temp_offset - increment; if (ns && is_ns) { if (!default_ns && length1 == 0) { throw new ParseException(" non-default ns URL can't be empty" + formatLineNumber()); } // identify nsURL return 0,1,2 final int t = identifyNsURL(temp_offset, length1); if (isXML) {// xmlns:xml if (t != 1) { // URL points to "http://www.w3.org/XML/1998/namespace" throw new ParseException("xmlns:xml can only point to" + "\"http://www.w3.org/XML/1998/namespace\"" + formatLineNumber()); } } else { if (!default_ns) { nsBuffer2.append(((long) temp_offset << 32) | length1); } if (t != 0) { if (t == 1) { throw new ParseException("namespace declaration can't point to" + " \"http://www.w3.org/XML/1998/namespace\"" + formatLineNumber()); } throw new ParseException("namespace declaration can't point to" + " \"http://www.w3.org/2000/xmlns/\"" + formatLineNumber()); } } // no ns URL points to // 
"http://www.w3.org/2000/xmlns/" // no ns URL points to // "http://www.w3.org/XML/1998/namespace" } if (singleByteEncoding) { // if (encoding < FORMAT_UTF_16BE){ if (length1 > MAX_TOKEN_LENGTH) { throw new ParseException("Token Length Error:" + " Attr val too long (>0xfffff)" + formatLineNumber()); } _writeVTD(temp_offset, length1, TOKEN_ATTR_VAL, depth); } else { if (length1 > (MAX_TOKEN_LENGTH << 1)) { throw new ParseException("Token Length Error:" + " Attr val too long (>0xfffff)" + formatLineNumber()); } _writeVTD(temp_offset >> 1, length1 >> 1, TOKEN_ATTR_VAL, depth); } isXML = false; is_ns = false; ch = r.getChar(); if (XMLChar.isSpaceChar(ch)) { ch = getCharAfterS(); if (XMLChar.isNameStartChar(ch)) { temp_offset = offset - increment; parser_state = STATE_ATTR_NAME; break; } } helper = true; if (ch == '/') { depth--; helper = false; ch = r.getChar(); } if (ch == '>') { if (ns) { nsBuffer1.append(nsBuffer3.size - 1); if (prefixed_attr_count > 0) { qualifyAttributes(); } if (prefixed_attr_count > 1) { checkQualifiedAttributeUniqueness(); } if (currentElementRecord != 0) { qualifyElement(); } prefixed_attr_count = 0; } attr_count = 0; // parser_state = processElementTail(helper); if (depth != -1) { temp_offset = offset; // ch = getCharAfterSe(); ch = getCharAfterS(); if (ch == '<') { if (ws) { addWhiteSpaceRecord(); } parser_state = STATE_LT_SEEN; if (r.skipChar('/')) { if (helper) { length1 = offset - temp_offset - (increment << 1); // if (length1 > 0) { if (singleByteEncoding) { writeVTDText((temp_offset), length1, TOKEN_CHARACTER_DATA, depth); } else { writeVTDText((temp_offset) >> 1, (length1 >> 1), TOKEN_CHARACTER_DATA, depth); // } } } parser_state = STATE_END_TAG; break; } } else if (XMLChar.isContentChar(ch)) { // temp_offset = offset; parser_state = STATE_TEXT; } else { handleOtherTextChar2(ch); parser_state = STATE_TEXT; } } else { parser_state = STATE_DOC_END; } break; } throw new ParseException("Starting tag Error: Invalid char in starting tag" + 
formatLineNumber()); case STATE_TEXT: if (depth == -1) { throw new ParseException("Error in text content: Char data at the wrong place" + formatLineNumber()); } do { ch = r.getChar(); // System.out.println(""+(char)ch); if (XMLChar.isContentChar(ch)) { } else if (ch == '<') { break; } else { handleOtherTextChar(ch); } ch = r.getChar(); if (XMLChar.isContentChar(ch)) { } else if (ch == '<') { break; } else { handleOtherTextChar(ch); } } while (true); length1 = offset - increment - temp_offset; if (singleByteEncoding) { writeVTDText(temp_offset, length1, TOKEN_CHARACTER_DATA, depth); } else { writeVTDText(temp_offset >> 1, length1 >> 1, TOKEN_CHARACTER_DATA, depth); } // has_amp = true; parser_state = STATE_LT_SEEN; break; case STATE_DOC_START: parser_state = process_start_doc(); break; case STATE_DOC_END: // docEnd = true; parser_state = process_end_doc(); break; case STATE_PI_TAG: parser_state = process_pi_tag(); break; // throw new ParseException("Error in PI: Invalid char"); case STATE_PI_VAL: parser_state = process_pi_val(); break; case STATE_DEC_ATTR_NAME: parser_state = process_dec_attr(); break; case STATE_COMMENT: parser_state = process_comment(); break; case STATE_CDATA: parser_state = process_cdata(); break; case STATE_DOCTYPE: parser_state = process_doc_type(); break; case STATE_END_COMMENT: parser_state = process_end_comment(); break; case STATE_END_PI: parser_state = process_end_pi(); break; default: throw new ParseException("Other error: invalid parser state" + formatLineNumber()); } } } catch (final EOFException e) { /* running out of input is accepted only in STATE_DOC_END; any other state rethrows */ if (parser_state != STATE_DOC_END) { throw e; } finishUp(); } } /** * Write white space records that are ignored by default */ private void addWhiteSpaceRecord() { if (depth > -1) { /* length1 spans from temp_offset up to, but not including, the last char read */ final int length1 = offset - increment - temp_offset; if (length1 != 0) { if (singleByteEncoding) { writeVTDText(temp_offset, length1, TOKEN_CHARACTER_DATA, depth); } else { writeVTDText(temp_offset >> 1, length1 >> 1, TOKEN_CHARACTER_DATA, depth); } } } } /** 
* This method will detect whether the entity is valid or not and increment offset. * * @return int * @throws com.ximpleware.ParseException * Super class for any exception during parsing. * @throws com.ximpleware.EncodingException * UTF/native encoding exception. * @throws com.ximpleware.EOFException * End of file exception. */ private int entityIdentifier() throws EntityException, EncodingException, EOFException, ParseException { int ch = r.getChar(); int val = 0; switch (ch) { case '#': ch = r.getChar(); if (ch == 'x') { while (true) { ch = r.getChar(); if (ch >= '0' && ch <= '9') { val = (val << 4) + (ch - '0'); } else if (ch >= 'a' && ch <= 'f') { val = (val << 4) + (ch - 'a' + 10); } else if (ch >= 'A' && ch <= 'F') { val = (val << 4) + (ch - 'A' + 10); } else if (ch == ';') { return val; } else { throw new EntityException("Errors in char reference: Illegal char following &#x."); } } } else { while (true) { if (ch >= '0' && ch <= '9') { val = val * 10 + (ch - '0'); } else if (ch == ';') { break; } else { throw new EntityException("Errors in char reference: Illegal char following &#."); } ch = r.getChar(); } } if (!XMLChar.isValidChar(val)) { throw new EntityException("Errors in entity reference: Invalid XML char."); } return val; // break; case 'a': ch = r.getChar(); if (ch == 'm') { if (r.getChar() == 'p' && r.getChar() == ';') { // System.out.println(" entity for &"); return '&'; } else { throw new EntityException("Errors in Entity: Illegal builtin reference"); } } else if (ch == 'p') { if (r.getChar() == 'o' && r.getChar() == 's' && r.getChar() == ';') { // System.out.println(" entity for ' "); return '\''; } else { throw new EntityException("Errors in Entity: Illegal builtin reference"); } } else { throw new EntityException("Errors in Entity: Illegal builtin reference"); } case 'q': if (r.getChar() == 'u' && r.getChar() == 'o' && r.getChar() == 't' && r.getChar() == ';') { return '"'; } else { throw new EntityException("Errors in Entity: Illegal builtin 
reference"); } case 'l': if (r.getChar() == 't' && r.getChar() == ';') { return '<'; } else { throw new EntityException("Errors in Entity: Illegal builtin reference"); } // break; case 'g': if (r.getChar() == 't' && r.getChar() == ';') { return '>'; } else { throw new EntityException("Errors in Entity: Illegal builtin reference"); } default: throw new EntityException("Errors in Entity: Illegal entity char"); } // return val; } /** * Write the remaining portion of LC info * */ private void finishUp() { if (last_depth == 1) { l1Buffer.append(((long) last_l1_index << 32) | 0xffffffffL); } else if (last_depth == 2) { l2Buffer.append(((long) last_l2_index << 32) | 0xffffffffL); } } /** * Format the string indicating the position (line number:offset)of the offset if * there is an exception. * * @return java.lang.String indicating the line number and offset of the exception */ private String formatLineNumber() { return formatLineNumber(offset); } private String formatLineNumber(final int os) { int so = docOffset; int lineNumber = 0; int lineOffset = 0; if (encoding < FORMAT_UTF_16BE) { while (so <= os - 1) { if (XMLDoc[so] == '\n') { lineNumber++; lineOffset = so; } // lineOffset++; so++; } lineOffset = os - lineOffset; } else if (encoding == FORMAT_UTF_16BE) { while (so <= os - 2) { if (XMLDoc[so + 1] == '\n' && XMLDoc[so] == 0) { lineNumber++; lineOffset = so; } so += 2; } lineOffset = (os - lineOffset) >> 1; } else { while (so <= os - 2) { if (XMLDoc[so] == '\n' && XMLDoc[so + 1] == 0) { lineNumber++; lineOffset = so; } so += 2; } lineOffset = (os - lineOffset) >> 1; } return "\nLine Number: " + (lineNumber + 1) + " Offset: " + (lineOffset - 1); } /** * The entity ignorant version of getCharAfterS. 
* * @return int * @throws ParseException * @throws EncodingException * @throws com.ximpleware.EOFException */ final private int getCharAfterS() throws ParseException, EncodingException, EOFException { int n; do { n = r.getChar(); if ((n == ' ' || n == '\n' || n == '\t' || n == '\r')) { // if (XMLChar.isSpaceChar(n) ) { } else { return n; } n = r.getChar(); if ((n == ' ' || n == '\n' || n == '\t' || n == '\r')) { } else { return n; /* * if (n == ' ' || n == '\n' || n =='\t'|| n == '\r' ) { * } else * return n; */ } } while (true); // throw new EOFException("should never come here"); } /** * This method returns the VTDNav object after parsing, it also cleans * internal state so VTDGen can process the next file. * * @return com.ximpleware.VTDNav */ public VTDNavEx getNav() { // call VTDNav constructor final VTDNavEx vn = new VTDNavEx(rootIndex, encoding, ns, VTDDepth, XMLDoc, VTDBuffer, l1Buffer, l2Buffer, l3Buffer, docOffset, docLen); clear(); return vn; } /** * Get the offset value of previous character. * * @return int * @throws ParseException * Super class for exceptions during parsing. 
*/
    private int getPrevOffset() throws ParseException {
        switch (encoding) {
        case FORMAT_UTF8: {
            int candidate = offset;
            // XMLDoc is declared as char[] in this class, so the "< 0" test cannot
            // succeed and the loop body runs exactly once.
            do {
                candidate--;
            } while (XMLDoc[candidate] < 0 && ((XMLDoc[candidate] & (byte) 0xc0) == (byte) 0x80));
            return candidate;
        }

        case FORMAT_ASCII:
        case FORMAT_ISO_8859_1:
        case FORMAT_ISO_8859_2:
        case FORMAT_ISO_8859_3:
        case FORMAT_ISO_8859_4:
        case FORMAT_ISO_8859_5:
        case FORMAT_ISO_8859_6:
        case FORMAT_ISO_8859_7:
        case FORMAT_ISO_8859_8:
        case FORMAT_ISO_8859_9:
        case FORMAT_ISO_8859_10:
        case FORMAT_ISO_8859_11:
        case FORMAT_ISO_8859_13:
        case FORMAT_ISO_8859_14:
        case FORMAT_ISO_8859_15:
        case FORMAT_WIN_1250:
        case FORMAT_WIN_1251:
        case FORMAT_WIN_1252:
        case FORMAT_WIN_1253:
        case FORMAT_WIN_1254:
        case FORMAT_WIN_1255:
        case FORMAT_WIN_1256:
        case FORMAT_WIN_1257:
        case FORMAT_WIN_1258:
            return offset - 1;

        case FORMAT_UTF_16LE:
        case FORMAT_UTF_16BE: {
            // both UTF-16 flavours use the very same computation
            final int unit = (XMLDoc[offset] & 0xff) << 8 | (XMLDoc[offset + 1] & 0xff);
            final boolean outsideSurrogateRange = unit < 0xd800 || unit > 0xdfff;
            return outsideSurrogateRange ? offset - 2 : offset - 4;
        }

        default:
            throw new ParseException("Other Error: Should never happen");
        }
    }

    /**
     * Verifies that no two prefixed attributes of the current element share both
     * the local name and the namespace URL.
     * <p>
     * Each entry of prefixed_attr_name_array is decoded here as: name offset in
     * the upper 32 bits, prefix length in bits 16-31 and qname length in bits
     * 0-15. prefix_URL_array supplies, per attribute, the index of the nsBuffer2
     * entry (URL offset in the upper half, URL length in the lower half).
     *
     * @throws ParseException
     *             if two attributes have the same local name and matchURL reports
     *             their URLs as equal
     */
    private void checkQualifiedAttributeUniqueness() throws ParseException {
        for (int i = 0; i < prefixed_attr_count; i++) {
            final int prefixLen1 = (int) ((prefixed_attr_name_array[i] & 0xffff0000L) >> 16);
            final int localLen1 = (int) ((prefixed_attr_name_array[i] & 0xffffL)) - prefixLen1 - increment;
            final int localOs1 = (int) (prefixed_attr_name_array[i] >> 32) + prefixLen1 + increment;
            final int urlLen1 = nsBuffer2.lower32At(prefix_URL_array[i]);
            final int urlOs1 = nsBuffer2.upper32At(prefix_URL_array[i]);

            for (int j = i + 1; j < prefixed_attr_count; j++) {
                final int prefixLen2 = (int) ((prefixed_attr_name_array[j] & 0xffff0000L) >> 16);
                final int localLen2 = (int) ((prefixed_attr_name_array[j] & 0xffffL)) - prefixLen2 - increment;
                final int localOs2 = (int) (prefixed_attr_name_array[j] >> 32) + prefixLen2 + increment;

                if (localLen1 != localLen2) {
                    continue;
                }
                // compare the two local names char by char
                int same = 0;
                while (same < localLen1 && XMLDoc[localOs1 + same] == XMLDoc[localOs2 + same]) {
                    same++;
                }
                if (same != localLen1) {
                    continue;
                }
                // identical local names: the namespace URLs decide
                final int urlLen2 = nsBuffer2.lower32At(prefix_URL_array[j]);
                final int urlOs2 = nsBuffer2.upper32At(prefix_URL_array[j]);
                if (matchURL(urlOs1, urlLen1, urlOs2, urlLen2)) {
                    throw new ParseException(" qualified attribute names collide " + formatLineNumber(localOs2));
                }
            }
        }
    }

    /**
     * Binds the prefix of every prefixed attribute of the current element to a
     * namespace declaration.
     * <p>
     * nsBuffer3 is searched from its last entry towards the first, so the most
     * recently recorded declaration of a prefix wins. The index of the matching
     * entry is stored in prefix_URL_array.
     *
     * @throws ParseException
     *             if no declaration is found for a prefix
     */
    private void qualifyAttributes() throws ParseException {
        final int newest = nsBuffer3.size - 1;

        for (int j = 0; j < prefixed_attr_count; j++) {
            final int preLen = (int) ((prefixed_attr_name_array[j] & 0xffff0000L) >> 16);
            final int preOs = (int) (prefixed_attr_name_array[j] >> 32);

            int i = newest;
            for (; i >= 0; i--) {
                // upper half of an nsBuffer3 entry: bits 16-31 and bits 0-15 hold two
                // lengths; their difference is compared with prefix length + increment
                final int lengths = nsBuffer3.upper32At(i);
                if ((lengths & 0xffff) - (lengths >> 16) != preLen + increment) {
                    continue;
                }
                final int os = nsBuffer3.lower32At(i) + (lengths >> 16) + increment;
                int same = 0;
                while (same < preLen && XMLDoc[os + same] == XMLDoc[preOs + same]) {
                    same++;
                }
                if (same == preLen) {
                    break; // found the match
                }
            }

            if (i < 0) {
                throw new ParseException("Name space qualification Exception: prefixed attribute not qualified\n"
                        + formatLineNumber(preOs));
            }
            prefix_URL_array[j] = i;
            // no need to check if xml is the prefix
        }
    }

    /**
     * Tells whether a namespace URL is one of the two reserved URLs.
     *
     * @param byte_offset
     *            offset at which the URL starts
     * @param length
     *            length of the URL
     * @return 1 for "http://www.w3.org/XML/1998/namespace", 2 for
     *         "http://www.w3.org/2000/xmlns/", 0 otherwise
     */
    private int identifyNsURL(final int byte_offset, final int length) {
        final String xmlnsTail = "2000/xmlns/";
        final String xmlUrl = "http://www.w3.org/XML/1998/namespace";
        final int end = byte_offset + length;

        // the shorter of the two reserved URLs is 29 chars long
        if (length < 29 || (increment == 2 && length < 58)) {
            return 0;
        }

        int pos = byte_offset;
        long c;
        int i;

        // both URLs start with the same 18 chars: "http://www.w3.org/"
        for (i = 0; i < 18 && pos < end; i++) {
            c = _getCharResolved(pos);
            if (xmlUrl.charAt(i) != (int) c) {
                return 0;
            }
            pos += (int) (c >> 32);
        }

        // remember where the common part stops, then try the xmlns URL
        final int afterCommonPart = pos;
        for (i = 0; i < 11 && pos < end; i++) {
            c = _getCharResolved(pos);
            if (xmlnsTail.charAt(i) != (int) c) {
                break;
            }
            pos += (int) (c >> 32);
        }
        if (pos == end) {
            return 2;
        }

        // rewind and try the xml URL
        pos = afterCommonPart;
        for (i = 18; i < 36 && pos < end; i++) {
            c = _getCharResolved(pos);
            if (xmlUrl.charAt(i) != (int) c) {
                return 0;
            }
            pos += (int) (c >> 32);
        }
        return pos == end ? 1 : 0;
    }

    /**
     * Tests whether the chars starting at {@code byte_offset} spell "xml"; the
     * layout that is checked depends on the encoding.
     *
     * @param byte_offset
     *            offset of the first char to look at
     * @return true if "xml" is found there
     */
    private boolean matchXML(final int byte_offset) {
        if (encoding < FORMAT_UTF_16BE) {
            return XMLDoc[byte_offset] == 'x' && XMLDoc[byte_offset + 1] == 'm' && XMLDoc[byte_offset + 2] == 'l';
        }
        if (encoding == FORMAT_UTF_16LE) {
            return XMLDoc[byte_offset] == 'x' && XMLDoc[byte_offset + 1] == 0 && XMLDoc[byte_offset + 2] == 'm'
                    && XMLDoc[byte_offset + 3] == 0 && XMLDoc[byte_offset + 4] == 'l'
                    && XMLDoc[byte_offset + 5] == 0;
        }
        return XMLDoc[byte_offset] == 0 && XMLDoc[byte_offset + 1] == 'x' && XMLDoc[byte_offset + 2] == 0
                && XMLDoc[byte_offset + 3] == 'm' && XMLDoc[byte_offset + 4] == 0
                && XMLDoc[byte_offset + 5] == 'l';
    }

    /**
     * Rejects "xmlns" as the prefix being declared; the layout that is checked
     * depends on the encoding.
     *
     * @param byte_offset
     *            offset of the first char of the declared prefix
     * @throws ParseException
     *             if the chars starting at byte_offset spell "xmlns"
     */
    private void disallow_xmlns(final int byte_offset) throws ParseException {
        final boolean spellsXmlns;
        if (encoding < FORMAT_UTF_16BE) {
            spellsXmlns = XMLDoc[byte_offset] == 'x' && XMLDoc[byte_offset + 1] == 'm'
                    && XMLDoc[byte_offset + 2] == 'l' && XMLDoc[byte_offset + 3] == 'n'
                    && XMLDoc[byte_offset + 4] == 's';
        } else if (encoding == FORMAT_UTF_16LE) {
            spellsXmlns = XMLDoc[byte_offset] == 'x' && XMLDoc[byte_offset + 1] == 0
                    && XMLDoc[byte_offset + 2] == 'm' && XMLDoc[byte_offset + 3] == 0
                    && XMLDoc[byte_offset + 4] == 'l' && XMLDoc[byte_offset + 5] == 0
                    && XMLDoc[byte_offset + 6] == 'n' && XMLDoc[byte_offset + 7] == 0
                    && XMLDoc[byte_offset + 8] == 's' && XMLDoc[byte_offset + 9] == 0;
        } else {
            spellsXmlns = XMLDoc[byte_offset] == 0 && XMLDoc[byte_offset + 1] == 'x'
                    && XMLDoc[byte_offset + 2] == 0 && XMLDoc[byte_offset + 3] == 'm'
                    && XMLDoc[byte_offset + 4] == 0 && XMLDoc[byte_offset + 5] == 'l'
                    && XMLDoc[byte_offset + 6] == 0 && XMLDoc[byte_offset + 7] == 'n'
                    && XMLDoc[byte_offset + 8] == 0 && XMLDoc[byte_offset + 9] == 's';
        }
        if (spellsXmlns) {
            throw new ParseException("xmlns as a ns prefix can't be re-declared" + formatLineNumber(byte_offset));
        }
    }
    // private /** * This private method processes CDATA section * * @return the parser state after which the parser loop jumps to * @throws ParseException * @throws EncodingException * @throws EOFException */ private int process_cdata() throws ParseException, EncodingException, EOFException { int parser_state; while (true) { ch = r.getChar(); if (XMLChar.isValidChar(ch)) { if (ch == ']' && r.skipChar(']')) { while (r.skipChar(']')) { ; } if (r.skipChar('>')) { break; } /* * else * throw new ParseException( * "Error in CDATA: Invalid termination sequence" * + formatLineNumber()); */ } } else { throw new ParseException("Error in CDATA: Invalid Char" + formatLineNumber()); } } length1 = offset - temp_offset - (increment << 1) - increment; if (singleByteEncoding) {// if (encoding < FORMAT_UTF_16BE){ writeVTDText(temp_offset, length1, TOKEN_CDATA_VAL, depth); } else { writeVTDText(temp_offset >> 1, length1 >> 1, TOKEN_CDATA_VAL, depth); } // System.out.println(" " + (temp_offset) + " " + length1 + " CDATA " + depth); temp_offset = offset; // ch = getCharAfterSe(); ch = getCharAfterS(); if (ch == '<') { if (ws) { addWhiteSpaceRecord(); } parser_state = STATE_LT_SEEN; } else if (XMLChar.isContentChar(ch)) { // temp_offset = offset-1; parser_state = STATE_TEXT; } else if (ch == '&') { final int startOfEntityBody = offset; final int entity = entityIdentifier(); XMLDoc[startOfEntityBody - 1] = (char) entity; for (int i = startOfEntityBody; i < offset; i++) { XMLDoc[i] = 0; } parser_state = STATE_TEXT; // temp_offset = offset; } else if (ch == ']') { // temp_offset = offset-1; if (r.skipChar(']')) { while (r.skipChar(']')) { } if (r.skipChar('>')) { throw new ParseException("Error in text content: ]]> in 
text content" + formatLineNumber()); } } parser_state = STATE_TEXT; } else { throw new ParseException("Other Error: Invalid char in xml" + formatLineNumber()); } return parser_state; } /** * This private method process comment * * @return the parser state after which the parser loop jumps to * @throws ParseException * @throws EncodingException * @throws EOFException */ private int process_comment() throws ParseException, EncodingException, EOFException { int parser_state; while (true) { ch = r.getChar(); if (XMLChar.isValidChar(ch)) { if (ch == '-' && r.skipChar('-')) { length1 = offset - temp_offset - (increment << 1); break; } } else { throw new ParseException("Error in comment: Invalid Char" + formatLineNumber()); } } if (r.getChar() == '>') { // System.out.println(" " + (temp_offset) + " " + length1 + " comment " + depth); if (singleByteEncoding) { writeVTDText(temp_offset, length1, TOKEN_COMMENT, depth); } else { writeVTDText(temp_offset >> 1, length1 >> 1, TOKEN_COMMENT, depth); } // length1 = 0; temp_offset = offset; // ch = getCharAfterSe(); ch = getCharAfterS(); if (ch == '<') { if (ws) { addWhiteSpaceRecord(); } parser_state = STATE_LT_SEEN; } else if (XMLChar.isContentChar(ch)) { // temp_offset = offset; parser_state = STATE_TEXT; } else if (ch == '&') { final int startOfEntityBody = offset; final int entity = entityIdentifier(); XMLDoc[startOfEntityBody - 1] = (char) entity; for (int i = startOfEntityBody; i < offset; i++) { XMLDoc[i] = 0; } parser_state = STATE_TEXT; } else if (ch == ']') { if (r.skipChar(']')) { while (r.skipChar(']')) { } if (r.skipChar('>')) { throw new ParseException("Error in text content: ]]> in text content" + formatLineNumber()); } } parser_state = STATE_TEXT; } else { throw new ParseException("Error in text content: Invalid char" + formatLineNumber()); } return parser_state; } else { throw new ParseException("Error in comment: Invalid terminating sequence" + formatLineNumber()); } } /** * This private method processes 
declaration attributes * * @return the parser state after which the parser loop jumps to * @throws ParseException * @throws EncodingException * @throws EOFException */ private int process_dec_attr() throws ParseException, EncodingException, EOFException { int parser_state; if (ch == 'v' && r.skipChar('e') && r.skipChar('r') && r.skipChar('s') && r.skipChar('i') && r.skipChar('o') && r.skipChar('n')) { ch = getCharAfterS(); if (ch == '=') { /* * System.out.println( * " " + (temp_offset - 1) + " " + 7 + " dec attr name version " + depth); */ if (singleByteEncoding) { _writeVTD(temp_offset - 1, 7, TOKEN_DEC_ATTR_NAME, depth); } else { _writeVTD((temp_offset - 2) >> 1, 7, TOKEN_DEC_ATTR_NAME, depth); } } else { throw new ParseException("XML decl error: Invalid char" + formatLineNumber()); } } else { throw new ParseException("XML decl error: should be version" + formatLineNumber()); } ch_temp = getCharAfterS(); if (ch_temp != '\'' && ch_temp != '"') { throw new ParseException("XML decl error: Invalid char to start attr name" + formatLineNumber()); } temp_offset = offset; // support 1.0 or 1.1 if (r.skipChar('1') && r.skipChar('.') && (r.skipChar('0') || r.skipChar('1'))) { /* * System.out.println( * " " + temp_offset + " " + 3 + " dec attr val (version)" + depth); */ if (singleByteEncoding) { _writeVTD(temp_offset, 3, TOKEN_DEC_ATTR_VAL, depth); } else { _writeVTD(temp_offset >> 1, 3, TOKEN_DEC_ATTR_VAL, depth); } } else { throw new ParseException("XML decl error: Invalid version(other than 1.0 or 1.1) detected" + formatLineNumber()); } if (!r.skipChar(ch_temp)) { throw new ParseException("XML decl error: version not terminated properly" + formatLineNumber()); } ch = r.getChar(); // ? 
space or e if (XMLChar.isSpaceChar(ch)) { ch = getCharAfterS(); temp_offset = offset - increment; if (ch == 'e') { if (r.skipChar('n') && r.skipChar('c') && r.skipChar('o') && r.skipChar('d') && r.skipChar('i') && r.skipChar('n') && r.skipChar('g')) { ch = r.getChar(); if (XMLChar.isSpaceChar(ch)) { ch = getCharAfterS(); } if (ch == '=') { /* * System.out.println( * " " + (temp_offset) + " " + 8 + " dec attr name (encoding) " + depth); */ if (singleByteEncoding) { _writeVTD(temp_offset, 8, TOKEN_DEC_ATTR_NAME, depth); } else { _writeVTD(temp_offset >> 1, 8, TOKEN_DEC_ATTR_NAME, depth); } } else { throw new ParseException("XML decl error: Invalid char" + formatLineNumber()); } ch_temp = getCharAfterS(); if (ch_temp != '"' && ch_temp != '\'') { throw new ParseException("XML decl error: Invalid char to start attr name" + formatLineNumber()); } temp_offset = offset; ch = r.getChar(); switch (ch) { case 'a': case 'A': if ((r.skipChar('s') || r.skipChar('S')) && (r.skipChar('c') || r.skipChar('C')) && (r.skipChar('i') || r.skipChar('I')) && (r.skipChar('i') || r.skipChar('I')) && r.skipChar(ch_temp)) { if (encoding != FORMAT_UTF_16LE && encoding != FORMAT_UTF_16BE) { // if (must_utf_8) { // throw new EncodingException("Can't switch from UTF-8" + formatLineNumber()); // } // encoding = FORMAT_ASCII; // r = new ASCIIReader(); /* * System.out.println( * " " + (temp_offset) + " " + 5 + " dec attr val (encoding) " + depth); */ _writeVTD(temp_offset, 5, TOKEN_DEC_ATTR_VAL, depth); break; } else { throw new ParseException("XML decl error: Can't switch encoding to ASCII" + formatLineNumber()); } } throw new ParseException("XML decl error: Invalid Encoding" + formatLineNumber()); case 'c': case 'C': // matchCPEncoding(); break; case 'i': case 'I': // matchISOEncoding(); break; case 'u': case 'U': // matchUTFEncoding(); break; // now deal with windows encoding case 'w': case 'W': // matchWindowsEncoding(); break; default: throw new ParseException("XML decl Error: invalid encoding" 
+ formatLineNumber()); } ch = r.getChar(); if (XMLChar.isSpaceChar(ch)) { ch = getCharAfterS(); } temp_offset = offset - increment; } else { throw new ParseException("XML decl Error: Invalid char" + formatLineNumber()); } } if (ch == 's') { if (r.skipChar('t') && r.skipChar('a') && r.skipChar('n') && r.skipChar('d') && r.skipChar('a') && r.skipChar('l') && r.skipChar('o') && r.skipChar('n') && r.skipChar('e')) { ch = getCharAfterS(); if (ch != '=') { throw new ParseException("XML decl error: Invalid char" + formatLineNumber()); } /* * System.out.println( * " " + temp_offset + " " + 3 + " dec attr name (standalone) " + depth); */ if (singleByteEncoding) { _writeVTD(temp_offset, 10, TOKEN_DEC_ATTR_NAME, depth); } else { _writeVTD(temp_offset >> 1, 10, TOKEN_DEC_ATTR_NAME, depth); } ch_temp = getCharAfterS(); temp_offset = offset; if (ch_temp != '"' && ch_temp != '\'') { throw new ParseException("XML decl error: Invalid char to start attr name" + formatLineNumber()); } ch = r.getChar(); if (ch == 'y') { if (r.skipChar('e') && r.skipChar('s') && r.skipChar(ch_temp)) { /* * System.out.println( * " " + (temp_offset) + " " + 3 + " dec attr val (standalone) " + depth); */ if (singleByteEncoding) { _writeVTD(temp_offset, 3, TOKEN_DEC_ATTR_VAL, depth); } else { _writeVTD(temp_offset >> 1, 3, TOKEN_DEC_ATTR_VAL, depth); } } else { throw new ParseException("XML decl error: invalid val for standalone" + formatLineNumber()); } } else if (ch == 'n') { if (r.skipChar('o') && r.skipChar(ch_temp)) { /* * System.out.println( * " " + (temp_offset) + " " + 2 + " dec attr val (standalone)" + depth); */ if (singleByteEncoding) { _writeVTD(temp_offset, 2, TOKEN_DEC_ATTR_VAL, depth); } else { _writeVTD(temp_offset >> 1, 2, TOKEN_DEC_ATTR_VAL, depth); } } else { throw new ParseException("XML decl error: invalid val for standalone" + formatLineNumber()); } } else { throw new ParseException("XML decl error: invalid val for standalone" + formatLineNumber()); } } else { throw new 
ParseException("XML decl error" + formatLineNumber()); } ch = r.getChar(); if (XMLChar.isSpaceChar(ch)) { ch = getCharAfterS(); } } } if (ch == '?' && r.skipChar('>')) { temp_offset = offset; ch = getCharAfterS(); if (ch == '<') { parser_state = STATE_LT_SEEN; } else { throw new ParseException("Other Error: Invalid Char in XML" + formatLineNumber()); } } else { throw new ParseException("XML decl Error: Invalid termination sequence" + formatLineNumber()); } return parser_state; } /** * This private method process DTD * * @return the parser state after which the parser loop jumps to * @throws ParseException * @throws EncodingException * @throws EOFException */ private int process_doc_type() throws ParseException, EncodingException, EOFException { int z = 1, parser_state; while (true) { ch = r.getChar(); if (XMLChar.isValidChar(ch)) { if (ch == '>') { z--; } else if (ch == '<') { z++; } if (z == 0) { break; } } else { throw new ParseException("Error in DOCTYPE: Invalid char" + formatLineNumber()); } } length1 = offset - temp_offset - increment; /* * System.out.println( * " " + (temp_offset) + " " + length1 + " DOCTYPE val " + depth); */ if (singleByteEncoding) {// if (encoding < FORMAT_UTF_16BE){ if (length1 > MAX_TOKEN_LENGTH) { throw new ParseException("Token Length Error:" + " DTD val too long (>0xfffff)" + formatLineNumber()); } _writeVTD(temp_offset, length1, TOKEN_DTD_VAL, depth); } else { if (length1 > (MAX_TOKEN_LENGTH << 1)) { throw new ParseException("Token Length Error:" + " DTD val too long (>0xfffff)" + formatLineNumber()); } _writeVTD(temp_offset >> 1, length1 >> 1, TOKEN_DTD_VAL, depth); } ch = getCharAfterS(); if (ch == '<') { parser_state = STATE_LT_SEEN; } else { throw new ParseException("Other Error: Invalid char in xml" + formatLineNumber()); } return parser_state; } /** * This private method process the comment after the root document * * @return the parser state after which the parser loop jumps to * @throws ParseException */ private int 
process_end_comment() throws ParseException { int parser_state; while (true) { ch = r.getChar(); if (XMLChar.isValidChar(ch)) { if (ch == '-' && r.skipChar('-')) { length1 = offset - temp_offset - (increment << 1); break; } } else { throw new ParseException("Error in comment: Invalid Char" + formatLineNumber()); } } if (r.getChar() == '>') { // System.out.println(" " + temp_offset + " " + length1 + " comment " + depth); if (singleByteEncoding) { writeVTDText(temp_offset, length1, TOKEN_COMMENT, depth); } else { writeVTDText(temp_offset >> 1, length1 >> 1, TOKEN_COMMENT, depth); } parser_state = STATE_DOC_END; return parser_state; } throw new ParseException("Error in comment: '-->' expected" + formatLineNumber()); } private int process_end_doc() throws ParseException, EncodingException, EOFException { int parser_state; ch = getCharAfterS(); /* eof exception should be thrown here for premature ending */ if (ch == '<') { if (r.skipChar('?')) { /* processing instruction after end tag of root element */ temp_offset = offset; parser_state = STATE_END_PI; return parser_state; } else if (r.skipChar('!') && r.skipChar('-') && r.skipChar('-')) { // comments allowed after the end tag of the root element temp_offset = offset; parser_state = STATE_END_COMMENT; return parser_state; } } throw new ParseException("Other Error: XML not terminated properly" + formatLineNumber()); } /** * This private method processes PI after root document * * @return the parser state after which the parser loop jumps to * @throws ParseException * @throws EncodingException * @throws EOFException */ private int process_end_pi() throws ParseException, EncodingException, EOFException { int parser_state; ch = r.getChar(); if (XMLChar.isNameStartChar(ch)) { if ((ch == 'x' || ch == 'X') && (r.skipChar('m') || r.skipChar('M')) && (r.skipChar('l') && r.skipChar('L'))) { // temp_offset = offset; ch = r.getChar(); if (XMLChar.isSpaceChar(ch) || ch == '?') { throw new ParseException("Error in PI: [xX][mM][lL] 
not a valid PI target" + formatLineNumber()); // offset = temp_offset; } } while (true) { // ch = getChar(); if (!XMLChar.isNameChar(ch)) { break; } ch = r.getChar(); } length1 = offset - temp_offset - increment; /* * System.out.println( * "" * + (char) XMLDoc[temp_offset] * + " " * + (temp_offset) * + " " * + length1 * + " PI Target " * + depth); */ if (singleByteEncoding) {// if (encoding < FORMAT_UTF_16BE){ if (length1 > MAX_TOKEN_LENGTH) { throw new ParseException("Token Length Error:" + "PI name too long (>0xfffff)" + formatLineNumber()); } _writeVTD(temp_offset, length1, TOKEN_PI_NAME, depth); } else { if (length1 > (MAX_TOKEN_LENGTH << 1)) { throw new ParseException("Token Length Error:" + "PI name too long (>0xfffff)" + formatLineNumber()); } _writeVTD(temp_offset >> 1, length1 >> 1, TOKEN_PI_NAME, depth); } // length1 = 0; temp_offset = offset; if (XMLChar.isSpaceChar(ch)) { ch = getCharAfterS(); while (true) { if (XMLChar.isValidChar(ch)) { if (ch == '?') { if (r.skipChar('>')) { parser_state = STATE_DOC_END; break; } else { throw new ParseException("Error in PI: invalid termination sequence" + formatLineNumber()); } } } else { throw new ParseException("Error in PI: Invalid char in PI val" + formatLineNumber()); } ch = r.getChar(); } length1 = offset - temp_offset - (increment << 1); if (singleByteEncoding) { if (length1 > MAX_TOKEN_LENGTH) { throw new ParseException("Token Length Error:" + "PI val too long (>0xfffff)" + formatLineNumber()); } _writeVTD(temp_offset, length1, TOKEN_PI_VAL, depth); } else { if (length1 > (MAX_TOKEN_LENGTH << 1)) { throw new ParseException("Token Length Error:" + "PI val too long (>0xfffff)" + formatLineNumber()); } _writeVTD(temp_offset >> 1, length1 >> 1, TOKEN_PI_VAL, depth); } // System.out.println(" " + temp_offset + " " + length1 + " PI val " + depth); } else { if (singleByteEncoding) { _writeVTD((temp_offset), 0, TOKEN_PI_VAL, depth); } else { _writeVTD((temp_offset) >> 1, 0, TOKEN_PI_VAL, depth); } if ((ch == '?') && 
r.skipChar('>')) { parser_state = STATE_DOC_END; } else { throw new ParseException("Error in PI: invalid termination sequence" + formatLineNumber()); } } // parser_state = STATE_DOC_END; } else { throw new ParseException("Error in PI: invalid char in PI target" + formatLineNumber()); } return parser_state; } private int process_ex_seen() throws ParseException, EncodingException, EOFException { int parser_state; boolean hasDTD = false; ch = r.getChar(); switch (ch) { case '-': if (r.skipChar('-')) { temp_offset = offset; parser_state = STATE_COMMENT; break; } else { throw new ParseException("Error in comment: Invalid char sequence to start a comment" + formatLineNumber()); } case '[': if (r.skipChar('C') && r.skipChar('D') && r.skipChar('A') && r.skipChar('T') && r.skipChar('A') && r.skipChar('[') && (depth != -1)) { temp_offset = offset; parser_state = STATE_CDATA; break; } else { if (depth == -1) { throw new ParseException("Error in CDATA: Wrong place for CDATA" + formatLineNumber()); } throw new ParseException("Error in CDATA: Invalid char sequence for CDATA" + formatLineNumber()); } case 'D': if (r.skipChar('O') && r.skipChar('C') && r.skipChar('T') && r.skipChar('Y') && r.skipChar('P') && r.skipChar('E') && (depth == -1) && !hasDTD) { hasDTD = true; temp_offset = offset; parser_state = STATE_DOCTYPE; break; } else { if (hasDTD == true) { throw new ParseException("Error for DOCTYPE: Only DOCTYPE allowed" + formatLineNumber()); } if (depth != -1) { throw new ParseException("Error for DOCTYPE: DTD at wrong place" + formatLineNumber()); } throw new ParseException("Error for DOCTYPE: Invalid char sequence for DOCTYPE" + formatLineNumber()); } default: throw new ParseException("Other Error: Unrecognized char after <!" 
+ formatLineNumber()); } return parser_state; } /** * This private method processes PI tag * * @return the parser state after which the parser loop jumps to * @throws ParseException * @throws EncodingException * @throws EOFException */ private int process_pi_tag() throws ParseException, EncodingException, EOFException { int parser_state; while (true) { ch = r.getChar(); if (!XMLChar.isNameChar(ch)) { break; // System.out.println(" ch ==> "+(char)ch); } } length1 = offset - temp_offset - increment; /* * System.out.println( * ((char) XMLDoc[temp_offset]) * + " " * + (temp_offset) * + " " * + length1 * + " PI Target " * + depth); */ // if (encoding < FORMAT_UTF_16BE){ if (singleByteEncoding) { if (length1 > MAX_TOKEN_LENGTH) { throw new ParseException("Token Length Error:" + " PI name too long (>0xfffff)" + formatLineNumber()); } _writeVTD((temp_offset), length1, TOKEN_PI_NAME, depth); } else { if (length1 > (MAX_TOKEN_LENGTH << 1)) { throw new ParseException("Token Length Error:" + " PI name too long (>0xfffff)" + formatLineNumber()); } _writeVTD((temp_offset) >> 1, (length1 >> 1), TOKEN_PI_NAME, depth); } // length1 = 0; // temp_offset = offset; /* * if (XMLChar.isSpaceChar(ch)) { * ch = r.getChar(); * } */ // ch = r.getChar(); if (ch == '?') { // insert zero length pi name tag if (singleByteEncoding) { _writeVTD((temp_offset), 0, TOKEN_PI_VAL, depth); } else { _writeVTD((temp_offset) >> 1, (0), TOKEN_PI_VAL, depth); } if (r.skipChar('>')) { temp_offset = offset; // ch = getCharAfterSe(); ch = getCharAfterS(); if (ch == '<') { if (ws) { addWhiteSpaceRecord(); } parser_state = STATE_LT_SEEN; } else if (XMLChar.isContentChar(ch)) { parser_state = STATE_TEXT; } else if (ch == '&') { // has_amp = true; final int startOfEntityBody = offset; final int entity = entityIdentifier(); XMLDoc[startOfEntityBody - 1] = (char) entity; for (int i = startOfEntityBody; i < offset; i++) { XMLDoc[i] = 0; } parser_state = STATE_TEXT; } else if (ch == ']') { if (r.skipChar(']')) { while 
(r.skipChar(']')) { } if (r.skipChar('>')) { throw new ParseException("Error in text content: ]]> in text content" + formatLineNumber()); } } parser_state = STATE_TEXT; } else { throw new ParseException("Error in text content: Invalid char" + formatLineNumber()); } return parser_state; } else { throw new ParseException("Error in PI: invalid termination sequence" + formatLineNumber()); } } parser_state = STATE_PI_VAL; return parser_state; } /** * This private method processes PI val * * @return the parser state after which the parser loop jumps to * @throws ParseException * @throws EncodingException * @throws EOFException */ private int process_pi_val() throws ParseException, EncodingException, EOFException { int parser_state; if (!XMLChar.isSpaceChar(ch)) { throw new ParseException("Error in PI: invalid termination sequence" + formatLineNumber()); } temp_offset = offset; ch = r.getChar(); while (true) { if (XMLChar.isValidChar(ch)) { // System.out.println(""+(char)ch); if (ch == '?') { if (r.skipChar('>')) { break; } /* * else * throw new ParseException( * "Error in PI: invalid termination sequence for PI" * + formatLineNumber()); */ } } else { throw new ParseException("Errors in PI: Invalid char in PI val" + formatLineNumber()); } ch = r.getChar(); } length1 = offset - temp_offset - (increment << 1); /* * System.out.println( * ((char) XMLDoc[temp_offset]) * + " " * + (temp_offset) * + " " * + length1 * + " PI val " * + depth); */ // if (length1 != 0) if (singleByteEncoding) {// if (encoding < FORMAT_UTF_16BE){ if (length1 > MAX_TOKEN_LENGTH) { throw new ParseException("Token Length Error:" + "PI VAL too long (>0xfffff)" + formatLineNumber()); } _writeVTD(temp_offset, length1, TOKEN_PI_VAL, depth); } else { if (length1 > (MAX_TOKEN_LENGTH << 1)) { throw new ParseException("Token Length Error:" + "PI VAL too long (>0xfffff)" + formatLineNumber()); } _writeVTD(temp_offset >> 1, length1 >> 1, TOKEN_PI_VAL, depth); } // length1 = 0; temp_offset = offset; // ch = 
getCharAfterSe(); ch = getCharAfterS(); if (ch == '<') { if (ws) { addWhiteSpaceRecord(); } parser_state = STATE_LT_SEEN; } else if (XMLChar.isContentChar(ch)) { // temp_offset = offset; parser_state = STATE_TEXT; } else if (ch == '&') { // has_amp = true; // temp_offset = offset; final int startOfEntityBody = offset; final int entity = entityIdentifier(); XMLDoc[startOfEntityBody - 1] = (char) entity; for (int i = startOfEntityBody; i < offset; i++) { XMLDoc[i] = 0; } parser_state = STATE_TEXT; } else if (ch == ']') { if (r.skipChar(']')) { while (r.skipChar(']')) { } if (r.skipChar('>')) { throw new ParseException("Error in text content: ]]> in text content" + formatLineNumber()); } } parser_state = STATE_TEXT; } else { throw new ParseException("Error in text content: Invalid char" + formatLineNumber()); } return parser_state; } private int process_qm_seen() throws ParseException, EncodingException, EOFException { temp_offset = offset; ch = r.getChar(); if (XMLChar.isNameStartChar(ch)) { // temp_offset = offset; if ((ch == 'x' || ch == 'X') && (r.skipChar('m') || r.skipChar('M')) && (r.skipChar('l') || r.skipChar('L'))) { ch = r.getChar(); if (ch == '?' || XMLChar.isSpaceChar(ch)) { throw new ParseException("Error in PI: [xX][mM][lL] not a valid PI targetname" + formatLineNumber()); } offset = getPrevOffset(); } return STATE_PI_TAG; } throw new ParseException("Other Error: First char after <? 
invalid" + formatLineNumber()); } private int process_start_doc() throws ParseException, EncodingException, EOFException { final int c = r.getChar(); if (c == '<') { temp_offset = offset; // xml decl has to be right after the start of the document if (r.skipChar('?') && (r.skipChar('x') || r.skipChar('X')) && (r.skipChar('m') || r.skipChar('M')) && (r.skipChar('l') || r.skipChar('L'))) { if (r.skipChar(' ') || r.skipChar('\t') || r.skipChar('\n') || r.skipChar('\r')) { ch = getCharAfterS(); temp_offset = offset; return STATE_DEC_ATTR_NAME; } else if (r.skipChar('?')) { throw new ParseException("Error in XML decl: Premature ending" + formatLineNumber()); } } offset = temp_offset; return STATE_LT_SEEN; } else if (c == ' ' || c == '\n' || c == '\r' || c == '\t') { if (getCharAfterS() == '<') { return STATE_LT_SEEN; } } throw new ParseException("Other Error: XML not starting properly" + formatLineNumber()); } /** * Write the VTD and LC into their storage container for where LC depth is 5. * * @param offset * int * @param length * int * @param token_type * int * @param depth * int */ private void writeVTD(final int offset, final int length, final int token_type, final int depth) { VTDBuffer.append(((long) ((token_type << 28) | ((depth & 0xff) << 20) | length) << 32) | offset); switch (depth) { case 0: rootIndex = VTDBuffer.size - 1; break; case 1: if (last_depth == 1) { l1Buffer.append(((long) last_l1_index << 32) | 0xffffffffL); } else if (last_depth == 2) { l2Buffer.append(((long) last_l2_index << 32) | 0xffffffffL); } last_l1_index = VTDBuffer.size - 1; last_depth = 1; break; case 2: if (last_depth == 1) { l1Buffer.append(((long) last_l1_index << 32) + l2Buffer.size); } else if (last_depth == 2) { l2Buffer.append(((long) last_l2_index << 32) | 0xffffffffL); } last_l2_index = VTDBuffer.size - 1; last_depth = 2; break; case 3: l3Buffer.append(VTDBuffer.size - 1); if (last_depth == 2) { l2Buffer.append(((long) last_l2_index << 32) + l3Buffer.size - 1); } last_depth = 3; 
break; default: // rootIndex = VTDBuffer.size() - 1; } } private void _writeVTD(final int offset, final int length, final int token_type, final int depth) { VTDBuffer.append(((long) ((token_type << 28) | ((depth & 0xff) << 20) | length) << 32) | offset); } private void writeVTDText(final int offset, final int length, final int token_type, final int depth) { if (length > MAX_TOKEN_LENGTH) { int k; int r_offset = offset; for (k = length; k > MAX_TOKEN_LENGTH; k = k - MAX_TOKEN_LENGTH) { VTDBuffer.append(((long) ((token_type << 28) | ((depth & 0xff) << 20) | MAX_TOKEN_LENGTH) << 32) | r_offset); r_offset += MAX_TOKEN_LENGTH; } VTDBuffer.append(((long) ((token_type << 28) | ((depth & 0xff) << 20) | k) << 32) | r_offset); } else { VTDBuffer.append(((long) ((token_type << 28) | ((depth & 0xff) << 20) | length) << 32) | offset); } } /** * * @throws ParseException */ private void qualifyElement() throws ParseException { int i = nsBuffer3.size - 1; // two cases: // 1. the current element has no prefix, look for xmlns // 2. 
the current element has prefix, look for xmlns:something final int preLen = (int) ((currentElementRecord & 0xffff000000000000L) >> 48); final int preOs = (int) currentElementRecord; while (i >= 0) { final int t = nsBuffer3.upper32At(i); // with prefix, get full length and prefix length if ((t & 0xffff) - (t >> 16) == preLen) { // doing byte comparison here final int os = nsBuffer3.lower32At(i) + (t >> 16) + increment; int k = 0; for (; k < preLen - increment; k++) { if (XMLDoc[os + k] != XMLDoc[preOs + k]) { break; } } if (k == preLen - increment) { return; // found the match } } /* * if ( (nsBuffer3.upper32At(i) & 0xffff0000) == 0){ * return; * } */ i--; } // no need to check if xml is the prefix if (checkPrefix(preOs, preLen)) { return; } // print line # column# and full element name throw new ParseException("Name space qualification Exception: Element not qualified\n" + formatLineNumber((int) currentElementRecord)); } private boolean checkPrefix(final int os, final int len) { // int i=0; if (encoding < FORMAT_UTF_16BE) { if (len == 4 && XMLDoc[os] == 'x' && XMLDoc[os + 1] == 'm' && XMLDoc[os + 2] == 'l') { return true; } } else if (encoding == FORMAT_UTF_16BE) { if (len == 8 && XMLDoc[os] == 0 && XMLDoc[os + 1] == 'x' && XMLDoc[os + 2] == 0 && XMLDoc[os + 3] == 'm' && XMLDoc[os + 4] == 0 && XMLDoc[os + 5] == 'l') { return true; } } else { if (len == 8 && XMLDoc[os] == 'x' && XMLDoc[os + 1] == 0 && XMLDoc[os + 2] == 'm' && XMLDoc[os + 3] == 0 && XMLDoc[os + 4] == 'l' && XMLDoc[os + 5] == 0) { return true; } } return false; } private boolean checkPrefix2(final int os, final int len) { // int i=0; if (encoding < FORMAT_UTF_16BE) { if (len == 5 && XMLDoc[os] == 'x' && XMLDoc[os + 1] == 'm' && XMLDoc[os + 2] == 'l' && XMLDoc[os + 3] == 'n' && XMLDoc[os + 4] == 's') { return true; } } else if (encoding == FORMAT_UTF_16BE) { if (len == 10 && XMLDoc[os] == 0 && XMLDoc[os + 1] == 'x' && XMLDoc[os + 2] == 0 && XMLDoc[os + 3] == 'm' && XMLDoc[os + 4] == 0 && XMLDoc[os + 5] 
== 'l' && XMLDoc[os + 6] == 0 && XMLDoc[os + 7] == 'n' && XMLDoc[os + 8] == 0 && XMLDoc[os + 9] == 's') { return true; } } else { if (len == 10 && XMLDoc[os] == 'x' && XMLDoc[os + 1] == 0 && XMLDoc[os + 2] == 'm' && XMLDoc[os + 3] == 0 && XMLDoc[os + 4] == 'l' && XMLDoc[os + 5] == 0 && XMLDoc[os + 6] == 'n' && XMLDoc[os + 3] == 0 && XMLDoc[os + 8] == 's' && XMLDoc[os + 5] == 0) { return true; } } return false; } private long _getCharResolved(int byte_offset) { int ch = 0; int val = 0; long inc = 2 << (increment - 1); final long l = r._getChar(byte_offset); ch = (int) l; if (ch != '&') { return l; } // let us handle references here // currentOffset++; byte_offset += increment; ch = getCharUnit(byte_offset); byte_offset += increment; switch (ch) { case '#': ch = getCharUnit(byte_offset); if (ch == 'x') { while (true) { byte_offset += increment; inc += increment; ch = getCharUnit(byte_offset); if (ch >= '0' && ch <= '9') { val = (val << 4) + (ch - '0'); } else if (ch >= 'a' && ch <= 'f') { val = (val << 4) + (ch - 'a' + 10); } else if (ch >= 'A' && ch <= 'F') { val = (val << 4) + (ch - 'A' + 10); } else if (ch == ';') { inc += increment; break; } } } else { while (true) { ch = getCharUnit(byte_offset); byte_offset += increment; inc += increment; if (ch >= '0' && ch <= '9') { val = val * 10 + (ch - '0'); } else if (ch == ';') { break; } } } break; case 'a': ch = getCharUnit(byte_offset); if (encoding < FORMAT_UTF_16BE) { if (ch == 'm') { if (getCharUnit(byte_offset + 1) == 'p' && getCharUnit(byte_offset + 2) == ';') { inc = 5; val = '&'; } } else if (ch == 'p') { if (getCharUnit(byte_offset + 1) == 'o' && getCharUnit(byte_offset + 2) == 's' && getCharUnit(byte_offset + 3) == ';') { inc = 6; val = '\''; } } } else { if (ch == 'm') { if (getCharUnit(byte_offset + 2) == 'p' && getCharUnit(byte_offset + 4) == ';') { inc = 10; val = '&'; } } else if (ch == 'p') { if (getCharUnit(byte_offset + 2) == 'o' && getCharUnit(byte_offset + 4) == 's' && getCharUnit(byte_offset + 6) 
== ';') { inc = 12; val = '\''; } } } break; case 'q': if (encoding < FORMAT_UTF_16BE) { if (getCharUnit(byte_offset) == 'u' && getCharUnit(byte_offset + 1) == 'o' && getCharUnit(byte_offset + 2) == 't' && getCharUnit(byte_offset + 3) == ';') { inc = 6; val = '\"'; } } else { if (getCharUnit(byte_offset) == 'u' && getCharUnit(byte_offset + 2) == 'o' && getCharUnit(byte_offset + 4) == 't' && getCharUnit(byte_offset + 6) == ';') { inc = 12; val = '\"'; } } break; case 'l': if (encoding < FORMAT_UTF_16BE) { if (getCharUnit(byte_offset) == 't' && getCharUnit(byte_offset + 1) == ';') { // offset += 2; inc = 4; val = '<'; } } else { if (getCharUnit(byte_offset) == 't' && getCharUnit(byte_offset + 2) == ';') { // offset += 2; inc = 8; val = '<'; } } break; case 'g': if (encoding < FORMAT_UTF_16BE) { if (getCharUnit(byte_offset) == 't' && getCharUnit(byte_offset + 1) == ';') { inc = 4; val = '>'; } } else { if (getCharUnit(byte_offset) == 't' && getCharUnit(byte_offset + 2) == ';') { inc = 8; val = '>'; } } break; } // currentOffset++; return val | (inc << 32); } // return 0; private int getCharUnit(final int byte_offset) { return (encoding <= 2) ? XMLDoc[byte_offset] & 0xff : (encoding < FORMAT_UTF_16BE) ? r.decode(byte_offset) : (encoding == FORMAT_UTF_16BE) ? 
((XMLDoc[byte_offset]) << 8 | XMLDoc[byte_offset + 1]) : ((XMLDoc[byte_offset + 1]) << 8 | XMLDoc[byte_offset]); } private boolean matchURL(final int bos1, final int len1, final int bos2, final int len2) { long l1, l2; int i1 = bos1, i2 = bos2; final int i3 = bos1 + len1, i4 = bos2 + len2; // System.out.println("--->"+new String(XMLDoc, bos1, len1)+" "+new String(XMLDoc,bos2,len2)); while (i1 < i3 && i2 < i4) { l1 = _getCharResolved(i1); l2 = _getCharResolved(i2); if ((int) l1 != (int) l2) { return false; } i1 += (int) (l1 >> 32); i2 += (int) (l2 >> 32); } if (i1 == i3 && i2 == i4) { return true; } return false; } private void checkAttributeUniqueness() throws ParseException { boolean unique = true; boolean unequal; for (int i = 0; i < attr_count; i++) { unequal = false; final int prevLen = (int) attr_name_array[i]; if (length1 == prevLen) { final int prevOffset = (int) (attr_name_array[i] >> 32); for (int j = 0; j < prevLen; j++) { if (XMLDoc[prevOffset + j] != XMLDoc[temp_offset + j]) { unequal = true; break; } } } else { unequal = true; } unique = unique && unequal; } if (!unique && attr_count != 0) { throw new ParseException("Error in attr: Attr name not unique" + formatLineNumber()); } unique = true; if (attr_count < attr_name_array.length) { attr_name_array[attr_count] = ((long) (temp_offset) << 32) | length1; attr_count++; } else // grow the attr_name_array by 16 { final long[] temp_array = attr_name_array; /* * System.out.println( * "size increase from " * + temp_array.length * + " to " * + (attr_count + 16)); */ attr_name_array = new long[attr_count + ATTR_NAME_ARRAY_SIZE]; System.arraycopy(temp_array, 0, attr_name_array, 0, attr_count); /* * for (int i = 0; i < attr_count; i++) { * attr_name_array[i] = temp_array[i]; * } */ attr_name_array[attr_count] = ((long) (temp_offset) << 32) | length1; attr_count++; } // insert prefix attr node into the prefixed_attr_name array // xml:something will not be inserted // System.out.println(" prefixed attr count 
===>"+prefixed_attr_count); // System.out.println(" length2 ===>"+length2); if (ns && !is_ns && length2 != 0) { if ((increment == 1 && length2 == 3 && matchXML(temp_offset)) || (increment == 2 && length2 == 6 && matchXML(temp_offset))) { return; } else if (prefixed_attr_count < prefixed_attr_name_array.length) { prefixed_attr_name_array[prefixed_attr_count] = ((long) (temp_offset) << 32) | (length2 << 16) | length1; prefixed_attr_count++; } else { final long[] temp_array1 = prefixed_attr_name_array; prefixed_attr_name_array = new long[prefixed_attr_count + ATTR_NAME_ARRAY_SIZE]; prefix_URL_array = new int[prefixed_attr_count + ATTR_NAME_ARRAY_SIZE]; System.arraycopy(temp_array1, 0, prefixed_attr_name_array, 0, prefixed_attr_count); // System.arraycopy(temp_array1, 0, prefixed_attr_val_array, 0, prefixed_attr_count) /* * for (int i = 0; i < attr_count; i++) { * attr_name_array[i] = temp_array[i]; * } */ prefixed_attr_name_array[prefixed_attr_count] = ((long) (temp_offset) << 32) | (length2 << 16) | length1; prefixed_attr_count++; } } } private void handleOtherTextChar(final int ch) throws ParseException { if (ch == '&') { final int startOfEntityBody = offset; final int entity = entityIdentifier(); if (!XMLChar.isValidChar(entity)) { throw new ParseException("Error in text content: Invalid char in text content " + formatLineNumber()); } XMLDoc[startOfEntityBody - 1] = (char) entity; for (int i = startOfEntityBody; i < offset; i++) { XMLDoc[i] = 0; } } else if (ch == ']') { if (r.skipChar(']')) { while (r.skipChar(']')) { } if (r.skipChar('>')) { throw new ParseException("Error in text content: ]]> in text content" + formatLineNumber()); } } } else { throw new ParseException("Error in text content: Invalid char in text content " + formatLineNumber()); } } private void handleOtherTextChar2(final int ch) throws ParseException { if (ch == '&') { final int startOfEntityBody = offset; final int entity = entityIdentifier(); XMLDoc[startOfEntityBody - 1] = (char) entity; for 
(int i = startOfEntityBody; i < offset; i++) { XMLDoc[i] = 0; } } else if (ch == ']') { if (r.skipChar(']')) { while (r.skipChar(']')) { } if (r.skipChar('>')) { throw new ParseException("Error in text content: ]]> in text content" + formatLineNumber()); } } // parser_state = STATE_TEXT; } else { throw new ParseException("Error in text content: Invalid char" + formatLineNumber()); } } }