package test;

import java.io.*;

import javax.xml.stream.*;

import com.fasterxml.aalto.in.*;
import com.fasterxml.aalto.util.*;

/**
 * Manually-run micro-benchmark that compares three hand-written variants of
 * byte-based name parsing ("Regular", "New" and "New/2") over the contents
 * of a single input file.
 *<p>
 * The input is scanned repeatedly; every byte with value {@code >= 'A'} is
 * treated as the start of a name, and the name is parsed and looked up in
 * (or added to) the symbol table.
 *<p>
 * Note: {@link #loadMoreGuaranteed} and {@link #loadOne} always throw, so
 * the bounds-checked parsing paths fail with {@link IllegalStateException}
 * if the input ends in the middle of a name.
 *<p>
 * The parsing methods are the code being measured; their statement
 * structure is intentionally kept unrolled and free of helper calls.
 */
public final class TestPNamePerf
{
    /** Lowest byte value accepted as a name start character. */
    final static int INT_A = 'A';

    /** Number of passes over the input per timed round. */
    final int mRepCount;

    int mTmpChar = 0;

    /** Complete input data; never reloaded. */
    final byte[] mInputBuffer;

    final ByteBasedPNameTable mSymbols;

    final XmlCharTypes mCharTypes;

    /** Index of the next byte to read from {@link #mInputBuffer}. */
    int mInputPtr;

    /** Number of valid bytes in {@link #mInputBuffer}. */
    int mInputLen;

    /** Scratch buffer for name quads (4 name bytes packed per int); grown on demand. */
    protected int[] mQuadBuffer = new int[64];

    /** Scratch buffer for decoded name characters; grown on demand. */
    protected char[] mNameBuffer = new char[100];

    /**
     * @param data Input bytes to scan for names
     * @param repCount Number of passes over {@code data} per timed round
     */
    public TestPNamePerf(byte[] data, int repCount)
    {
        mInputBuffer = data;
        mInputLen = data.length;
        mRepCount = repCount;
        ReaderConfig cfg = new ReaderConfig();
        cfg.setActualEncoding(CharsetNames.CS_UTF8);
        mSymbols = cfg.getBBSymbols();
        mCharTypes = cfg.getCharTypes();
    }

    /**
     * Runs timed rounds in an endless loop, printing the elapsed time and
     * the accumulated total of each round. Between rounds the thread sleeps
     * and a GC is requested. The loop only ends if the thread is interrupted
     * during one of those pauses (or a parsing method throws).
     */
    public void test() throws IOException, XMLStreamException
    {
        final int TYPES = 3;
        /* Currently fixed to the "New/2" variant; use (round % TYPES)
         * instead to rotate through all three variants.
         */
        final int variant = 0;

        for (int round = 0; true; ++round) {
            String msg = "[null]";
            int total = 0;
            long now = System.currentTimeMillis();

            switch (variant) {
            case 1:
                msg = "[Regular]";
                total = testRegularA();
                break;
            case 2:
                msg = "[New]";
                total = testNewA();
                break;
            case 0:
                msg = "[New/2]";
                total = testNew2A();
                break;
            default:
                throw new IllegalStateException("Unexpected round, #"+round);
            }
            now = System.currentTimeMillis() - now;
            System.out.println(msg+" -> "+now+" msecs (total "+total+")");
            if ((round % TYPES) == 0) {
                System.out.println();
            }
            // Stop benchmarking (instead of silently carrying on) if interrupted
            if (!pause(200L)) {
                return;
            }
            System.gc();
            if (!pause(200L)) {
                return;
            }
        }
    }

    /**
     * Sleeps for the given time.
     *
     * @return True if the sleep completed; false if the thread was
     *   interrupted (in which case the interrupt flag is restored)
     */
    private static boolean pause(long msecs)
    {
        try {
            Thread.sleep(msecs);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /** Runs {@link #testRegular} over the whole input {@code mRepCount} times. */
    private int testRegularA() throws IOException, XMLStreamException
    {
        int total = 0;
        for (int i = 0; i < mRepCount; ++i) {
            mInputPtr = 0;
            total += testRegular();
        }
        return total;
    }

    /** Runs {@link #testNew} over the whole input {@code mRepCount} times. */
    private int testNewA() throws IOException, XMLStreamException
    {
        int total = 0;
        for (int i = 0; i < mRepCount; ++i) {
            mInputPtr = 0;
            total += testNew();
        }
        return total;
    }

    /** Runs {@link #testNew2} over the whole input {@code mRepCount} times. */
    private int testNew2A() throws IOException, XMLStreamException
    {
        int total = 0;
        for (int i = 0; i < mRepCount; ++i) {
            mInputPtr = 0;
            total += testNew2();
        }
        return total;
    }

    /**
     * Single pass over the input using {@link #parsePName}.
     *
     * @return Sum of {@code sizeInQuads()} of all names parsed, plus that of
     *   the last name once more; just the sum (0) if no name was found
     */
    private int testRegular() throws IOException, XMLStreamException
    {
        ByteBasedPName name = null;
        int count = 0;
        while (mInputPtr < mInputLen) {
            byte b = mInputBuffer[mInputPtr++];
            int ch = (int) b & 0xFF;
            /* We'll skip all intervening chars that can't start a name,
             * including white space
             */
            if (ch >= INT_A) {
                name = parsePName(b);
                count += name.sizeInQuads();
            }
        }
        // Input without any name-start byte leaves 'name' null
        return (name == null) ? count : (count + name.sizeInQuads());
    }

    /**
     * Single pass over the input using {@link #parsePNameNew}.
     *
     * @return Sum of {@code sizeInQuads()} of all names parsed, plus that of
     *   the last name once more; just the sum (0) if no name was found
     */
    private int testNew() throws IOException, XMLStreamException
    {
        ByteBasedPName name = null;
        int count = 0;
        while (mInputPtr < mInputLen) {
            byte b = mInputBuffer[mInputPtr++];
            int ch = (int) b & 0xFF;
            /* We'll skip all intervening chars that can't start a name,
             * including white space
             */
            if (ch >= INT_A) {
                name = parsePNameNew(b);
                count += name.sizeInQuads();
            }
        }
        // Input without any name-start byte leaves 'name' null
        return (name == null) ? count : (count + name.sizeInQuads());
    }

    /**
     * Single pass over the input using {@link #parsePNameNew2}.
     *
     * @return Sum of {@code sizeInQuads()} of all names parsed, plus that of
     *   the last name once more; just the sum (0) if no name was found
     */
    private int testNew2() throws IOException, XMLStreamException
    {
        ByteBasedPName name = null;
        int count = 0;
        while (mInputPtr < mInputLen) {
            byte b = mInputBuffer[mInputPtr++];
            int ch = (int) b & 0xFF;
            /* We'll skip all intervening chars that can't start a name,
             * including white space
             */
            if (ch >= INT_A) {
                name = parsePNameNew2(b);
                count += name.sizeInQuads();
            }
        }
        // Input without any name-start byte leaves 'name' null
        return (name == null) ? count : (count + name.sizeInQuads());
    }

    /**
     * "Regular" variant: bounds-checks every read and packs name bytes into
     * quads in a single loop.
     *<p>
     * A byte ends the name if it is below 65 ('A') and is not one of
     * '-' (45), '.' (46), '0'-'9' (48-57) or ':' (58).
     *
     * @param b First byte of the name (already consumed from input)
     */
    protected ByteBasedPName parsePName(byte b) throws XMLStreamException
    {
        int q = b & 0xFF;
        if (q < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode
            reportError("; expected a name start character");
        }
        int[] quads = mQuadBuffer;
        int qix = 0;
        int firstQuad = 0;

        while (true) {
            // Second byte
            if (mInputPtr >= mInputLen) {
                loadMoreGuaranteed();
            }
            int i2 = mInputBuffer[mInputPtr++] & 0xFF;
            /* For other bytes beyond first we have to do bit more complicated
             * check, to reliably find out where name ends. Still can do quite
             * simple checks though
             */
            if (i2 < 65) {
                // Ok; "-" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars
                if (i2 < 45 || i2 > 58 || i2 == 47) {
                    // End of name, a single ascii char?
                    return findPName(q, 1, firstQuad, qix, quads);
                }
            }
            // 3rd byte:
            q = (q << 8) | i2;
            i2 = (int) ((mInputPtr < mInputLen) ? mInputBuffer[mInputPtr++] : loadOne()) & 0xFF;
            if (i2 < 65) {
                if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name?
                    return findPName(q, 2, firstQuad, qix, quads);
                }
            }
            // 4th byte:
            q = (q << 8) | i2;
            i2 = (int) ((mInputPtr < mInputLen) ? mInputBuffer[mInputPtr++] : loadOne()) & 0xFF;
            if (i2 < 65) {
                if (i2 < 45 || i2 > 58 || i2 == 47) { // 3 (ascii) char name?
                    return findPName(q, 3, firstQuad, qix, quads);
                }
            }
            // Byte following a full quad:
            q = (q << 8) | i2;
            i2 = (int) ((mInputPtr < mInputLen) ? mInputBuffer[mInputPtr++] : loadOne()) & 0xFF;
            if (i2 < 65) {
                if (i2 < 45 || i2 > 58 || i2 == 47) { // 4 (ascii) char name?
                    return findPName(q, 4, firstQuad, qix, quads);
                }
            }
            if (qix == 0) { // not yet, was the first quad
                firstQuad = q;
            } else if (qix == 1) { // second quad, need to init buffer
                quads[0] = firstQuad;
                quads[1] = q;
            } else { // 3rd or after... need to make sure there's room
                if (qix >= quads.length) { // let's just double?
                    mQuadBuffer = quads = DataUtil.growArrayBy(quads, quads.length);
                }
                quads[qix] = q;
            }
            ++qix;
            q = i2;
        }
    }

    /**
     * "New" variant: when at least 8 more input bytes are available, parses
     * the first two quads fully unrolled and without bounds checks; longer
     * names continue in {@link #parsePNameNewLong}. With fewer bytes
     * available, falls back to {@link #parsePName}.
     *
     * @param b First byte of the name (already consumed from input)
     */
    protected ByteBasedPName parsePNameNew(byte b) throws XMLStreamException
    {
        // First: can we optimize out bounds checks?
        if ((mInputLen - mInputPtr) < 8) { // got 1 byte, but need 7, plus one trailing
            return parsePName(b);
        }
        int q1 = b & 0xFF;
        if (q1 < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode
            reportError("; expected a name start character");
        }
        // If so, can also unroll loops nicely
        int i2 = mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            // Ok; "-" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars
            if (i2 < 45 || i2 > 58 || i2 == 47) {
                return findPName(q1, 1);
            }
        }
        q1 = (q1 << 8) | i2;
        i2 = (int) mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name?
                return findPName(q1, 2);
            }
        }
        q1 = (q1 << 8) | i2;
        i2 = (int) mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            if (i2 < 45 || i2 > 58 || i2 == 47) { // 3 (ascii) char name?
                return findPName(q1, 3);
            }
        }
        q1 = (q1 << 8) | i2;
        i2 = (int) mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            if (i2 < 45 || i2 > 58 || i2 == 47) { // 4 (ascii) char name?
                return findPName(q1, 4);
            }
        }

        // Ok, so far so good; one quad, one byte. Then the second
        int q2 = i2;
        i2 = mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            // Ok; "-" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars
            if (i2 < 45 || i2 > 58 || i2 == 47) {
                return findPName(q1, q2, 1);
            }
        }
        q2 = (q2 << 8) | i2;
        i2 = (int) mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) chars in second quad?
                return findPName(q1, q2, 2);
            }
        }
        q2 = (q2 << 8) | i2;
        i2 = (int) mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            if (i2 < 45 || i2 > 58 || i2 == 47) { // 3 (ascii) chars in second quad?
                return findPName(q1, q2, 3);
            }
        }
        q2 = (q2 << 8) | i2;
        i2 = (int) mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            if (i2 < 45 || i2 > 58 || i2 == 47) { // 4 (ascii) chars in second quad?
                return findPName(q1, q2, 4);
            }
        }

        // Ok, no, longer loop. Let's offline
        int[] quads = mQuadBuffer;
        quads[0] = q1;
        quads[1] = q2;
        return parsePNameNewLong(i2, quads);
    }

    /**
     * "New/2" variant: like {@link #parsePNameNew}, but only the first quad
     * is handled inline; the second quad is handled by
     * {@link #parsePNameNewMedium}.
     *
     * @param b First byte of the name (already consumed from input)
     */
    protected ByteBasedPName parsePNameNew2(byte b) throws XMLStreamException
    {
        // First: can we optimize out bounds checks?
        if ((mInputLen - mInputPtr) < 8) { // got 1 byte, but need 7, plus one trailing
            return parsePName(b);
        }
        int q1 = b & 0xFF;
        if (q1 < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode
            reportError("; expected a name start character");
        }
        // If so, can also unroll loops nicely
        int i2 = mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            // Ok; "-" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars
            if (i2 < 45 || i2 > 58 || i2 == 47) {
                return findPName(q1, 1);
            }
        }
        q1 = (q1 << 8) | i2;
        i2 = (int) mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name?
                return findPName(q1, 2);
            }
        }
        q1 = (q1 << 8) | i2;
        i2 = (int) mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            if (i2 < 45 || i2 > 58 || i2 == 47) { // 3 (ascii) char name?
                return findPName(q1, 3);
            }
        }
        q1 = (q1 << 8) | i2;
        i2 = (int) mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            if (i2 < 45 || i2 > 58 || i2 == 47) { // 4 (ascii) char name?
                return findPName(q1, 4);
            }
        }
        // Longer, let's offline:
        return parsePNameNewMedium(i2, q1);
    }

    /**
     * Continues parsing after a full first quad, without bounds checks (the
     * caller has verified that enough input is available).
     *
     * @param i2 Fifth name byte, already read
     * @param q1 Completed first quad
     */
    protected ByteBasedPName parsePNameNewMedium(int i2, int q1) throws XMLStreamException
    {
        // Ok, so far so good; one quad, one byte. Then the second
        int q2 = i2;
        i2 = mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            // Ok; "-" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars
            if (i2 < 45 || i2 > 58 || i2 == 47) {
                return findPName(q1, q2, 1);
            }
        }
        q2 = (q2 << 8) | i2;
        i2 = (int) mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) chars in second quad?
                return findPName(q1, q2, 2);
            }
        }
        q2 = (q2 << 8) | i2;
        i2 = (int) mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            if (i2 < 45 || i2 > 58 || i2 == 47) { // 3 (ascii) chars in second quad?
                return findPName(q1, q2, 3);
            }
        }
        q2 = (q2 << 8) | i2;
        i2 = (int) mInputBuffer[mInputPtr++] & 0xFF;
        if (i2 < 65) {
            if (i2 < 45 || i2 > 58 || i2 == 47) { // 4 (ascii) chars in second quad?
                return findPName(q1, q2, 4);
            }
        }

        // Ok, no, longer loop. Let's offline
        int[] quads = mQuadBuffer;
        quads[0] = q1;
        quads[1] = q2;
        return parsePNameNewLong(i2, quads);
    }

    /**
     * Bounds-checked loop for names longer than two quads.
     *
     * @param q First byte of the third quad, already read
     * @param quads Buffer whose first two entries hold the completed quads
     */
    protected ByteBasedPName parsePNameNewLong(int q, int[] quads) throws XMLStreamException
    {
        int qix = 2;

        while (true) {
            // Second byte of a new quad
            if (mInputPtr >= mInputLen) {
                loadMoreGuaranteed();
            }
            int i2 = mInputBuffer[mInputPtr++] & 0xFF;
            if (i2 < 65) {
                if (i2 < 45 || i2 > 58 || i2 == 47) {
                    // End of name, a single ascii char in this quad?
                    return findPName(q, quads, qix, 1);
                }
            }
            // 3rd byte:
            q = (q << 8) | i2;
            i2 = (int) ((mInputPtr < mInputLen) ? mInputBuffer[mInputPtr++] : loadOne()) & 0xFF;
            if (i2 < 65) {
                if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) chars in this quad?
                    return findPName(q, quads, qix, 2);
                }
            }
            // 4th byte:
            q = (q << 8) | i2;
            i2 = (int) ((mInputPtr < mInputLen) ? mInputBuffer[mInputPtr++] : loadOne()) & 0xFF;
            if (i2 < 65) {
                if (i2 < 45 || i2 > 58 || i2 == 47) { // 3 (ascii) chars in this quad?
                    return findPName(q, quads, qix, 3);
                }
            }
            // Byte following a full quad:
            q = (q << 8) | i2;
            i2 = (int) ((mInputPtr < mInputLen) ? mInputBuffer[mInputPtr++] : loadOne()) & 0xFF;
            if (i2 < 65) {
                if (i2 < 45 || i2 > 58 || i2 == 47) { // 4 (ascii) chars in this quad?
                    return findPName(q, quads, qix, 4);
                }
            }
            if (qix >= quads.length) { // let's just double?
                mQuadBuffer = quads = DataUtil.growArrayBy(quads, quads.length);
            }
            quads[qix] = q;
            ++qix;
            q = i2;
        }
    }

    /**
     * Looks up (adding if missing) a name that fits in a single quad.
     *
     * @param onlyQuad The single, right-aligned quad of the name
     * @param lastByteCount Number of name bytes in that quad (1 - 4)
     */
    private final ByteBasedPName findPName(int onlyQuad, int lastByteCount)
        throws XMLStreamException
    {
        // First, need to push back the byte read but not used:
        --mInputPtr;
        int hash = ByteBasedPNameTable.calcHash(onlyQuad);
        ByteBasedPName name = mSymbols.findSymbol(hash, onlyQuad, 0);
        if (name == null) {
            // Let's simplify things a bit, and just use array based one then:
            mQuadBuffer[0] = onlyQuad;
            name = addPName(hash, mQuadBuffer, 1, lastByteCount);
        }
        return name;
    }

    /**
     * Looks up (adding if missing) a name that consists of exactly two quads.
     *
     * @param firstQuad Full first quad
     * @param secondQuad Right-aligned second quad
     * @param lastByteCount Number of name bytes in the second quad (1 - 4)
     */
    private final ByteBasedPName findPName(int firstQuad, int secondQuad, int lastByteCount)
        throws XMLStreamException
    {
        // First, need to push back the byte read but not used:
        --mInputPtr;
        int hash = ByteBasedPNameTable.calcHash(firstQuad, secondQuad);
        ByteBasedPName name = mSymbols.findSymbol(hash, firstQuad, secondQuad);
        if (name == null) {
            // Let's just use array, then
            mQuadBuffer[0] = firstQuad;
            mQuadBuffer[1] = secondQuad;
            name = addPName(hash, mQuadBuffer, 2, lastByteCount);
        }
        return name;
    }

    /**
     * Looks up (adding if missing) a name of three or more quads.
     *
     * @param lastQuad Right-aligned last quad, not yet stored in {@code quads}
     * @param quads Buffer holding the preceding full quads
     * @param qlen Number of quads already in {@code quads}
     * @param lastByteCount Number of name bytes in {@code lastQuad} (1 - 4)
     */
    private final ByteBasedPName findPName(int lastQuad, int[] quads, int qlen, int lastByteCount)
        throws XMLStreamException
    {
        // First, need to push back the byte read but not used:
        --mInputPtr;
        /* Nope, long (3 quads or more). At this point, the last quad is
         * not yet in the array, let's add:
         */
        if (qlen >= quads.length) { // let's just double?
            mQuadBuffer = quads = DataUtil.growArrayBy(quads, quads.length);
        }
        quads[qlen++] = lastQuad;
        int hash = ByteBasedPNameTable.calcHash(quads, qlen);
        ByteBasedPName name = mSymbols.findSymbol(hash, quads, qlen);
        if (name == null) {
            name = addPName(hash, quads, qlen, lastByteCount);
        }
        return name;
    }

    /**
     * Lookup used by {@link #parsePName}; handles names of any length.
     *
     * @param lastQuad Right-aligned last quad of the name
     * @param lastByteCount Number of name bytes in {@code lastQuad} (1 - 4)
     * @param firstQuad First quad; only meaningful when {@code qlen} is 1
     * @param qlen Number of full quads that precede {@code lastQuad}
     * @param quads Buffer holding the preceding quads when {@code qlen} is
     *   2 or more
     */
    private final ByteBasedPName findPName(int lastQuad, int lastByteCount, int firstQuad,
            int qlen, int[] quads)
        throws XMLStreamException
    {
        // First, need to push back the byte read but not used:
        --mInputPtr;

        // Separate handling for short names:
        if (qlen <= 1) { // short name?
            if (qlen == 0) { // 4-bytes or less; only has 'lastQuad' defined
                int hash = ByteBasedPNameTable.calcHash(lastQuad, 0);
                ByteBasedPName name = mSymbols.findSymbol(hash, lastQuad, 0);
                if (name == null) {
                    // Let's simplify things a bit, and just use array based one then:
                    quads = mQuadBuffer;
                    quads[0] = lastQuad;
                    name = addPName(hash, quads, 1, lastByteCount);
                }
                return name;
            }
            int hash = ByteBasedPNameTable.calcHash(firstQuad, lastQuad);
            ByteBasedPName name = mSymbols.findSymbol(hash, firstQuad, lastQuad);
            if (name == null) {
                // As above, let's just use array, then
                quads = mQuadBuffer;
                quads[0] = firstQuad;
                quads[1] = lastQuad;
                name = addPName(hash, quads, 2, lastByteCount);
            }
            return name;
        }
        /* Nope, long (3 quads or more). At this point, the last quad is
         * not yet in the array, let's add:
         */
        if (qlen >= quads.length) { // let's just double?
            mQuadBuffer = quads = DataUtil.growArrayBy(quads, quads.length);
        }
        quads[qlen++] = lastQuad;
        int hash = ByteBasedPNameTable.calcHash(quads, qlen);
        ByteBasedPName name = mSymbols.findSymbol(hash, quads, qlen);
        if (name == null) {
            name = addPName(hash, quads, qlen, lastByteCount);
        }
        return name;
    }

    /**
     * Decodes the UTF-8 name bytes packed in {@code quads} into characters,
     * validates them as XML 1.0 name characters, and adds the resulting name
     * to the symbol table.
     *<p>
     * The last quad is temporarily shifted to be left-aligned for decoding
     * and restored before the symbol is added.
     *
     * @param hash Hash of the quads, as calculated by the caller
     * @param quads Name bytes, 4 per int; the last quad is right-aligned
     * @param qlen Number of quads in use
     * @param lastQuadBytes Number of name bytes in the last quad (1 - 4)
     */
    protected final ByteBasedPName addPName(int hash, int[] quads, int qlen, int lastQuadBytes)
        throws XMLStreamException
    {
        // 4 bytes per quad, except last one maybe less
        int byteLen = (qlen << 2) - 4 + lastQuadBytes;

        /* And last one is not correctly aligned (leading zero bytes instead
         * need to shift a bit, instead of trailing). Only need to shift it
         * for UTF-8 decoding; need revert for storage (since key will not
         * be aligned, to optimize lookup speed)
         */
        int lastQuad;

        if (lastQuadBytes < 4) {
            lastQuad = quads[qlen-1];
            // 8/16/24 bit left shift
            quads[qlen-1] = (lastQuad << ((4 - lastQuadBytes) << 3));
        } else {
            lastQuad = 0;
        }

        // Let's handle first char separately (different validation):
        int ch = (quads[0] >>> 24);
        boolean ok;
        int ix = 1;
        char[] cbuf = mNameBuffer;
        int cix = 0;
        final int[] TYPES = mCharTypes.NAME_CHARS;

        switch (TYPES[ch]) {
        case XmlCharTypes.CT_NAME_NONE:
        case XmlCharTypes.CT_NAME_COLON: // not ok as first
        case XmlCharTypes.CT_NAME_NONFIRST:
        case InputCharTypes.CT_INPUT_NAME_MB_N:
            ok = false;
            break;
        case XmlCharTypes.CT_NAME_ANY:
            ok = true;
            break;
        default: // multi-byte (UTF-8) chars:
            {
                int needed;

                if ((ch & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                    ch &= 0x1F;
                    needed = 1;
                } else if ((ch & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                    ch &= 0x0F;
                    needed = 2;
                } else if ((ch & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all...
                    ch &= 0x07;
                    needed = 3;
                } else { // 5- and 6-byte chars not valid xml chars
                    reportError(ch);
                    needed = ch = 1; // never really gets this far
                }
                if ((ix + needed) > byteLen) {
                    reportError(ch);
                }
                ix += needed;

                int q = quads[0];
                // Always need at least one more right away:
                int ch2 = (q >> 16) & 0xFF;
                if ((ch2 & 0xC0) != 0x080) {
                    reportError(ch2);
                }
                ch = (ch << 6) | (ch2 & 0x3F);

                /* And then may need more. Note: here we do not do all the
                 * checks that UTF-8 text decoder might do. Reason is that
                 * name validity checking methods handle most of such checks
                 */
                if (needed > 1) {
                    ch2 = (q >> 8) & 0xFF;
                    if ((ch2 & 0xC0) != 0x080) {
                        reportError(ch2);
                    }
                    ch = (ch << 6) | (ch2 & 0x3F);
                    if (needed > 2) { // 4 bytes? (need surrogates on output)
                        ch2 = q & 0xFF;
                        if ((ch2 & 0xC0) != 0x080) {
                            reportError(ch2 & 0xFF);
                        }
                        ch = (ch << 6) | (ch2 & 0x3F);
                    }
                }
                ok = XmlChars.is10NameStartChar(ch);
                if (needed > 2) { // outside of basic 16-bit range? need surrogates
                    /* so, let's first output first char (high surrogate),
                     * let second be output by later code
                     */
                    ch -= 0x10000; // to normalize it starting with 0x0
                    cbuf[cix++] = (char) (0xD800 + (ch >> 10));
                    ch = (0xDC00 | (ch & 0x03FF));
                }
            }
        }
        if (!ok) { // 0 to indicate it's first char, even with surrogates
            reportError(ch);
        }

        cbuf[cix++] = (char) ch; // the only char, or second (low) surrogate

        /* Whoa! Tons of code for just the start char. But now we get to
         * decode the name proper, at last!
         */
        int last_colon = -1;

        for (; ix < byteLen; ) {
            ch = quads[ix >> 2]; // current quad, need to shift+mask
            int byteIx = (ix & 3);
            ch = (ch >> ((3 - byteIx) << 3)) & 0xFF;
            ++ix;

            // Ascii?
            switch (TYPES[ch]) {
            case XmlCharTypes.CT_NAME_NONE:
            case XmlCharTypes.CT_MULTIBYTE_N:
                ok = false;
                break;
            case XmlCharTypes.CT_NAME_COLON: // at most one colon allowed
                if (last_colon >= 0) {
                    reportError(0);
                }
                last_colon = cix;
                ok = true;
                break;
            case XmlCharTypes.CT_NAME_NONFIRST:
            case XmlCharTypes.CT_NAME_ANY:
                ok = true;
                break;
            default:
                {
                    int needed;
                    if ((ch & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                        ch &= 0x1F;
                        needed = 1;
                    } else if ((ch & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                        ch &= 0x0F;
                        needed = 2;
                    } else if ((ch & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all...
                        ch &= 0x07;
                        needed = 3;
                    } else { // 5- and 6-byte chars not valid xml chars
                        reportError(ch);
                        needed = ch = 1; // never really gets this far
                    }
                    if ((ix + needed) > byteLen) {
                        reportError(cix);
                    }

                    // Ok, always need at least one more:
                    int ch2 = quads[ix >> 2]; // current quad, need to shift+mask
                    byteIx = (ix & 3);
                    ch2 = (ch2 >> ((3 - byteIx) << 3));
                    ++ix;

                    if ((ch2 & 0xC0) != 0x080) {
                        reportError(ch2);
                    }
                    ch = (ch << 6) | (ch2 & 0x3F);

                    // Once again, some of validation deferred to name char validator
                    if (needed > 1) {
                        ch2 = quads[ix >> 2];
                        byteIx = (ix & 3);
                        ch2 = (ch2 >> ((3 - byteIx) << 3));
                        ++ix;

                        if ((ch2 & 0xC0) != 0x080) {
                            reportError(ch2);
                        }
                        ch = (ch << 6) | (ch2 & 0x3F);
                        if (needed > 2) { // 4 bytes? (need surrogates on output)
                            ch2 = quads[ix >> 2];
                            byteIx = (ix & 3);
                            ch2 = (ch2 >> ((3 - byteIx) << 3));
                            ++ix;
                            if ((ch2 & 0xC0) != 0x080) {
                                reportError(ch2 & 0xFF);
                            }
                            ch = (ch << 6) | (ch2 & 0x3F);
                        }
                    }
                    ok = XmlChars.is10NameChar(ch);
                    if (needed > 2) { // surrogate pair? once again, let's output one here, one later on
                        ch -= 0x10000; // to normalize it starting with 0x0
                        if (cix >= cbuf.length) {
                            mNameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
                        }
                        cbuf[cix++] = (char) (0xD800 + (ch >> 10));
                        ch = 0xDC00 | (ch & 0x03FF);
                    }
                }
            }
            if (!ok) {
                reportError(cix);
            }
            if (cix >= cbuf.length) {
                mNameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
            }
            cbuf[cix++] = (char) ch;
        }

        /* Ok. Now we have the character array, and can construct the
         * String (as well as check proper composition of colons
         * for ns-aware mode...)
         */
        String baseName = new String(cbuf, 0, cix);
        // And finally, unalign if necessary
        if (lastQuadBytes < 4) {
            quads[qlen-1] = lastQuad;
        }
        return mSymbols.addSymbol(hash, baseName, last_colon, quads, qlen);
    }

    /** Stand-in for buffer reloading: the whole input is in memory, so always fails. */
    private void loadMoreGuaranteed()
    {
        throw new IllegalStateException();
    }

    /** Stand-in for single-byte reloading: the whole input is in memory, so always fails. */
    private int loadOne()
    {
        throw new IllegalStateException();
    }

    /** Reports a decoding/validation problem; the argument is currently not included. */
    private void reportError(int arg)
    {
        throw new IllegalStateException();
    }

    private void reportError(String msg)
    {
        throw new IllegalStateException(msg);
    }

    /**
     * Reads the complete contents of a file into memory.
     *
     * @param f File to read
     * @return File contents; exactly {@code f.length()} bytes
     * @throws IOException If the file is too large for a single array, ends
     *   before the expected number of bytes has been read, or can not be read
     */
    private static byte[] readData(File f) throws IOException
    {
        long fileLen = f.length();
        if (fileLen > Integer.MAX_VALUE) {
            throw new IOException("File too large ("+fileLen+" bytes): "+f);
        }
        final int len = (int) fileLen;
        byte[] data = new byte[len];
        FileInputStream fis = new FileInputStream(f);
        try {
            int offset = 0;
            // read() may return fewer bytes than requested; loop until full
            while (offset < len) {
                int count = fis.read(data, offset, len - offset);
                if (count < 0) {
                    throw new EOFException("Unexpected end of file "+f+": got "
                            +offset+" bytes, expected "+len);
                }
                offset += count;
            }
        } finally {
            fis.close();
        }
        return data;
    }

    /**
     * Entry point; takes the input file name as the only argument. Inputs
     * shorter than 10 million bytes are scanned multiple times per round so
     * that each round covers roughly that many bytes.
     */
    public static void main(String[] args) throws Exception
    {
        if (args.length != 1) {
            System.err.println("Usage: java ... [input file]");
            System.exit(1);
        }
        byte[] data = readData(new File(args[0]));
        int len = data.length;
        if (len == 0) { // would otherwise divide by zero below
            System.err.println("Input file is empty: "+args[0]);
            System.exit(1);
        }
        int repCount = 1;
        int THRESHOLD = 10 * 1000 * 1000;
        if (len < THRESHOLD) {
            repCount = (THRESHOLD / len);
        }
        System.out.println("Ok, read in test data, "+len+" bytes; using "+repCount+" repetitions");
        new TestPNamePerf(data, repCount).test();
    }
}