package test; import java.io.*; public final class TestScannerPerf { final static int INT_AMP = '&'; final static int INT_LT = '<'; final static int INT_RBRACKET = ']'; final static int INT_SPACE = ' '; final static int INT_TAB = '\t'; final static int INT_CR = '\r'; final static int INT_LF = '\n'; final static byte BYTE_LF = (byte) '\n'; final static byte BYTE_NULL = (byte)0; final int mRepCount; int mTmpChar = 0; final byte[] mData; final byte[] mInputBuffer = new byte[4000]; final char[] mOutputBuffer = new char[2000]; final static int MB_CODE_BASE = 5; final static int[] CHAR_TYPES = new int[256]; static { int code; for (int i = 128; i < 256; ++i) { int c = i; if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) code = MB_CODE_BASE + 1; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) code = MB_CODE_BASE + 2; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... code = MB_CODE_BASE + 3; } else { code = 1; } CHAR_TYPES[c] = code; } for (int i = 0; i < 32; ++i) { CHAR_TYPES[i] = 1; // invalid white space } CHAR_TYPES['\r'] = 2; CHAR_TYPES['\n'] = 2; CHAR_TYPES['\t'] = 0; // no processing needed CHAR_TYPES['<'] = 3; CHAR_TYPES['&'] = 4; CHAR_TYPES[']'] = 5; } InputStream mIn; int mLineNr; int mByteCount; int mTagCount; int mEntityCount; int mBracketCount; int mInputPtr; int mInputLen; int mTmpType = 0; public TestScannerPerf(byte[] data, int repCount) { mData = data; mRepCount = repCount; } public void test() throws IOException { int round = 0; mIn = new ByteArrayInputStream(mData); for (; true; ++round) { long now = System.currentTimeMillis(); String msg = "[null]"; int total = 0; final int TYPES = 3; if ((round % TYPES) == 0) { System.out.println(); } for (int i = 0; i < mRepCount; ++i) { mIn.reset(); mLineNr = 0; mTagCount = 0; mByteCount = 0; switch (round % TYPES) { case 0: msg = "[Scanner-code]"; total += testScannerCode(); break; case 1: msg = "[Scanner-int-arr]"; total += testScannerInts(); break; case 2: msg = "[Scanner-int-arr2]"; total += testScannerInts2(); break; default: throw new Error("Unexpected round, #"+i); } } now = System.currentTimeMillis() - now; System.out.println(msg+" -> "+now+" msecs (total "+total +", byte count 0x"+Integer.toHexString(mByteCount)+")"); try { Thread.sleep(200L); } catch (Exception e) { } System.gc(); try { Thread.sleep(200L); } catch (Exception e) { } } } private int testScannerCode() throws IOException { final char[] outBuf = mOutputBuffer; int outPtr = 0; int c = 0; mInputLen = 0; mInputPtr = 0; main_loop: while (true) { // Next thing: let's get the first byte: int ptr = mInputPtr; ascii_loop: while (true) { if (ptr >= mInputLen) { if (!loadMoreBytes()) { break main_loop; } ptr = mInputPtr; } c = (int) mInputBuffer[ptr++]; if (c <= INT_RBRACKET) { //if (c <= INT_LT) { if (c < 0) { break ascii_loop; } if (c < INT_SPACE) { if (c == INT_CR) { ++mLineNr; } else if (c == INT_LF) { ++mLineNr; } else if (c != INT_TAB) { throw new Error(); } } else if (c == INT_LT) { ++mTagCount; } else if (c == INT_AMP) { ++mEntityCount; } else if (c == INT_RBRACKET) { ++mBracketCount; } } // !!! TODO: xml1.1, 0x7F? if (outPtr >= outBuf.length) { outPtr = 0; } outBuf[outPtr++] = (char) c; } c = decodeMultiByteChar(c, ptr); if (c < 0) { // surrogate pair if (outPtr >= outBuf.length) { outPtr = 0; } c = -c; // Let's add first part right away: outBuf[outPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); // And let the other char output in general loop } if (outPtr >= outBuf.length) { outPtr = 0; } outBuf[outPtr++] = (char) c; } return mByteCount; } private int testScannerInts() throws IOException { int outPtr = 0; int c = 0; final int[] TYPES = CHAR_TYPES; final byte[] inputBuffer = mInputBuffer; final char[] outputBuffer = mOutputBuffer; mInputLen = 0; mInputPtr = 0; main_loop: while (true) { // Next thing: let's get the first byte: int ptr = mInputPtr; ascii_loop: while (true) { if (ptr >= mInputLen) { if (!loadMoreBytes()) { break main_loop; } ptr = mInputPtr; } c = (int) inputBuffer[ptr++] & 0xFF; int type = TYPES[c]; if (type != 0) { switch (type) { case 1: throw new Error("Invalid white space"); case 2: if (c == INT_CR) { ++mLineNr; } else if (c == INT_LF) { ++mLineNr; } break; case 3: ++mTagCount; break; case 4: ++mEntityCount; break; case 5: ++mBracketCount; break; case 6: // 2 bytes case 7: // 3 bytes case 8: // 4 bytes break ascii_loop; default: throw new Error(); } } if (outPtr >= outputBuffer.length) { outPtr = 0; } outputBuffer[outPtr++] = (char) c; } c = decodeMultiByteChar(c, ptr); if (c < 0) { // surrogate pair if (outPtr >= outputBuffer.length) { outPtr = 0; } c = -c; // Let's add first part right away: outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); // And let the other char output in general loop } if (outPtr >= outputBuffer.length) { outPtr = 0; } outputBuffer[outPtr++] = (char) c; } return mByteCount; } private int testScannerInts2() throws IOException { int outPtr = 0; int c = 0; final int[] TYPES = CHAR_TYPES; final byte[] inputBuffer = mInputBuffer; char[] outputBuffer = mOutputBuffer; mInputLen = 0; mInputPtr = 0; main_loop: while (true) { // Next thing: let's get the first byte: ascii_loop: while (true) { int ptr = mInputPtr; if (ptr >= mInputLen) { if (!loadMoreBytes()) { break main_loop; } ptr = mInputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = mOutputBuffer; outPtr = 0; } int max = mInputLen; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = (int) inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { mInputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = (char) c; } mInputPtr = ptr; } switch (TYPES[c]) { case 1: throw new Error("Invalid white space"); case 2: if (c == INT_CR) { ++mLineNr; } else if (c == INT_LF) { ++mLineNr; } break; case 3: ++mTagCount; break; case 4: // should expand entity ++mEntityCount; break; case 5: ++mBracketCount; break; case 6: // 2 bytes c = decodeMultiByteChar(c, mInputPtr); break; case 7: // 3 bytes c = decodeMultiByteChar(c, mInputPtr); break; case 8: // 4 bytes { c = decodeMultiByteChar(c, mInputPtr); if (outPtr >= outputBuffer.length) { outputBuffer = mOutputBuffer; outPtr = 0; } outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); } break; default: throw new Error(); } if (outPtr >= outputBuffer.length) { outputBuffer = mOutputBuffer; outPtr = 0; } outputBuffer[outPtr++] = (char) c; } return mByteCount; } /* private final int decode(int ptr, int c, int type) throws IOException { switch (type) { case 1: throw new Error("Invalid white space"); case 2: if (c == INT_CR) { ++mLineNr; } else if (c == INT_LF) { ++mLineNr; } break; case 3: ++mTagCount; break; case 4: // should expand entity ++mEntityCount; break; case 5: ++mBracketCount; break; case 6: // 2 bytes case 7: // 3 bytes case 8: // 4 bytes c = decodeMultiByteChar(c, ptr); break; default: throw new Error(); } mInputPtr = ptr; return c; } */ private final boolean loadMoreBytes() throws IOException { mByteCount += mInputLen; mInputPtr = 0; int count = mIn.read(mInputBuffer); if (count < 0) { mInputLen = 0; return false; } mInputLen = count; return true; } private final void loadMoreBytesGuaranteed() throws IOException { if (!loadMoreBytes()) { throw new Error(); } } /* private final void markLF() { ++mLineNr; } private final void markLF(int pos) { ++mLineNr; } private final int handleEntityInText() { ++mEntityCount; return '&'; } */ private final int decodeMultiByteChar(int c, int ptr) throws IOException { int needed; if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) c &= 0x1F; needed = 1; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) c &= 0x0F; needed = 2; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... c &= 0x07; needed = 3; } else { throw new Error("Unexpected multi-byte first byte 0x"+Integer.toHexString(c)); } if (ptr >= mInputLen) { // 2nd byte loadMoreBytesGuaranteed(); ptr = mInputPtr; } int d = (int) mInputBuffer[ptr++]; if ((d & 0xC0) != 0x080) { throw new Error(); } c = (c << 6) | (d & 0x3F); if (needed > 1) { // needed == 1 means 2 bytes total if (ptr >= mInputLen) { loadMoreBytesGuaranteed(); ptr = mInputPtr; } d = (int) mInputBuffer[ptr++]; if ((d & 0xC0) != 0x080) { throw new Error(); } c = (c << 6) | (d & 0x3F); if (needed > 2) { // 4 bytes? (need surrogates) if (ptr >= mInputLen) { loadMoreBytesGuaranteed(); ptr = mInputPtr; } d = (int) mInputBuffer[ptr++]; if ((d & 0xC0) != 0x080) { throw new Error(); } c = (c << 6) | (d & 0x3F); /* Need to signal such pair differently (to make comparison * easier) */ return -c; } } mInputPtr = ptr; return c; } /* private final int decodeMultiByteChar(int c, int type, int ptr) throws IOException { // let's see how many add'l bytes are needed type -= MB_CODE_BASE; c &= (0x3F >> type); // 1f/0f/07 (for 2/3/4 bytes) if (ptr >= mInputEnd) { // 2nd byte loadMoreBytesGuaranteed(); ptr = mInputPtr; } int d = (int) mInputBuffer[ptr++]; if ((d & 0xC0) != 0x080) { throw new Error(); } c = (c << 6) | (d & 0x3F); if (type > 1) { // needed == 1 means 2 bytes total if (ptr >= mInputEnd) { loadMoreBytesGuaranteed(); ptr = mInputPtr; } d = (int) mInputBuffer[ptr++]; if ((d & 0xC0) != 0x080) { throw new Error(); } c = (c << 6) | (d & 0x3F); if (type > 2) { // 4 bytes? (need surrogates) if (ptr >= mInputEnd) { loadMoreBytesGuaranteed(); ptr = mInputPtr; } d = (int) mInputBuffer[ptr++]; if ((d & 0xC0) != 0x080) { throw new Error(); } c = (c << 6) | (d & 0x3F); // Need to signal such pair differently (to make comparison // easier) return -c; } } mInputPtr = ptr; return c; } */ private static byte[] readData(File f) throws IOException { int len = (int) f.length(); byte[] data = new byte[len]; int offset = 0; FileInputStream fis = new FileInputStream(f); while (len > 0) { int count = fis.read(data, offset, len-offset); offset += count; len -= count; } fis.close(); return data; } public static void main(String[] args) throws IOException { if (args.length != 1) { System.err.println("Usage: java ... [input file]"); System.exit(1); } byte[] data = readData(new File(args[0])); int len = data.length; int repCount = 1; int THRESHOLD = 10 * 1000 * 1000; if (len < THRESHOLD) { repCount = (THRESHOLD / len); } //if (repCount > 2) { repCount /= 2; } System.out.println("Ok, read in test data, "+len+" bytes; using "+repCount+" repetitions"); new TestScannerPerf(data, repCount).test(); } }