import org.junit.After; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.nio.CharBuffer; import java.util.regex.Matcher; import java.util.regex.Pattern; public class RegexTestHarness { ///folio/FolioTokenReader.java public static String CommentRegex = "<CM>(.*?)</CM>"; //Try putting the ^ back in... And calling .region(index, end) //instead if passing in index to the find() method. Leave the find() method blank, but re-create the matchers each time... public static String RegexPrefix = "^"; /** * Matches a comment tag and any intermediate comments. Lazy, of course. */ private static Pattern rComment = Pattern.compile(RegexPrefix + CommentRegex, Pattern.DOTALL | Pattern.CASE_INSENSITIVE); public static String TextRegex = "(?:[^<]+|<[^A-Za-z/])+"; /** * Matches text that doesn't contain any open brackets that are directly * followed by a letter or a closing slash. */ private static Pattern rText = Pattern.compile(RegexPrefix + TextRegex); // non <, // expect // doubles /** * Matches any two-letter tag (and +/-), and captures (optional) options. * group 1 and 2, respectively. Tag options must have matching quote pairs, * (single quotes are encoded like ""). Opening brackets can be entered by * entering two. Opening and closing brackets can be used literally as long * as they exist in pairs, are not nested, and don't contain quotes. Opening * and closing brackets can be used arbitrarily within a quoted string. */ public static String TagRegex = "<(/)?([A-Z-a-z][A-Za-z][\\+\\-]?)(?:\\s*[:,;]+\\s*((?:[^><\"]+|<<|\"(?:[^\"]|(?:\"\"))*\"|<[^<>\"]*>)+?))?>"; private static Pattern rTag = Pattern.compile(RegexPrefix + TagRegex); /** * An array of the patterns we look for, in the correct order. */ private static Pattern[] tokenPatterns = new Pattern[]{rText, rComment, rTag}; // rComment should come before rTag, since rTag matches // opening comment tags. protected Pattern[] getTokenPatterns() { return tokenPatterns; } @Test public void testCharBuffer() throws Exception { testRegex(cbData, 1); } @Test public void testStringBuffer() throws Exception { testRegex(sbData, 1); } @Test public void testStringBuilder() throws Exception { testRegex(sbuData, 1); } @AfterClass public static void tearDownAfterClass() throws Exception { } private static void testRegex(CharSequence text, int iterations) { // start time int index = 0; long start = System.nanoTime(); Matcher[] matchers = new Matcher[tokenPatterns.length]; for (int i = 0; i < tokenPatterns.length; i++) { matchers[i] = tokenPatterns[i].matcher(text); } for (int i = 0; i < iterations; i++) { index = 0;// reset window boolean found = false; do { found = false; for (int j = 0; j < matchers.length; j++) { Matcher m = matchers[j]; m.reset(); m = m.region(index, text.length()); if (m.find()) { index = m.end(); found = true; //System.out.println(m.group()); break; } } } while (found); if (index != text.length()) { if (index < 0) { throw new Error(); } CharSequence next = null; if (index + 51 > text.length()) next = text.subSequence(index, text.length()); else next = text.subSequence(index, index + 50); // throw new Error("Failed to parse token at " + next); } } // end time long end = System.nanoTime(); System.err.println(end - start + "| Time Elapsed in milliseconds for " + text.getClass().getSimpleName() + " implementation"); } private static CharBuffer cbData; private static StringBuffer sbData; private static StringBuilder sbuData; @BeforeClass public static void setUp() throws Exception { // Here's the actual code used to perform the matching at a certain // character index. // filter(pFilter, filename, printOutput); cbData = useCharBuffer(); sbData = useStringBuffer(); sbuData = useStringBuilder(); } @After public void tearDown() throws Exception { } private static CharBuffer useCharBuffer() throws IOException { BufferedReader reader = new BufferedReader(new FileReader( folioxml.config.TestConfig.getFolioHlp().getFlatFilePath()); char[] whole = new char[0]; //bug if file < 8192 chars char[] chars = new char[32000]; int numRead = 0; while ((numRead = reader.read(chars)) > -1) { char[] newWhole = new char[whole.length + numRead]; System.arraycopy(whole, 0, newWhole, 0, whole.length); System.arraycopy(chars, 0, newWhole, whole.length, numRead); whole = newWhole; } reader.close(); return CharBuffer.wrap(whole); } private static StringBuffer useStringBuffer() throws IOException { BufferedReader reader = new BufferedReader(new FileReader( folioxml.config.TestConfig.getFolioHlp().getFlatFilePath()); StringBuffer sb = new StringBuffer(8192); char[] chars = new char[8192]; int numRead = 0; while ((numRead = reader.read(chars)) > -1) { sb.append(String.valueOf(chars, 0, numRead)); } reader.close(); return sb; } private static StringBuilder useStringBuilder() throws IOException { BufferedReader reader = new BufferedReader(new FileReader( folioxml.config.TestConfig.getFolioHlp().getFlatFilePath()); StringBuilder sb = new StringBuilder(8192); char[] chars = new char[8192]; int numRead = 0; while ((numRead = reader.read(chars)) > -1) { sb.append(String.valueOf(chars, 0, numRead)); } reader.close(); return sb; } }