package com.fulmicoton.multiregexp;

import org.junit.Assert;
import org.junit.Test;

import java.util.Iterator;

public class TokenizerTest {

    private enum TOKEN {
        WHITESPACE,
        WORD,
        PUNCTUATION
    }

    private enum TEST {
        A,
        ABC,
        BCD,
        ABD,
        D
    }

    @Test
    public void testSimpleLexer() {
        final Lexer<TOKEN> lexer = new Lexer<>();
        lexer
            .addRule(TOKEN.WHITESPACE, " ")
            .addRule(TOKEN.WORD, "[a-zA-Z]+")
            .addRule(TOKEN.PUNCTUATION, "[,\\.\\!\\?]");
        final String txt = "Bonjour herve!";
        final Iterator<Token<TOKEN>> tokenIt = lexer.scan(txt).iterator();
        Assert.assertTrue(tokenIt.hasNext());
        Assert.assertEquals(new Token<>(TOKEN.WORD, "Bonjour"), tokenIt.next());
        Assert.assertTrue(tokenIt.hasNext());
        Assert.assertEquals(new Token<>(TOKEN.WHITESPACE, " "), tokenIt.next());
        Assert.assertTrue(tokenIt.hasNext());
        Assert.assertEquals(new Token<>(TOKEN.WORD, "herve"), tokenIt.next());
        Assert.assertTrue(tokenIt.hasNext());
        Assert.assertEquals(new Token<>(TOKEN.PUNCTUATION, "!"), tokenIt.next());
        // Once the input is exhausted, the iterator reports no further token
        // and next() yields null rather than throwing.
        Assert.assertFalse(tokenIt.hasNext());
        Assert.assertNull(tokenIt.next());
    }

    @Test
    public void testPriority() {
        final Lexer<TEST> lexer = new Lexer<>();
        lexer
            .addRule(TEST.ABC, "abc")
            .addRule(TEST.A, "a")
            .addRule(TEST.ABD, "abd")
            .addRule(TEST.D, "b?d")
            .addRule(TEST.BCD, "bcd");
        {
            // Rule order, not match length, decides priority: A was added
            // before ABD, so "abd" splits into "a" + "bd" rather than
            // matching ABD as a whole.
            final String txt = "abd";
            final Iterator<Token<TEST>> tokenIt = lexer.scan(txt).iterator();
            Assert.assertTrue(tokenIt.hasNext());
            Assert.assertEquals(new Token<>(TEST.A, "a"), tokenIt.next());
            Assert.assertTrue(tokenIt.hasNext());
            Assert.assertEquals(new Token<>(TEST.D, "bd"), tokenIt.next());
            Assert.assertFalse(tokenIt.hasNext());
            Assert.assertNull(tokenIt.next());
        }
        {
            // ABC was added before A, so it wins at offset 0 and "abcd"
            // splits into "abc" + "d".
            final String txt = "abcd";
            final Iterator<Token<TEST>> tokenIt = lexer.scan(txt).iterator();
            Assert.assertTrue(tokenIt.hasNext());
            Assert.assertEquals(new Token<>(TEST.ABC, "abc"), tokenIt.next());
            Assert.assertTrue(tokenIt.hasNext());
            Assert.assertEquals(new Token<>(TEST.D, "d"), tokenIt.next());
            Assert.assertFalse(tokenIt.hasNext());
            Assert.assertNull(tokenIt.next());
        }
        {
            final String txt = "abce";
            try {
                final Iterator<Token<TEST>> tokenIt = lexer.scan(txt).iterator();
                Assert.assertTrue(tokenIt.hasNext());
                Assert.assertEquals(new Token<>(TEST.ABC, "abc"), tokenIt.next());
                // No rule matches "e" at offset 3, so resuming the scan must
                // throw; fail the test if it does not.
                tokenIt.hasNext();
                tokenIt.next();
                Assert.fail("Expected a RuntimeException wrapping a ScanException");
            } catch (RuntimeException e) {
                final ScanException typedError = (ScanException) e.getCause();
                Assert.assertEquals(3, typedError.getOffset());
                Assert.assertEquals("Could not find any token at (3):\"abc|e\"", typedError.getMessage());
            }
        }
    }
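    // Added sketch, not part of the original suite: it restates the
    // rule-order priority behavior exercised by testPriority in isolation,
    // assuming only the API used above (addRule, scan, Token equality) and
    // the same semantics (the earliest-added matching rule wins even when a
    // later rule matches a longer prefix). The method name, grammar, and
    // reuse of the enum constants as mere labels are all hypothetical.
    @Test
    public void testEarlierRuleBeatsLongerMatch() {
        final Lexer<TEST> lexer = new Lexer<>();
        lexer
            .addRule(TEST.A, "a")     // added first: should win at every offset
            .addRule(TEST.ABC, "a+"); // would match all of "aaa", but was added later
        final Iterator<Token<TEST>> tokenIt = lexer.scan("aaa").iterator();
        // Expecting three single-character A tokens rather than one "a+" token.
        for (int i = 0; i < 3; i++) {
            Assert.assertTrue(tokenIt.hasNext());
            Assert.assertEquals(new Token<>(TEST.A, "a"), tokenIt.next());
        }
        Assert.assertFalse(tokenIt.hasNext());
    }
}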