/** * Copyright 2014, Emory University * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.emory.clir.clearnlp.tokenization; import static org.junit.Assert.assertEquals; import org.junit.Test; /** * @since 3.0.0 * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) */ public class EnglishTokenizerTest { @Test public void testPeriods() { AbstractTokenizer t = new EnglishTokenizer(); String s, r; s = "500 million of 1986.[11]"; r = "[500, million, of, 1986, ., [, 11, ]]"; assertEquals(r, t.tokenize(s).toString()); s = "injury-related deaths worldwide.[6]"; r = "[injury, -, related, deaths, worldwide, ., [, 6, ]]"; assertEquals(r, t.tokenize(s).toString()); } @Test public void test() { AbstractTokenizer t = new EnglishTokenizer(); String s, r; // white-spaces s = " \n\t\n\r\f"; r = "[]"; assertEquals(r, t.tokenize(s).toString()); s = " A B C\n D \t\nE\r\f "; r = "[A, B, C, D, E]"; assertEquals(r, t.tokenize(s).toString()); // hyperlinks s = "|http://www.clearnlp.com|www.clearnlp.com|mailto:support@clearnlp.com|jinho_choi@clearnlp.com|"; r = "[|, http://www.clearnlp.com, |, www.clearnlp.com, |, mailto:support@clearnlp.com, |, jinho_choi@clearnlp.com, |]"; assertEquals(r, t.tokenize(s).toString()); // emoticons s = ":-))) :---( Hi:)."; r = "[:-))), :---(, Hi, :), .]"; assertEquals(r, t.tokenize(s).toString()); // surrounding symbols s = "---\"((``@#$Choi%&*''))\".?!==="; r = "[---, \", ((, ``, @#$, Choi, %&*, '', )), \", .?!, ===]"; assertEquals(r, t.tokenize(s).toString()); // in-between symbols s = ",,A---C**D~~~~E==F,G,,H..I.J-1.--2-K||L-#3"; r = "[,,, A, ---, C**D, ~~~~, E, ==, F, ,, G, ,,, H, .., I.J-1., --, 2, -, K, ||, L, -, #, 3]"; assertEquals(r, t.tokenize(s).toString()); // brackets s = "(1){2}[3]<4>"; r = "[(1), {, 2, }, [, 3, ], <, 4, >]"; assertEquals(r, t.tokenize(s).toString()); // twitter tags s = "@UserID #HashTag"; r = "[@UserID, #HashTag]"; assertEquals(r, t.tokenize(s).toString()); // abbreviations s = "Dr. ph.d. w.r.t. 1.2. A-1. a.1 (e.g., bcd. BCD. and. T. T.. T."; r = "[Dr., ph.d., w.r.t., 1.2., A-1., a.1, (, e.g., ,, bcd., BCD., and, ., T., T, .., T.]"; assertEquals(r, t.tokenize(s).toString()); // symbols in numbers s = ".1,-2.3,+4,567,8:9\"0\" -1+2=1 +82-2-000-0000 12/25/2014"; r = "[.1, ,, -2.3, ,, +4,567, ,, 8:9, \", 0, \", -1, +2, =, 1, +82-2-000-0000, 12/25/2014]"; assertEquals(r, t.tokenize(s).toString()); // currency s = "$1 E2 L3 USD1 2KPW $1 USD1 us$ US$ ub$"; r = "[$, 1, E2, L3, USD, 1, 2, KPW, $, 1, USD, 1, us$, US$, ub, $]"; assertEquals(r, t.tokenize(s).toString()); // unit s = "1m 2mm 3kg 4oz 1D 2nM 3CM 4LB"; r = "[1, m, 2, mm, 3, kg, 4, oz, 1, D, 2, nM, 3, CM, 4, LB]"; assertEquals(r, t.tokenize(s).toString()); // apostrophe s = "he's we'd I'm you'll they're I've didn't did'nt he'S DON'T gue'ss he'mm 90's"; r = "[he, 's, we, 'd, I, 'm, you, 'll, they, 're, I, 've, did, n't, did, 'nt, he, 'S, DO, N'T, gue'ss, he'mm, 90's]"; assertEquals(r, t.tokenize(s).toString()); // compounds s = "aint cannot don'cha d'ye i'mma dunno lemme LEMME"; r = "[ai, nt, can, not, do, n', cha, d', ye, i, 'm, ma, du, n, no, lem, me, LEM, ME]"; assertEquals(r, t.tokenize(s).toString()); // hyphens s = "dis-able cross-validation o-kay art-o-torium s-e-e art-work DIS-ABLE CROSS-VALIDATION"; r = "[dis-able, cross-validation, o-kay, art-o-torium, s-e-e, art, -, work, DIS-ABLE, CROSS-VALIDATION]"; assertEquals(r, t.tokenize(s).toString()); // years s = "'90 '90s '90's '100's"; r = "['90, '90s, '90's, ', 100's]"; assertEquals(r, t.tokenize(s).toString()); // ampersand s = "AT&T at&t A&1"; r = "[AT&T, at&t, A, &, 1]"; assertEquals(r, t.tokenize(s).toString()); // no. s = "No. 5 No."; r = "[No., 5, No, .]"; assertEquals(r, t.tokenize(s).toString()); // more examples s = "\"John & Mary's dog,\" Jane thought (to herself).\n" + "\"What a #$%!\n" + "a- ``I like AT&T''.\""; r = "[\", John, &, Mary, 's, dog, ,, \", Jane, thought, (, to, herself, ), ., \", What, a, #$%, !, a, -, ``, I, like, AT&T, '', ., \"]"; assertEquals(r, t.tokenize(s).toString()); s = "I said at 4:45pm."; r = "[I, said, at, 4:45, pm, .]"; assertEquals(r, t.tokenize(s).toString()); s = "I can't believe they wanna keep 40% of that. ``Whatcha think?'' \"I don't --- think so...,\""; r = "[I, ca, n't, believe, they, wan, na, keep, 40, %, of, that, ., ``, What, cha, think, ?, '', \", I, do, n't, ---, think, so, ..., ,, \"]"; assertEquals(r, t.tokenize(s).toString()); s = "You `paid' US$170,000?!\nYou should've paid only $16.75."; r = "[You, `, paid, ', US$, 170,000, ?!, You, should, 've, paid, only, $, 16.75, .]"; assertEquals(r, t.tokenize(s).toString()); s = " 1. Buy a new Chevrolet (37%-owned in the U.S.) . 15%"; r = "[1., Buy, a, new, Chevrolet, (, 37, %, -, owned, in, the, U.S., ), ., 15, %]"; assertEquals(r, t.tokenize(s).toString()); } }