/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.text.linguistic; import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer; import org.carrot2.text.analysis.ITokenizer; import org.junit.Test; /** * Test {@link ExtendedWhitespaceTokenizer}. */ public class ExtendedWhitespaceTokenizerTest extends TokenizerTestBase { @Override protected ITokenizer createTokenStream() { return new ExtendedWhitespaceTokenizer(); } @Test public void testTermTokens() { String test = " simple simple's simples` terms simpleterm 9numterm numerm99x \"quoted string\""; TokenImage [] tokens = { new TokenImage("simple", ITokenizer.TT_TERM), new TokenImage("simple's", ITokenizer.TT_TERM), new TokenImage("simples`", ITokenizer.TT_TERM), new TokenImage("terms", ITokenizer.TT_TERM), new TokenImage("simpleterm", ITokenizer.TT_TERM), new TokenImage("9numterm", ITokenizer.TT_TERM), new TokenImage("numerm99x", ITokenizer.TT_TERM), new TokenImage("quoted", ITokenizer.TT_TERM), new TokenImage("string", ITokenizer.TT_TERM) }; assertEqualTokens(test, tokens); } @Test public void testSymbolTokens() { String test = " ... S_NI_P token"; TokenImage [] tokens = { new TokenImage("...", ITokenizer.TT_PUNCTUATION | ITokenizer.TF_SEPARATOR_SENTENCE), new TokenImage("S_NI_P", ITokenizer.TT_FILE), new TokenImage("token", ITokenizer.TT_TERM) }; assertEqualTokens(test, tokens); } @Test public void testEmailTokens() { String test = "e-mails dweiss@go2.pl dawid.weiss@go2.com.pl bubu@some-host.com me@me.org bubu99@yahoo.com"; TokenImage [] tokens = { new TokenImage("e-mails", ITokenizer.TT_HYPHTERM), new TokenImage("dweiss@go2.pl", ITokenizer.TT_EMAIL), new TokenImage("dawid.weiss@go2.com.pl", ITokenizer.TT_EMAIL), new TokenImage("bubu@some-host.com", ITokenizer.TT_EMAIL), new TokenImage("me@me.org", ITokenizer.TT_EMAIL), new TokenImage("bubu99@yahoo.com", ITokenizer.TT_EMAIL) }; assertEqualTokens(test, tokens); } @Test public void testUrlTokens() { final String allCharsUrl = "http://url.with.all.allowed.characters/abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/!*'();:@&=+$,/?%#[]-_.~"; String test = " urls http://www.google.com http://www.cs.put.poznan.pl/index.jsp?query=term&query2=term " + " ftp://ftp.server.pl www.google.com not.an.url go2.pl/mail http://www.digimine.com/usama/datamine/." + " http://www.herold.at/gelbe-seiten/krems-an-der-donau/lDk8q/yoga-krems-wachau-j%C3%BCrgen-ullrich/" + " " + allCharsUrl; TokenImage [] tokens = { new TokenImage("urls", ITokenizer.TT_TERM), new TokenImage("http://www.google.com", ITokenizer.TT_FULL_URL), new TokenImage( "http://www.cs.put.poznan.pl/index.jsp?query=term&query2=term", ITokenizer.TT_FULL_URL), new TokenImage("ftp://ftp.server.pl", ITokenizer.TT_FULL_URL), new TokenImage("www.google.com", ITokenizer.TT_BARE_URL), new TokenImage("not.an.url", ITokenizer.TT_FILE), new TokenImage("go2.pl/mail", ITokenizer.TT_FULL_URL), new TokenImage("http://www.digimine.com/usama/datamine/.", ITokenizer.TT_FULL_URL), new TokenImage("http://www.herold.at/gelbe-seiten/krems-an-der-donau/lDk8q/yoga-krems-wachau-j%C3%BCrgen-ullrich/", ITokenizer.TT_FULL_URL), new TokenImage(allCharsUrl, ITokenizer.TT_FULL_URL), }; assertEqualTokens(test, tokens); } @Test public void testAcronymTokens() { String test = " acronyms I.B.M. S.C. z o.o. AT&T garey&johnson&willet"; TokenImage [] tokens = { new TokenImage("acronyms", ITokenizer.TT_TERM), new TokenImage("I.B.M.", ITokenizer.TT_ACRONYM), new TokenImage("S.C.", ITokenizer.TT_ACRONYM), new TokenImage("z", ITokenizer.TT_TERM), new TokenImage("o.o.", ITokenizer.TT_ACRONYM), new TokenImage("AT&T", ITokenizer.TT_ACRONYM), new TokenImage("garey&johnson&willet", ITokenizer.TT_ACRONYM), }; assertEqualTokens(test, tokens); } @Test public void testNumericTokens() { String test = " numeric 127 0 12.87 12,12 12-2003/23 term2003 2003term "; TokenImage [] tokens = { new TokenImage("numeric", ITokenizer.TT_TERM), new TokenImage("127", ITokenizer.TT_NUMERIC), new TokenImage("0", ITokenizer.TT_NUMERIC), new TokenImage("12.87", ITokenizer.TT_NUMERIC), new TokenImage("12,12", ITokenizer.TT_NUMERIC), new TokenImage("12-2003/23", ITokenizer.TT_NUMERIC), new TokenImage("term2003", ITokenizer.TT_TERM), new TokenImage("2003term", ITokenizer.TT_TERM) }; assertEqualTokens(test, tokens); } @Test public void testNastyUrlTokens() { String test = "http://r.office.microsoft.com/r/rlidLiveMeeting?p1=7&p2=en_US&p3=LMInfo&p4=DownloadWindowsConsole " + "https://www.livemeeting.com/cc/askme/join?id=58937J&role=present&pw=mNjC%27%25%3D%218"; TokenImage [] tokens = { new TokenImage( "http://r.office.microsoft.com/r/rlidLiveMeeting?p1=7&p2=en_US&p3=LMInfo&p4=DownloadWindowsConsole", ITokenizer.TT_FULL_URL), new TokenImage( "https://www.livemeeting.com/cc/askme/join?id=58937J&role=present&pw=mNjC%27%25%3D%218", ITokenizer.TT_FULL_URL), }; assertEqualTokens(test, tokens); } @Test public void testKoreanWordSplit() { String test = "안녕하세요 한글입니다"; TokenImage [] tokens = { new TokenImage("안녕하세요", ITokenizer.TT_TERM), new TokenImage("한글입니다", ITokenizer.TT_TERM), }; assertEqualTokens(test, tokens); } @Test public void punctuationAndSentenceMarkers() { String test = "Dawid Weiss, Data Mining!"; TokenImage [] tokens = { new TokenImage("Dawid", ITokenizer.TT_TERM), new TokenImage("Weiss", ITokenizer.TT_TERM), new TokenImage(",", ITokenizer.TT_PUNCTUATION), new TokenImage("Data", ITokenizer.TT_TERM), new TokenImage("Mining", ITokenizer.TT_TERM), new TokenImage("!", ITokenizer.TT_PUNCTUATION | ITokenizer.TF_SEPARATOR_SENTENCE) }; assertEqualTokens(test, tokens); } }