package de.tudarmstadt.ukp.dkpro.core.arktools; /** * Copyright 2007-2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.junit.Assert.assertEquals; import java.util.List; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.junit.Test; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.arktools.ArktweetTokenizer; public class ArktweetTokenizationTest { @Test public void testDummySentenceBoundary() throws AnalysisEngineProcessException, ResourceInitializationException { String text = " Content."made a pac lets see how long it last""; JCas tokenize = tokenize(text); assertEquals(1, JCasUtil.select(tokenize, Sentence.class).size()); } @Test public void testTokenization1() throws ResourceInitializationException, AnalysisEngineProcessException { String text = " Content."made a pac lets see how long it last""; List<Token> tokens = getTokens(text); assertNumberOfTokens(15, tokens.size()); assertTokenizationBoundaries(new String[] { "Content", ".", "&", "quot", ";", "made", "a", "pac", "lets", "see", "how", "long", "it", "last", """ }, tokens); } private void assertTokenizationBoundaries(String[] expected, List<Token> tokens) { for (int i = 0; i < expected.length; i++) { assertEquals(expected[i], tokens.get(i).getCoveredText()); } } private void assertNumberOfTokens(int expected, int numberOfTokens) { assertEquals(expected, numberOfTokens); } @Test public void testTokenization2() throws ResourceInitializationException, AnalysisEngineProcessException { String text = " Tiger Woods is up by 2at 18 via http://nascar.com/racebuddy"; List<Token> tokens = getTokens(text); assertNumberOfTokens(9, tokens.size()); assertTokenizationBoundaries(new String[] { "Tiger", "Woods", "is", "up", "by", "2at", "18", "via", "http://nascar.com/racebuddy" }, tokens); } @Test public void testTokenization3() throws ResourceInitializationException, AnalysisEngineProcessException { String text = " My cell phone screen is dead. Sooooooooooo, no texts and I don't know who's calling. Fuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuck"; List<Token> tokens = getTokens(text); assertNumberOfTokens(19, tokens.size()); assertTokenizationBoundaries(new String[] { "My", "cell", "phone", "screen", "is", "dead", ".", "Sooooooooooo", ",", "no", "texts", "and", "I", "don't", "know", "who's", "calling", "." }, tokens); } @Test public void testTokenization4() throws ResourceInitializationException, AnalysisEngineProcessException { String text = " "Im in love and I don't care who knows it !" -elf"; List<Token> tokens = getTokens(text); assertNumberOfTokens(16, tokens.size()); assertTokenizationBoundaries(new String[] { "&", "quot", ";", "Im", "in", "love", "and", "I", "don't", "care", "who", "knows", "it", "!", """, "-elf" }, tokens); } @Test public void testTokenization5() throws ResourceInitializationException, AnalysisEngineProcessException { String text = " I love him, and now, we're not even friends<\\3"; List<Token> tokens = getTokens(text); assertNumberOfTokens(13, tokens.size()); assertTokenizationBoundaries(new String[] { "I", "love", "him", ",", "and", "now", ",", "we're", "not", "even", "friends", "<", "\\3" }, tokens); } @Test public void testTokenization6() throws ResourceInitializationException, AnalysisEngineProcessException { String text = "@TextTonic "control" or "abuse"? I see them as Very different. Whilst we are into self promoting here goes http://tinyurl.com/cru3hu"; List<Token> tokens = getTokens(text); assertNumberOfTokens(29, tokens.size()); assertTokenizationBoundaries(new String[] { "@TextTonic", "&", "quot", ";", "control", """, "or", "&", "quot", ";", "abuse", """, "?", "I", "see", "them", "as", "Very", "different", ".", "Whilst", "we", "are", "into", "self", "promoting", "here", "goes", "http://tinyurl.com/cru3hu" }, tokens); } @Test public void testTokenization7() throws Exception { String text = "a baptism&they made it rain&kissed me on the head #IwasAppreciated"; List<Token> tokens = getTokens(text); String[] expectedToken = new String[] { "a", "baptism", "&", "they", "made", "it", "rain", "&", "kissed", "me", "on", "the", "head", "#IwasAppreciated" }; assertNumberOfTokens(expectedToken.length, tokens.size()); assertTokenizationBoundaries(expectedToken, tokens); } @Test public void testTokenization8() throws Exception { String text = "god & 100 days :>"; List<Token> tokens = getTokens(text); for (Token t : tokens) { System.out.println(t.getCoveredText()); } String[] expectedToken = new String[] { "god", "&", "amp", ";", "100", "days", ":", "&", "gt", ";" }; assertNumberOfTokens(expectedToken.length, tokens.size()); assertTokenizationBoundaries(expectedToken, tokens); } @Test public void testTokenization9() throws Exception { // This mutilated & can be found in json data obtained directly from twitter occurring // occasionally at the end of tweets String text = "car &a ..."; List<Token> tokens = getTokens(text); String[] expectedToken = new String[] { "car", "&", "a", "..." }; assertNumberOfTokens(expectedToken.length, tokens.size()); assertTokenizationBoundaries(expectedToken, tokens); } private List<Token> getTokens(String text) throws AnalysisEngineProcessException, ResourceInitializationException { JCas jcas = tokenize(text); List<Token> tokens = JCasUtil.selectCovered(jcas, Token.class, 0, jcas.getDocumentText() .length()); return tokens; } private JCas tokenize(String text) throws ResourceInitializationException, AnalysisEngineProcessException { AnalysisEngineDescription segmenter = createEngineDescription(ArktweetTokenizer.class); AnalysisEngine segEngine = UIMAFramework.produceAnalysisEngine(segmenter); JCas testCas = segEngine.newJCas(); testCas.setDocumentLanguage("en"); testCas.setDocumentText(text); segEngine.process(testCas); return testCas; } }