// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================

package com.twitter.common.text.token;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotSame;
import static org.junit.Assert.assertTrue;

import java.util.List;

import org.junit.Test;

import com.google.common.collect.ImmutableList;

import com.twitter.common.text.token.TokenizedCharSequence.Token;
import com.twitter.common.text.token.attribute.TokenType;

/**
 * Unit tests for {@link TokenizedCharSequence} and its {@code Builder}.
 *
 * <p>Throughout these tests, the two integer arguments passed to {@code addToken} and
 * {@code tokenize} are used as (offset, length) pairs; see {@link #testTokenizeToken()} for the
 * assertions that pin this down.
 */
public class TokenizedCharSequenceTest {

  /** Building from a {@code null} text is expected to fail with a NullPointerException. */
  @Test(expected = NullPointerException.class)
  public void testNullConstructor() {
    new TokenizedCharSequence.Builder(null).build();
  }

  /** Building from an empty text must complete without throwing. */
  @Test
  public void testEmptyConstructor() {
    // it is OK to create TokenizedCharSequence with empty text.
    new TokenizedCharSequence.Builder("").build();
  }

  /**
   * Checks {@code equals}, {@code toString} and {@code hashCode} across sequences built from
   * identical input, from different input, and from a non-String {@link CharSequence} with the
   * same characters.
   */
  @Test
  public void testTokenizedCharSequence() {
    // exactly same contents
    String twoWords = "test test";
    TokenizedCharSequence text1 =
        new TokenizedCharSequence.Builder(twoWords).addToken(0, 4).addToken(5, 4).build();
    TokenizedCharSequence text2 =
        new TokenizedCharSequence.Builder(twoWords).addToken(0, 4).addToken(5, 4).build();

    // Two separate builds must yield distinct instances that nevertheless compare equal.
    assertNotSame(text1, text2);
    assertEquals(text1, text2);
    assertEquals(text1.toString(), text2.toString());
    assertEquals(text1.hashCode(), text2.hashCode());

    // different contents
    String threeWords = "test test test";
    TokenizedCharSequence text3 =
        new TokenizedCharSequence.Builder(threeWords)
            .addToken(0, 4)
            .addToken(5, 4)
            .addToken(10, 4)
            .build();

    assertFalse(text1.equals(text3));
    assertFalse(text1.toString().equals(text3.toString()));
    // NOTE(review): the Object.hashCode contract does not guarantee distinct hash codes for
    // unequal objects; this check relies on the current implementation and these inputs.
    assertFalse(text1.hashCode() == text3.hashCode());

    // same contents but not String
    StringBuffer buf = new StringBuffer("test test");
    TokenizedCharSequence text4 =
        new TokenizedCharSequence.Builder(buf).addToken(0, 4).addToken(5, 4).build();

    assertNotSame(text1, text4);
    assertEquals(text1, text4);
    assertEquals(text1.toString(), text4.toString());
    assertEquals(text1.hashCode(), text4.hashCode());
  }

  /**
   * Checks filtering of tokens by {@link TokenType} through {@code getTokensOf} and
   * {@code getTokenStringsOf}, including the no-argument call and a type that no token carries.
   */
  @Test
  public void testGetTokensOf() {
    String text = "test, #hashtag, @username.";
    TokenizedCharSequence tokenized =
        new TokenizedCharSequence.Builder(text)
            .addToken(0, 4, TokenType.TOKEN)
            .addToken(4, 1, TokenType.PUNCTUATION)
            .addToken(6, 8, TokenType.HASHTAG)
            .addToken(14, 1, TokenType.PUNCTUATION)
            .addToken(16, 9, TokenType.USERNAME)
            .addToken(25, 1, TokenType.PUNCTUATION)
            .build();

    assertEquals(
        3,
        tokenized.getTokensOf(TokenType.TOKEN, TokenType.HASHTAG, TokenType.USERNAME).size());
    assertEquals(3, tokenized.getTokensOf(TokenType.PUNCTUATION).size());

    List<String> hashtags = tokenized.getTokenStringsOf(TokenType.HASHTAG);
    assertEquals(ImmutableList.of("#hashtag"), hashtags);

    List<String> hashtagsAndUsernames =
        tokenized.getTokenStringsOf(TokenType.HASHTAG, TokenType.USERNAME);
    assertEquals(ImmutableList.of("#hashtag", "@username"), hashtagsAndUsernames);

    // no token type specified.
    List<Token> tokens = tokenized.getTokensOf();
    assertTrue(tokens.isEmpty());

    // test unavailable token type
    tokens = tokenized.getTokensOf(TokenType.EMOTICON);
    assertTrue(tokens.isEmpty());

    // An unavailable type mixed with an available one only yields the available tokens.
    tokens = tokenized.getTokensOf(TokenType.TOKEN, TokenType.EMOTICON);
    assertEquals(1, tokens.size());
    assertEquals("test", tokens.get(0).toString());
  }

  /**
   * Checks that {@code Token.tokenize} splits an existing token into sub-tokens: the arguments
   * are relative to the parent token, while the resulting offsets are reported relative to the
   * full text.
   */
  @Test
  public void testTokenizeToken() {
    String text = "abCDef";
    TokenizedCharSequence tokenized =
        new TokenizedCharSequence.Builder(text)
            .addToken(0, 2, TokenType.TOKEN)
            .addToken(2, 2, TokenType.TOKEN)
            .addToken(4, 2, TokenType.TOKEN)
            .build();

    List<Token> tokens = tokenized.getTokens();
    assertEquals(3, tokens.size());

    // tokenize "CD" into "C" and "D"
    Token parent = tokens.get(1);
    Token tokenC = parent.tokenize(0, 1);
    Token tokenD = parent.tokenize(1, 1);

    assertEquals("C", tokenC.toString());
    assertEquals(2, tokenC.getOffset());
    assertEquals(1, tokenC.getLength());

    assertEquals("D", tokenD.toString());
    assertEquals(3, tokenD.getOffset());
    assertEquals(1, tokenD.getLength());
  }
}