// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.text.token;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.regex.Pattern;
import org.junit.Test;
import com.google.common.collect.ImmutableList;
import com.twitter.common.text.tokenizer.RegexTokenizer;
public class TokenizedCharSequenceStreamTest {
TokenStream tokenizer = new RegexTokenizer.Builder()
.setDelimiterPattern(Pattern.compile(" "))
.build();
@Test
public void testWithBackupTokenizer() {
TokenStream stream = new TokenizedCharSequenceStream(tokenizer);
String text = "This is a test #hashtag";
TokenizedCharSequence tokenized = TokenizedCharSequence.createFrom(text, tokenizer);
// test with untokenized text
stream.reset(text);
assertEquals(ImmutableList.of("This", "is", "a", "test", "#hashtag"), stream.toStringList());
// test with already tokenized text
stream.reset(tokenized);
assertEquals(ImmutableList.of("This", "is", "a", "test", "#hashtag"), stream.toStringList());
}
@Test
public void testWithoutBackupTokenizer() {
TokenStream stream = new TokenizedCharSequenceStream();
String text = "This is a test #hashtag";
TokenizedCharSequence tokenized = TokenizedCharSequence.createFrom(text, tokenizer);
// test with already tokenized text
stream.reset(tokenized);
assertEquals(ImmutableList.of("This", "is", "a", "test", "#hashtag"), stream.toStringList());
try {
// this should throw IllegalArgumentException
stream.reset(text);
assertTrue("IllegalArgumentException was not thrown.", false);
} catch (IllegalArgumentException e) {
assertTrue(true);
}
}
@Test
public void testWithDummyTokenizer() {
TokenStream stream = new TokenizedCharSequenceStream(new TokenStream() {
@Override
public boolean incrementToken() {
throw new IllegalArgumentException("this should not be called!");
}
@Override
public void reset(CharSequence input) {
throw new IllegalArgumentException("this should not be called!");
}
});
String text = "This is a test #hashtag";
TokenizedCharSequence tokenized = TokenizedCharSequence.createFrom(text, tokenizer);
// test with already tokenized text
// this should not throw IllegalArgumentException
stream.reset(tokenized);
assertEquals(ImmutableList.of("This", "is", "a", "test", "#hashtag"), stream.toStringList());
}
}