/*-
*
* * Copyright 2015 Skymind,Inc.
* *
* * Licensed under the Apache License, Version 2.0 (the "License");
* * you may not use this file except in compliance with the License.
* * You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS,
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* * See the License for the specific language governing permissions and
* * limitations under the License.
*
*/
package org.deeplearning4j.text.tokenization.tokenizer;
import org.apache.commons.io.FileUtils;
import org.datavec.api.util.ClassPathResource;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayInputStream;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class DefaulTokenizerTests {
protected static final Logger log = LoggerFactory.getLogger(DefaulTokenizerTests.class);
@Test
public void testDefaultTokenizer1() throws Exception {
String toTokenize = "Mary had a little lamb.";
TokenizerFactory t = new DefaultTokenizerFactory();
Tokenizer tokenizer = t.create(toTokenize);
Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
int position = 1;
while (tokenizer2.hasMoreTokens()) {
String tok1 = tokenizer.nextToken();
String tok2 = tokenizer2.nextToken();
log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
position++;
assertEquals(tok1, tok2);
}
ClassPathResource resource = new ClassPathResource("reuters/5250");
String str = FileUtils.readFileToString(resource.getFile());
int stringCount = t.create(str).countTokens();
int stringCount2 = t.create(resource.getInputStream()).countTokens();
assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
@Test
public void testDefaultTokenizer2() throws Exception {
String toTokenize = "Mary had a little lamb.";
TokenizerFactory t = new DefaultTokenizerFactory();
Tokenizer tokenizer = t.create(toTokenize);
Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
tokenizer2.countTokens();
while (tokenizer.hasMoreTokens()) {
String tok1 = tokenizer.nextToken();
String tok2 = tokenizer2.nextToken();
assertEquals(tok1, tok2);
}
System.out.println("-----------------------------------------------");
ClassPathResource resource = new ClassPathResource("reuters/5250");
String str = FileUtils.readFileToString(resource.getFile());
int stringCount = t.create(str).countTokens();
int stringCount2 = t.create(resource.getInputStream()).countTokens();
log.info("String tok: [" + stringCount + "], Stream tok: [" + stringCount2 + "], Difference: "
+ Math.abs(stringCount - stringCount2));
assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
@Test
public void testDefaultTokenizer3() throws Exception {
String toTokenize = "Mary had a little lamb.";
TokenizerFactory t = new DefaultTokenizerFactory();
Tokenizer tokenizer = t.create(toTokenize);
Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
int position = 1;
while (tokenizer2.hasMoreTokens()) {
String tok1 = tokenizer.nextToken();
String tok2 = tokenizer2.nextToken();
log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
position++;
assertEquals(tok1, tok2);
}
}
@Test
public void testDefaultStreamTokenizer() throws Exception {
String toTokenize = "Mary had a little lamb.";
TokenizerFactory t = new DefaultTokenizerFactory();
Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
assertEquals(5, tokenizer2.countTokens());
int cnt = 0;
while (tokenizer2.hasMoreTokens()) {
String tok1 = tokenizer2.nextToken();
log.info(tok1);
cnt++;
}
assertEquals(5, cnt);
}
}