/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.text; import java.io.Reader; import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.mahout.utils.MahoutTestCase; import org.junit.Test; /** * Unit tests for the MailArchivesClusteringAnalyzer text analyzer. */ public class MailArchivesClusteringAnalyzerTest extends MahoutTestCase { @Test public void testAnalysis() throws Exception { Analyzer analyzer = new MailArchivesClusteringAnalyzer(); String text = "A test message\n"; text += "atokenthatistoolongtobeusefulforclustertextanalysis\n"; text += "Mahout is a scalable, machine-learning LIBRARY\n"; text += "we've added some additional stopwords such as html, mailto, regards\t"; text += "apache_hadoop provides the foundation for scalability\n"; text += "www.nabble.com general-help@incubator.apache.org\n"; text += "public void int protected package"; Reader reader = new StringReader(text); // if you change the text above, then you may need to change this as well // order matters too String[] expectedTokens = { "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad", "stopword", "apache_hadoop","provid", "foundat", "scalabl" }; TokenStream tokenStream = analyzer.tokenStream("test", reader); assertNotNull(tokenStream); CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class); int e = 0; while (tokenStream.incrementToken() && e < expectedTokens.length) { assertEquals(expectedTokens[e++], termAtt.toString()); } assertEquals(e, expectedTokens.length); } }