package org.apache.lucene.analysis.fr; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.Version; /** * Test case for FrenchAnalyzer. * * @version $version$ */ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase { public void testAnalyzer() throws Exception { FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT); assertAnalyzesTo(fa, "", new String[] { }); assertAnalyzesTo( fa, "chien chat cheval", new String[] { "chien", "chat", "cheval" }); assertAnalyzesTo( fa, "chien CHAT CHEVAL", new String[] { "chien", "chat", "cheval" }); assertAnalyzesTo( fa, " chien ,? + = - CHAT /: > CHEVAL", new String[] { "chien", "chat", "cheval" }); assertAnalyzesTo(fa, "chien++", new String[] { "chien" }); assertAnalyzesTo( fa, "mot \"entreguillemet\"", new String[] { "mot", "entreguillemet" }); // let's do some french specific tests now /* 1. couldn't resist I would expect this to stay one term as in French the minus sign is often used for composing words */ assertAnalyzesTo( fa, "Jean-François", new String[] { "jean", "françois" }); // 2. stopwords assertAnalyzesTo( fa, "le la chien les aux chat du des à cheval", new String[] { "chien", "chat", "cheval" }); // some nouns and adjectives assertAnalyzesTo( fa, "lances chismes habitable chiste éléments captifs", new String[] { "lanc", "chism", "habit", "chist", "élément", "captif" }); // some verbs assertAnalyzesTo( fa, "finissions souffrirent rugissante", new String[] { "fin", "souffr", "rug" }); // some everything else // aujourd'hui stays one term which is OK assertAnalyzesTo( fa, "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ", new String[] { "c3po", "aujourd'hui", "oeuf", "ïâöûàä", "anticonstitutionnel", "jav" }); // some more everything else // here 1940-1945 stays as one term, 1940:1945 not ? assertAnalyzesTo( fa, "33Bis 1940-1945 1940:1945 (---i+++)*", new String[] { "33bis", "1940-1945", "1940", "1945", "i" }); } /** * @deprecated remove this test for Lucene 4.0 */ @Deprecated public void testAnalyzer30() throws Exception { FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30); assertAnalyzesTo(fa, "", new String[] { }); assertAnalyzesTo( fa, "chien chat cheval", new String[] { "chien", "chat", "cheval" }); assertAnalyzesTo( fa, "chien CHAT CHEVAL", new String[] { "chien", "chat", "cheval" }); assertAnalyzesTo( fa, " chien ,? + = - CHAT /: > CHEVAL", new String[] { "chien", "chat", "cheval" }); assertAnalyzesTo(fa, "chien++", new String[] { "chien" }); assertAnalyzesTo( fa, "mot \"entreguillemet\"", new String[] { "mot", "entreguillemet" }); // let's do some french specific tests now /* 1. couldn't resist I would expect this to stay one term as in French the minus sign is often used for composing words */ assertAnalyzesTo( fa, "Jean-François", new String[] { "jean", "françois" }); // 2. stopwords assertAnalyzesTo( fa, "le la chien les aux chat du des à cheval", new String[] { "chien", "chat", "cheval" }); // some nouns and adjectives assertAnalyzesTo( fa, "lances chismes habitable chiste éléments captifs", new String[] { "lanc", "chism", "habit", "chist", "élément", "captif" }); // some verbs assertAnalyzesTo( fa, "finissions souffrirent rugissante", new String[] { "fin", "souffr", "rug" }); // some everything else // aujourd'hui stays one term which is OK assertAnalyzesTo( fa, "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ", new String[] { "c3po", "aujourd'hui", "oeuf", "ïâöûàä", "anticonstitutionnel", "jav" }); // some more everything else // here 1940-1945 stays as one term, 1940:1945 not ? assertAnalyzesTo( fa, "33Bis 1940-1945 1940:1945 (---i+++)*", new String[] { "33bis", "1940-1945", "1940", "1945", "i" }); } public void testReusableTokenStream() throws Exception { FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT); // stopwords assertAnalyzesToReuse( fa, "le la chien les aux chat du des à cheval", new String[] { "chien", "chat", "cheval" }); // some nouns and adjectives assertAnalyzesToReuse( fa, "lances chismes habitable chiste éléments captifs", new String[] { "lanc", "chism", "habit", "chist", "élément", "captif" }); } /* * Test that changes to the exclusion table are applied immediately * when using reusable token streams. */ public void testExclusionTableReuse() throws Exception { FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT); assertAnalyzesToReuse(fa, "habitable", new String[] { "habit" }); fa.setStemExclusionTable(new String[] { "habitable" }); assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" }); } public void testExclusionTableViaCtor() throws Exception { CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true); set.add("habitable"); FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set); assertAnalyzesToReuse(fa, "habitable chiste", new String[] { "habitable", "chist" }); fa = new FrenchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set); assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable", "chist" }); } public void testElision() throws Exception { FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT); assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" }); } /** * Prior to 3.1, this analyzer had no lowercase filter. * stopwords were case sensitive. Preserve this for back compat. * @deprecated Remove this test in Lucene 4.0 */ @Deprecated public void testBuggyStopwordsCasing() throws IOException { FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30); assertAnalyzesTo(a, "Votre", new String[] { "votr" }); } /** * Test that stopwords are not case sensitive */ public void testStopwordsCasing() throws IOException { FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31); assertAnalyzesTo(a, "Votre", new String[] { }); } }