package org.apache.lucene.analysis.fr;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
* Test case for FrenchAnalyzer.
*
* @version $version$
*/
public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
public void testAnalyzer() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(fa, "", new String[] {
});
assertAnalyzesTo(
fa,
"chien chat cheval",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(
fa,
"chien CHAT CHEVAL",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(
fa,
" chien ,? + = - CHAT /: > CHEVAL",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
assertAnalyzesTo(
fa,
"mot \"entreguillemet\"",
new String[] { "mot", "entreguillemet" });
// let's do some french specific tests now
/* 1. couldn't resist
I would expect this to stay one term as in French the minus
sign is often used for composing words */
assertAnalyzesTo(
fa,
"Jean-François",
new String[] { "jean", "françois" });
// 2. stopwords
assertAnalyzesTo(
fa,
"le la chien les aux chat du des à cheval",
new String[] { "chien", "chat", "cheval" });
// some nouns and adjectives
assertAnalyzesTo(
fa,
"lances chismes habitable chiste éléments captifs",
new String[] {
"lanc",
"chism",
"habit",
"chist",
"élément",
"captif" });
// some verbs
assertAnalyzesTo(
fa,
"finissions souffrirent rugissante",
new String[] { "fin", "souffr", "rug" });
// some everything else
// aujourd'hui stays one term which is OK
assertAnalyzesTo(
fa,
"C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
new String[] {
"c3po",
"aujourd'hui",
"oeuf",
"ïâöûàä",
"anticonstitutionnel",
"jav" });
// some more everything else
// here 1940-1945 stays as one term, 1940:1945 not ?
assertAnalyzesTo(
fa,
"33Bis 1940-1945 1940:1945 (---i+++)*",
new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
}
/**
* @deprecated remove this test for Lucene 4.0
*/
@Deprecated
public void testAnalyzer30() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30);
assertAnalyzesTo(fa, "", new String[] {
});
assertAnalyzesTo(
fa,
"chien chat cheval",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(
fa,
"chien CHAT CHEVAL",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(
fa,
" chien ,? + = - CHAT /: > CHEVAL",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
assertAnalyzesTo(
fa,
"mot \"entreguillemet\"",
new String[] { "mot", "entreguillemet" });
// let's do some french specific tests now
/* 1. couldn't resist
I would expect this to stay one term as in French the minus
sign is often used for composing words */
assertAnalyzesTo(
fa,
"Jean-François",
new String[] { "jean", "françois" });
// 2. stopwords
assertAnalyzesTo(
fa,
"le la chien les aux chat du des à cheval",
new String[] { "chien", "chat", "cheval" });
// some nouns and adjectives
assertAnalyzesTo(
fa,
"lances chismes habitable chiste éléments captifs",
new String[] {
"lanc",
"chism",
"habit",
"chist",
"élément",
"captif" });
// some verbs
assertAnalyzesTo(
fa,
"finissions souffrirent rugissante",
new String[] { "fin", "souffr", "rug" });
// some everything else
// aujourd'hui stays one term which is OK
assertAnalyzesTo(
fa,
"C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
new String[] {
"c3po",
"aujourd'hui",
"oeuf",
"ïâöûàä",
"anticonstitutionnel",
"jav" });
// some more everything else
// here 1940-1945 stays as one term, 1940:1945 not ?
assertAnalyzesTo(
fa,
"33Bis 1940-1945 1940:1945 (---i+++)*",
new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
}
public void testReusableTokenStream() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
// stopwords
assertAnalyzesToReuse(
fa,
"le la chien les aux chat du des à cheval",
new String[] { "chien", "chat", "cheval" });
// some nouns and adjectives
assertAnalyzesToReuse(
fa,
"lances chismes habitable chiste éléments captifs",
new String[] {
"lanc",
"chism",
"habit",
"chist",
"élément",
"captif" });
}
/*
* Test that changes to the exclusion table are applied immediately
* when using reusable token streams.
*/
public void testExclusionTableReuse() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesToReuse(fa, "habitable", new String[] { "habit" });
fa.setStemExclusionTable(new String[] { "habitable" });
assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
}
public void testExclusionTableViaCtor() throws Exception {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("habitable");
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT,
CharArraySet.EMPTY_SET, set);
assertAnalyzesToReuse(fa, "habitable chiste", new String[] { "habitable",
"chist" });
fa = new FrenchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
"chist" });
}
public void testElision() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
}
/**
* Prior to 3.1, this analyzer had no lowercase filter.
* stopwords were case sensitive. Preserve this for back compat.
* @deprecated Remove this test in Lucene 4.0
*/
@Deprecated
public void testBuggyStopwordsCasing() throws IOException {
FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30);
assertAnalyzesTo(a, "Votre", new String[] { "votr" });
}
/**
* Test that stopwords are not case sensitive
*/
public void testStopwordsCasing() throws IOException {
FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
assertAnalyzesTo(a, "Votre", new String[] { });
}
}