/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.morfologik;

import java.io.IOException;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * Tests for {@link MorfologikAnalyzer}: Polish stemming via the Morfologik library.
 *
 * <p>TODO: The tests below rely on the order of returned lemmas, which is probably not good.
 */
public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {

  /** Returns a fresh analyzer under test; callers are responsible for closing it. */
  private Analyzer getTestAnalyzer() {
    return new MorfologikAnalyzer();
  }

  /** Test stemming of single tokens with Morfologik library. */
  public final void testSingleTokens() throws IOException {
    Analyzer a = getTestAnalyzer();
    assertAnalyzesTo(a, "a", new String[] { "a" });
    // Ambiguous forms expand to all candidate lemmas (surface form first).
    assertAnalyzesTo(a, "liście", new String[] { "liście", "liść", "list", "lista" });
    assertAnalyzesTo(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
    // Out-of-dictionary input (all Polish diacritics) passes through unchanged.
    assertAnalyzesTo(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
    a.close();
  }

  /** Test stemming of multiple tokens and proper term metrics. */
  public final void testMultipleTokens() throws IOException {
    Analyzer a = getTestAnalyzer();
    // All lemmas of one surface token share its offsets; only the first
    // carries a position increment of 1 (the rest stack at the same position).
    assertAnalyzesTo(
        a,
        "liście danych",
        new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" },
        new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
        new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
        new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });
    assertAnalyzesTo(
        a,
        "T. Gl\u00FCcksberg",
        new String[] { "tom", "tona", "Gl\u00FCcksberg" },
        new int[] { 0, 0, 3 },
        new int[] { 1, 1, 13 },
        new int[] { 1, 0, 1 });
    a.close();
  }

  /** Debug helper: prints each token and its morphosyntactic tags for the given input. */
  @SuppressWarnings("unused")
  private void dumpTokens(String input) throws IOException {
    try (Analyzer a = getTestAnalyzer();
        TokenStream ts = a.tokenStream("dummy", input)) {
      ts.reset();
      MorphosyntacticTagsAttribute attribute = ts.getAttribute(MorphosyntacticTagsAttribute.class);
      CharTermAttribute charTerm = ts.getAttribute(CharTermAttribute.class);
      while (ts.incrementToken()) {
        System.out.println(charTerm.toString() + " => " + attribute.getTags());
      }
      ts.end();
    }
  }

  /** Test reuse of MorfologikFilter with leftover stems. */
  public final void testLeftoverStems() throws IOException {
    Analyzer a = getTestAnalyzer();
    // First stream is abandoned with unread stems still queued; the second
    // stream must not see them after analyzer reuse.
    try (TokenStream ts_1 = a.tokenStream("dummy", "liście")) {
      CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
      ts_1.reset();
      ts_1.incrementToken();
      assertEquals("first stream", "liście", termAtt_1.toString());
      ts_1.end();
    }
    try (TokenStream ts_2 = a.tokenStream("dummy", "danych")) {
      CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
      ts_2.reset();
      ts_2.incrementToken();
      assertEquals("second stream", "dany", termAtt_2.toString());
      ts_2.end();
    }
    a.close();
  }

  /** Test stemming of mixed-case tokens. */
  public final void testCase() throws IOException {
    Analyzer a = getTestAnalyzer();
    assertAnalyzesTo(a, "AGD", new String[] { "AGD", "artykuły gospodarstwa domowego" });
    assertAnalyzesTo(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
    // Capitalization disambiguates proper names from common nouns.
    assertAnalyzesTo(a, "Poznania", new String[] { "Poznań" });
    assertAnalyzesTo(a, "poznania", new String[] { "poznanie", "poznać" });
    assertAnalyzesTo(a, "Aarona", new String[] { "Aaron" });
    assertAnalyzesTo(a, "aarona", new String[] { "aarona" });
    assertAnalyzesTo(a, "Liście", new String[] { "liście", "liść", "list", "lista" });
    a.close();
  }

  /**
   * Advances {@code ts} one token and asserts its term text and the exact set of
   * morphosyntactic tags (order-insensitive, via sorted sets).
   */
  private void assertPOSToken(TokenStream ts, String term, String... tags) throws IOException {
    ts.incrementToken();
    assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());

    TreeSet<String> actual = new TreeSet<>();
    TreeSet<String> expected = new TreeSet<>();
    for (StringBuilder b : ts.getAttribute(MorphosyntacticTagsAttribute.class).getTags()) {
      actual.add(b.toString());
    }
    for (String s : tags) {
      expected.add(s);
    }
    // assertEquals already reports both expected and actual on failure;
    // no need to dump them to stdout first.
    assertEquals(expected, actual);
  }

  /** Test morphosyntactic annotations. */
  public final void testPOSAttribute() throws IOException {
    try (Analyzer a = getTestAnalyzer();
        TokenStream ts = a.tokenStream("dummy", "liście")) {
      ts.reset();
      assertPOSToken(ts, "liście",
          "subst:sg:acc:n2", "subst:sg:nom:n2", "subst:sg:voc:n2");
      assertPOSToken(ts, "liść",
          "subst:pl:acc:m3", "subst:pl:nom:m3", "subst:pl:voc:m3");
      assertPOSToken(ts, "list",
          "subst:sg:loc:m3", "subst:sg:voc:m3");
      assertPOSToken(ts, "lista",
          "subst:sg:dat:f", "subst:sg:loc:f");
      ts.end();
    }
  }

  /** Test that tokens marked as keywords bypass stemming entirely. */
  public final void testKeywordAttrTokens() throws IOException {
    Analyzer a = new MorfologikAnalyzer() {
      @Override
      protected TokenStreamComponents createComponents(String field) {
        final CharArraySet keywords = new CharArraySet(1, false);
        keywords.add("liście");

        final Tokenizer src = new StandardTokenizer();
        TokenStream result = new StandardFilter(src);
        // Mark "liście" as a keyword before it reaches the stemmer.
        result = new SetKeywordMarkerFilter(result, keywords);
        result = new MorfologikFilter(result);

        return new TokenStreamComponents(src, result);
      }
    };

    // "liście" stays a single unstemmed token; "danych" still expands.
    assertAnalyzesTo(
        a,
        "liście danych",
        new String[] { "liście", "dany", "dana", "dane", "dać" },
        new int[] { 0, 7, 7, 7, 7 },
        new int[] { 6, 13, 13, 13, 13 },
        new int[] { 1, 1, 0, 0, 0 });
    a.close();
  }

  /** blast some random strings through the analyzer */
  public void testRandom() throws Exception {
    Analyzer a = getTestAnalyzer();
    checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
    a.close();
  }
}