package org.apache.lucene.analysis.tr;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import zemberek.morphology.parser.MorphParse;
import zemberek.morphology.parser.MorphParser;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
/**
 * A {@link TokenFilter} that replaces the term of each non-keyword token with a lemma
 * produced by <a href="https://github.com/ahmetaa/zemberek-nlp">Zemberek3</a>.
 *
 * <p>For every token the supplied {@link MorphParser} is asked for its parses. Among those
 * parses the ones with the fewest inflectional groups are kept, their lemmas are collected,
 * and a single lemma is picked according to the configured aggregation
 * ({@code "maxLength"} or {@code "minLength"}). Tokens flagged through
 * {@link KeywordAttribute}, and tokens for which the parser returns no parse, are passed
 * through unchanged.
 */
public final class Zemberek3StemFilter extends TokenFilter {

  private final MorphParser parser;
  private final String aggregation;
  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);

  /**
   * Creates a new filter.
   *
   * @param input       the token stream to read tokens from
   * @param parser      the Zemberek parser used to analyse each term
   * @param aggregation how to pick one lemma out of several candidates; {@link #stem}
   *                    accepts {@code "maxLength"} and {@code "minLength"}. The value is not
   *                    validated here, only when a token is actually stemmed.
   */
  public Zemberek3StemFilter(TokenStream input, MorphParser parser, String aggregation) {
    super(input);
    this.parser = parser;
    this.aggregation = aggregation;
  }

  /**
   * Narrows a list of parses according to {@code strategy}.
   *
   * <p>Lists with fewer than two elements are returned as-is, before the strategy is looked
   * at. Otherwise {@code "all"} returns the input list, {@code "maxMorpheme"} keeps the
   * parses with the largest number of inflectional groups, and {@code "minMorpheme"} keeps
   * those with the smallest number.
   *
   * @param parses   the parses to choose from
   * @param strategy one of {@code "all"}, {@code "maxMorpheme"}, {@code "minMorpheme"}
   * @return the selected parses
   * @throws IllegalArgumentException if {@code parses} has two or more elements and
   *                                  {@code strategy} is not one of the known values
   */
  private static List<MorphParse> selectMorphemes(List<MorphParse> parses, String strategy) {
    // Nothing to choose between when there are zero or one parses.
    if (parses.size() < 2) {
      return parses;
    }
    switch (strategy) {
      case "all":
        return parses;
      case "maxMorpheme": {
        // The list is non-empty here, so the OptionalInt always holds a value.
        final int max = parses.stream()
            .mapToInt(parse -> parse.inflectionalGroups.size())
            .max()
            .getAsInt();
        return parses.stream()
            .filter(parse -> parse.inflectionalGroups.size() == max)
            .collect(Collectors.toList());
      }
      case "minMorpheme": {
        // The list is non-empty here, so the OptionalInt always holds a value.
        final int min = parses.stream()
            .mapToInt(parse -> parse.inflectionalGroups.size())
            .min()
            .getAsInt();
        return parses.stream()
            .filter(parse -> parse.inflectionalGroups.size() == min)
            .collect(Collectors.toList());
      }
      default:
        throw new IllegalArgumentException("unknown strategy " + strategy);
    }
  }

  /**
   * Converts parses to strings using the accessor named by {@code methodName}.
   *
   * <p>{@code "stems"} and {@code "lemmas"} concatenate the results of
   * {@code getStems()} / {@code getLemmas()} over all parses; {@code "lemma"} and
   * {@code "root"} contribute exactly one string per parse ({@code getLemma()} and the
   * {@code root} field respectively).
   *
   * @param parses     the parses to convert
   * @param methodName one of {@code "stems"}, {@code "lemmas"}, {@code "lemma"}, {@code "root"}
   * @return the collected strings, in parse order
   * @throws IllegalArgumentException if {@code methodName} is not one of the known values
   */
  private static List<String> morphToString(List<MorphParse> parses, String methodName) {
    final List<String> list = new ArrayList<>();
    switch (methodName) {
      case "stems":
        for (MorphParse parse : parses) {
          list.addAll(parse.getStems());
        }
        return list;
      case "lemmas":
        for (MorphParse parse : parses) {
          list.addAll(parse.getLemmas());
        }
        return list;
      case "lemma":
        return parses.stream().map(MorphParse::getLemma).collect(Collectors.toList());
      case "root":
        return parses.stream().map(morphParse -> morphParse.root).collect(Collectors.toList());
      default:
        throw new IllegalArgumentException("unknown method name " + methodName);
    }
  }

  /**
   * Picks a single lemma for a word from its morphological parses.
   *
   * <p>The parses with the fewest inflectional groups are selected, all of their lemmas are
   * gathered, and the longest ({@code "maxLength"}) or shortest ({@code "minLength"}) one is
   * returned.
   *
   * @param parses      the parses of one term
   * @param aggregation {@code "maxLength"} or {@code "minLength"}
   * @return the chosen lemma, or {@code null} when the selected parses yield no lemma at all
   * @throws IllegalArgumentException if {@code aggregation} is not a known value
   */
  static String stem(List<MorphParse> parses, String aggregation) {
    final List<MorphParse> alternatives = selectMorphemes(parses, "minMorpheme");
    final List<String> candidates = morphToString(alternatives, "lemmas");
    final Comparator<String> byLength = Comparator.comparing(String::length);
    switch (aggregation) {
      case "maxLength":
        // Collections.max throws NoSuchElementException on an empty collection;
        // report "no stem" instead so the caller can leave the term untouched.
        return candidates.isEmpty() ? null : Collections.max(candidates, byLength);
      case "minLength":
        // Same guard as above for Collections.min.
        return candidates.isEmpty() ? null : Collections.min(candidates, byLength);
      default:
        throw new IllegalArgumentException("unknown strategy " + aggregation);
    }
  }

  /**
   * Advances the wrapped stream by one token and, unless the token is marked as a keyword
   * or has no parse, overwrites its term with the lemma returned by {@link #stem}.
   *
   * @return {@code false} when the wrapped stream is exhausted, {@code true} otherwise
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // Keyword tokens must never be altered.
    if (keywordAttribute.isKeyword()) {
      return true;
    }

    // copied from org.apache.lucene.analysis.br.BrazilianStemFilter#incrementToken
    final String term = termAttribute.toString();
    final List<MorphParse> parses = parser.parse(term);
    if (parses.isEmpty()) {
      return true;
    }

    final String s = stem(parses, aggregation);
    // If not stemmed, don't waste the time adjusting the token.
    if (s != null && !s.equals(term)) {
      termAttribute.setEmpty().append(s);
    }
    return true;
  }
}