Zemberek3StemFilter.java example

Explorer

lucene-solr-analysis-turkish-master
- src
  - main
    - java
      - org
        apache
        lucene
        App.java
        analysis
        tr
        TRMorphStemFilter.java
        TRMorphStemFilterFactory.java
        TurkishDeASCIIfyFilter.java
        TurkishDeASCIIfyFilterFactory.java
        Zemberek2DeASCIIfyFilterFactory.java
        Zemberek2StemFilterFactory.java
        Zemberek3StemFilter.java
        Zemberek3StemFilterFactory.java
        util
        PatternTableFactory.java
        Piper.java
  - test
    - java
      - org
        apache
        lucene
        tr
        TestTurkishDeASCIIfyFilter.java

package org.apache.lucene.analysis.tr;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import zemberek.morphology.parser.MorphParse;
import zemberek.morphology.parser.MorphParser;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Stemmer based on <a href="https://github.com/ahmetaa/zemberek-nlp">Zemberek3</a>.
 * <p>
 * Every token that is not flagged as a keyword is analysed with the supplied {@link MorphParser}.
 * Of the returned parses, those with the fewest inflectional groups are kept, their lemmas are
 * collected, and one lemma is picked according to the {@code aggregation} setting
 * ({@code "maxLength"} or {@code "minLength"}). Tokens for which the parser returns no parse
 * are passed through unchanged.
 */
public final class Zemberek3StemFilter extends TokenFilter {

    private final MorphParser parser;
    private final String aggregation;

    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
    private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);

    /**
     * Creates a stem filter on top of {@code input}.
     *
     * @param input       the token stream to filter
     * @param parser      morphological parser used to analyse each term
     * @param aggregation how to pick one lemma among several candidates; {@link #stem} accepts
     *                    {@code "maxLength"} and {@code "minLength"}. The value is not validated
     *                    here, only when the first parsable token is stemmed.
     */
    public Zemberek3StemFilter(TokenStream input, MorphParser parser, String aggregation) {
        super(input);
        this.parser = parser;
        this.aggregation = aggregation;
    }

    /** Returns the number of inflectional groups of the given parse. */
    private static int groupCount(MorphParse parse) {
        return parse.inflectionalGroups.size();
    }

    /** Keeps only the parses whose number of inflectional groups equals {@code count}. */
    private static List<MorphParse> withGroupCount(List<MorphParse> parses, int count) {
        return parses.stream()
                .filter(parse -> groupCount(parse) == count)
                .collect(Collectors.toList());
    }

    /**
     * Narrows down a list of parses according to {@code strategy}.
     *
     * @param parses   parses to choose from
     * @param strategy {@code "all"} (keep everything), {@code "maxMorpheme"} (keep the parses with
     *                 the most inflectional groups) or {@code "minMorpheme"} (keep those with the
     *                 fewest)
     * @return the selected parses; the input list itself when it has fewer than two elements
     *         or when the strategy is {@code "all"}
     * @throws IllegalArgumentException if there are at least two parses and {@code strategy}
     *                                  is not one of the values above
     */
    private static List<MorphParse> selectMorphemes(List<MorphParse> parses, String strategy) {

        // With zero or one parse there is nothing to choose between
        // (the strategy is deliberately not inspected in this case).
        if (parses.size() < 2) {
            return parses;
        }

        switch (strategy) {
            case "all":
                return parses;
            case "maxMorpheme": {
                // getAsInt() is safe: the list holds at least two parses here.
                final int max = parses.stream().mapToInt(Zemberek3StemFilter::groupCount).max().getAsInt();
                return withGroupCount(parses, max);
            }
            case "minMorpheme": {
                final int min = parses.stream().mapToInt(Zemberek3StemFilter::groupCount).min().getAsInt();
                return withGroupCount(parses, min);
            }
            default:
                throw new IllegalArgumentException("unknown strategy " + strategy);
        }
    }

    /**
     * Converts parses to strings using the accessor named by {@code methodName}.
     *
     * @param parses     parses to convert
     * @param methodName {@code "stems"} ({@code getStems()} of every parse, concatenated),
     *                   {@code "lemmas"} ({@code getLemmas()} of every parse, concatenated),
     *                   {@code "lemma"} ({@code getLemma()} of every parse) or
     *                   {@code "root"} (the {@code root} field of every parse)
     * @return the resulting strings, in parse order
     * @throws IllegalArgumentException if {@code methodName} is not one of the values above
     */
    private static List<String> morphToString(List<MorphParse> parses, String methodName) {

        switch (methodName) {
            case "stems": {
                final List<String> stems = new ArrayList<>();
                for (MorphParse parse : parses) {
                    stems.addAll(parse.getStems());
                }
                return stems;
            }
            case "lemmas": {
                final List<String> lemmas = new ArrayList<>();
                for (MorphParse parse : parses) {
                    lemmas.addAll(parse.getLemmas());
                }
                return lemmas;
            }
            case "lemma":
                return parses.stream().map(MorphParse::getLemma).collect(Collectors.toList());
            case "root":
                return parses.stream().map(morphParse -> morphParse.root).collect(Collectors.toList());
            default:
                throw new IllegalArgumentException("unknown method name " + methodName);
        }
    }

    /**
     * Picks a single stem for a word from its morphological parses.
     * <p>
     * The parses with the fewest inflectional groups are selected, all of their lemmas are
     * gathered, and the longest or shortest lemma is returned depending on {@code aggregation}.
     * On ties the result is whatever {@link Collections#max} / {@link Collections#min} return
     * for the candidate order.
     *
     * @param parses      parses of one word
     * @param aggregation {@code "maxLength"} or {@code "minLength"}
     * @return the chosen lemma, or {@code null} when the parses yield no lemma at all
     * @throws IllegalArgumentException if {@code aggregation} is not a supported value
     */
    static String stem(List<MorphParse> parses, String aggregation) {

        final List<MorphParse> alternatives = selectMorphemes(parses, "minMorpheme");
        final List<String> candidates = morphToString(alternatives, "lemmas");
        final Comparator<String> byLength = Comparator.comparingInt(String::length);

        // Collections.max/min throw NoSuchElementException on an empty collection; return null
        // instead so the caller's null check leaves the token untouched. (Defensive: whether a
        // parse can actually come without lemmas is not visible from this class.)
        switch (aggregation) {
            case "maxLength":
                return candidates.isEmpty() ? null : Collections.max(candidates, byLength);
            case "minLength":
                return candidates.isEmpty() ? null : Collections.min(candidates, byLength);
            default:
                throw new IllegalArgumentException("unknown strategy " + aggregation);
        }
    }

    /**
     * Advances to the next token and, unless it is a keyword or cannot be parsed, replaces its
     * term text with the stem chosen by {@link #stem}.
     *
     * @return {@code false} when the wrapped stream is exhausted, {@code true} otherwise
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {

        if (!input.incrementToken()) {
            return false;
        }
        // Keyword tokens are protected from stemming.
        if (keywordAttribute.isKeyword()) {
            return true;
        }

        // Structure copied from org.apache.lucene.analysis.br.BrazilianStemFilter#incrementToken.
        final String term = termAttribute.toString();

        final List<MorphParse> parses = parser.parse(term);
        if (parses.isEmpty()) {
            return true;
        }

        final String s = stem(parses, aggregation);
        // If not stemmed, don't waste the time adjusting the token.
        if ((s != null) && !s.equals(term)) {
            termAttribute.setEmpty().append(s);
        }

        return true;
    }
}