TRMorphStemFilter.java example

Explorer

lucene-solr-analysis-turkish-master
- src
  - main
    - java
      - org
        apache
        lucene
        App.java
        analysis
        tr
        TRMorphStemFilter.java
        TRMorphStemFilterFactory.java
        TurkishDeASCIIfyFilter.java
        TurkishDeASCIIfyFilterFactory.java
        Zemberek2DeASCIIfyFilterFactory.java
        Zemberek2StemFilterFactory.java
        Zemberek3StemFilter.java
        Zemberek3StemFilterFactory.java
        util
        PatternTableFactory.java
        Piper.java
  - test
    - java
      - org
        apache
        lucene
        tr
        TestTurkishDeASCIIfyFilter.java

package org.apache.lucene.analysis.tr;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tr.util.Piper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;

/**
 * Stemmer based on <a href="https://github.com/coltekin/TRmorph">TRmorph</a>.
 *
 * <p>For every non-keyword token the configured lookup command is started as an
 * external process, the term is written to its standard input and the lines
 * printed on its standard output are interpreted as morphological parses. The
 * stem candidates extracted from those parses are reduced to a single stem
 * according to the {@code aggregation} strategy ({@code "max"} or {@code "min"}
 * in the natural ordering of strings).
 */
public final class TRMorphStemFilter extends TokenFilter {

    private static final Logger log = LoggerFactory.getLogger(TRMorphStemFilter.class);

    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
    private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);

    /** Strategy used to pick one stem when several distinct candidates exist: "max" or "min". */
    private final String aggregation;
    /** Command line handed to {@link Runtime#exec(String)} for every token. */
    private final String lookup_fst;

    /**
     * @param input       upstream token stream
     * @param lookup_fst  command line of the lookup process to execute per token
     * @param aggregation {@code "max"} or {@code "min"}; any other value makes stemming fail with a
     *                    {@link RuntimeException} once a term does not yield exactly one candidate
     */
    public TRMorphStemFilter(TokenStream input, String lookup_fst, String aggregation) {
        super(input);
        this.lookup_fst = lookup_fst;
        this.aggregation = aggregation;
    }

    /**
     * Advances to the next token and replaces its term with the stem, unless the token is marked
     * as a keyword or no different stem could be determined.
     *
     * @return {@code false} when the upstream is exhausted, {@code true} otherwise
     * @throws IOException if the upstream fails or the lookup process cannot be run or read
     */
    @Override
    public boolean incrementToken() throws IOException {

        if (!input.incrementToken()) {
            return false;
        }
        if (keywordAttribute.isKeyword()) {
            return true;
        }

        // copied from org.apache.lucene.analysis.br.BrazilianStemFilter#incrementToken
        final String term = termAttribute.toString();
        final String s = stem(term);
        // If not stemmed, don't waste the time adjusting the token.
        if ((s != null) && !s.equals(term)) {
            termAttribute.setEmpty().append(s);
        }

        return true;
    }

    /**
     * Reduces the parses of {@code word} to a single stem.
     *
     * @param word the term to stem
     * @return the chosen stem; {@code word} itself when a parse without a tag contains "+?"
     *         (presumably the marker for an unrecognised word - confirm against the lookup tool);
     *         {@code null} when no candidate was found and the strategy is "max" or "min"
     * @throws IOException      if the lookup process fails
     * @throws RuntimeException if the aggregation strategy is unknown and the number of distinct
     *                          candidates is not exactly one
     */
    private String stem(String word) throws IOException {

        List<String> parses = parse(word);

        TreeSet<String> set = new TreeSet<>();

        for (String parse : parses) {
            String[] parts = parse.split("\\s+");
            // The analysis is the second whitespace separated column; a line holding only the
            // word itself has no such column and must be skipped instead of indexing past the end.
            if (parts.length < 2) {
                log.warn("unexpected line " + parse);
                continue;
            }

            String stem = parts[1].trim();

            // Everything in front of the first tag is taken as the stem.
            int i = stem.indexOf("<");

            if (i == -1) {
                if (stem.contains("+?")) {
                    return word;
                } else {
                    log.warn("unexpected stem " + stem);
                    continue;
                }
            }

            set.add(stem.substring(0, i));
        }

        if (set.size() == 1) {
            return set.first();
        }

        // For an empty set pollLast()/pollFirst() yield null, which the caller treats as "not stemmed".
        switch (aggregation) {
            case "max":
                return set.pollLast();
            case "min":
                return set.pollFirst();
            default:
                throw new RuntimeException("unknown strategy " + aggregation);
        }
    }

    /**
     * Runs the lookup command, feeds it {@code word} encoded as UTF-8 and collects the non-empty,
     * trimmed output lines that start with {@code word}. Other lines are logged and dropped.
     *
     * @param word the term to analyse
     * @return the accepted output lines in the order they were printed; possibly empty
     * @throws IOException if the process cannot be started or its output cannot be read
     */
    private List<String> parse(String word) throws IOException {
        List<String> list = new ArrayList<>();
        Process process = Runtime.getRuntime().exec(lookup_fst);
        try {
            // The word is pumped into the child's stdin on a separate thread.
            // NOTE(review): relies on Piper closing the child's stdin when done - confirm.
            Piper pipe = new Piper(
                    new ByteArrayInputStream(word.getBytes(StandardCharsets.UTF_8)),
                    process.getOutputStream());
            new Thread(pipe).start();

            // Drain stdout BEFORE waiting for termination: a child that fills the pipe buffer
            // blocks on write and would never exit if nobody reads. Decode as UTF-8 to match the
            // encoding used for the input rather than the platform default.
            try (BufferedReader r = new BufferedReader(
                    new java.io.InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
                String s;
                while ((s = r.readLine()) != null) {

                    s = s.trim();
                    if (s.length() == 0) {
                        continue;
                    }

                    if (s.startsWith(word)) {
                        list.add(s);
                    } else {
                        log.warn("unexpected line from word " + word + " " + s);
                    }
                }
            }

            try {
                process.waitFor();
            } catch (InterruptedException ie) {
                // Keep the interruption visible to the caller instead of silently swallowing it.
                Thread.currentThread().interrupt();
            }
        } finally {
            // Make sure the child does not outlive this call on error or interruption.
            process.destroy();
        }
        return list;

    }
}