package org.apache.lucene.analysis.tr;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tr.util.Piper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;

/**
 * Stemmer based on <a href="https://github.com/coltekin/TRmorph">TRmorph</a>.
 *
 * <p>For every token that is not flagged as a keyword, the filter launches the
 * external command given at construction time, writes the term to the command's
 * standard input and derives a stem from the lines the command prints. When the
 * output yields several distinct stems, the configured aggregation strategy
 * ({@code "max"} or {@code "min"}) selects one of them.
 *
 * <p>NOTE(review): a new process is started for every token, which is expensive.
 */
public final class TRMorphStemFilter extends TokenFilter {

    private static final Logger log = LoggerFactory.getLogger(TRMorphStemFilter.class);

    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
    private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);

    /** Strategy used to choose among several candidate stems: {@code "max"} or {@code "min"}. */
    private final String aggregation;

    /** Command line of the external lookup tool, handed to {@link Runtime#exec(String)}. */
    private final String lookupFst;

    /**
     * Creates the filter.
     *
     * @param input       the upstream token stream
     * @param lookup_fst  command line of the external analyzer; executed once per token
     * @param aggregation {@code "max"} to pick the greatest candidate stem in natural
     *                    string order, {@code "min"} to pick the smallest
     */
    public TRMorphStemFilter(TokenStream input, String lookup_fst, String aggregation) {
        super(input);
        this.lookupFst = lookup_fst;
        this.aggregation = aggregation;
    }

    /**
     * Advances to the next token and replaces its term with the computed stem.
     * Keyword tokens are passed through untouched.
     *
     * @return {@code false} when the upstream stream is exhausted, {@code true} otherwise
     * @throws IOException if the upstream stream or the external process fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        if (keywordAttribute.isKeyword()) {
            return true;
        }

        // Copied from org.apache.lucene.analysis.br.BrazilianStemFilter#incrementToken.
        final String term = termAttribute.toString();
        final String s = stem(term);
        // If not stemmed, don't waste the time adjusting the token.
        if ((s != null) && !s.equals(term)) {
            termAttribute.setEmpty().append(s);
        }
        return true;
    }

    /**
     * Reduces the analyzer output for {@code word} to a single stem.
     *
     * <p>Each output line is split on whitespace; the second field is taken as the
     * analysis, and the stem is the part of that field before the first {@code '<'}.
     *
     * @param word the term to stem
     * @return the selected stem; {@code word} itself when an analysis carries the
     *         {@code "+?"} marker; or {@code null} when no stem could be extracted
     * @throws IOException      if running the external process fails
     * @throws RuntimeException if several stems remain and the aggregation strategy is unknown
     */
    private String stem(String word) throws IOException {
        List<String> parses = parse(word);
        TreeSet<String> set = new TreeSet<>();

        for (String parse : parses) {
            String[] parts = parse.split("\\s+");
            // The analysis lives in the second field; a line holding only the word
            // has nothing to extract and must not be indexed at position 1.
            if (parts.length < 2) {
                log.warn("unexpected line {}", parse);
                continue;
            }

            String stem = parts[1].trim();
            int i = stem.indexOf("<");
            if (i == -1) {
                // NOTE(review): "+?" is presumably the analyzer's marker for an unknown word - confirm.
                if (stem.contains("+?")) {
                    return word;
                }
                log.warn("unexpected stem {}", stem);
                continue;
            }
            set.add(stem.substring(0, i));
        }

        if (set.size() == 1) {
            return set.first();
        }

        // With an empty set pollLast()/pollFirst() return null, which the caller
        // treats as "leave the token unchanged".
        switch (aggregation) {
            case "max":
                return set.pollLast();
            case "min":
                return set.pollFirst();
            default:
                throw new RuntimeException("unknown strategy " + aggregation);
        }
    }

    /**
     * Runs the external analyzer for {@code word} and collects its output.
     *
     * <p>The word is written to the process's standard input as UTF-8 by a
     * {@link Piper} on a separate thread, and standard output is decoded as UTF-8.
     * Blank lines are dropped; lines that do not start with {@code word} are logged
     * and skipped.
     *
     * @param word the term to analyze
     * @return the trimmed, non-empty output lines that start with {@code word}
     * @throws IOException if the process cannot be started or its output cannot be read
     */
    private List<String> parse(String word) throws IOException {
        List<String> list = new ArrayList<>();
        Process process = Runtime.getRuntime().exec(lookupFst);
        try {
            Piper pipe = new Piper(
                    new ByteArrayInputStream(word.getBytes(StandardCharsets.UTF_8)),
                    process.getOutputStream());
            new Thread(pipe).start();

            // Drain stdout BEFORE waiting for exit: waiting first can deadlock once the
            // child has filled the pipe buffer and blocks on write.
            try (BufferedReader r = new BufferedReader(
                    new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
                String s;
                while ((s = r.readLine()) != null) {
                    s = s.trim();
                    if (s.length() == 0) {
                        continue;
                    }
                    if (s.startsWith(word)) {
                        list.add(s);
                    } else {
                        log.warn("unexpected line from word {} {}", word, s);
                    }
                }
            }

            try {
                process.waitFor();
            } catch (InterruptedException ie) {
                // Output was already read to EOF, so the result is complete; just
                // restore the interrupt flag for the caller.
                Thread.currentThread().interrupt();
            }
        } finally {
            // Make sure the child does not outlive this call, also on failure paths.
            process.destroy();
        }
        return list;
    }
}