package org.apache.lucene.analysis.tr;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import net.zemberek.erisim.Zemberek;
import net.zemberek.islemler.KelimeKokFrekansKiyaslayici;
import net.zemberek.islemler.cozumleme.CozumlemeSeviyesi;
import net.zemberek.tr.yapi.TurkiyeTurkcesi;
import net.zemberek.yapi.Kelime;
import net.zemberek.yapi.Kok;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
/**
 * Factory for {@link Zemberek2StemFilter}.
 * <p>
 * Accepts a single optional argument, {@code strategy} (default {@code maxLength}), that decides
 * which analysis supplies the stem when Zemberek returns more than one analysis for a token:
 * <ul>
 * <li>{@code first}: the root of the first analysis, in the order Zemberek returned them</li>
 * <li>{@code frequency}: the root of the first analysis after sorting with
 * {@link KelimeKokFrekansKiyaslayici}</li>
 * <li>{@code maxLength} / {@code minLength}: the analysis with the longest / shortest root</li>
 * <li>{@code maxMorpheme} / {@code minMorpheme}: the analysis whose {@code ekler()} list is
 * largest / smallest</li>
 * </ul>
 */
public class Zemberek2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {

    private static final RootLengthComparator ROOT_LENGTH_COMPARATOR = new RootLengthComparator();
    private static final RootMorphemeComparator ROOT_MORPHEME_COMPARATOR = new RootMorphemeComparator();
    private static final KelimeKokFrekansKiyaslayici FREQUENCY_COMPARATOR = new KelimeKokFrekansKiyaslayici();

    private final Zemberek zemberek = new Zemberek(new TurkiyeTurkcesi());
    private final String strategy;

    /**
     * Creates a new factory.
     *
     * @param args factory arguments; only {@code strategy} is recognized
     * @throws IllegalArgumentException if unknown arguments remain or the strategy is not supported
     */
    public Zemberek2StemFilterFactory(Map<String, String> args) {
        super(args);
        strategy = get(args, "strategy", "maxLength");
        if (!args.isEmpty()) {
            throw new IllegalArgumentException("Unknown parameters: " + args);
        }
        // Fail fast on misconfiguration instead of waiting for the first ambiguous token.
        if (!isKnownStrategy(strategy)) {
            throw new IllegalArgumentException("unknown strategy " + strategy);
        }
    }

    /**
     * Tells whether {@code name} is one of the strategies handled by the stem filter.
     */
    private static boolean isKnownStrategy(String name) {
        if (name == null) {
            return false;
        }
        switch (name) {
            case "first":
            case "frequency":
            case "maxLength":
            case "minLength":
            case "maxMorpheme":
            case "minMorpheme":
                return true;
            default:
                return false;
        }
    }

    @Override
    public void inform(ResourceLoader loader) throws IOException {
        // Nothing to load: this factory does not read any external resources.
    }

    @Override
    public TokenStream create(TokenStream input) {
        return new Zemberek2StemFilter(input);
    }

    /**
     * Stemmer based on <a href="https://code.google.com/p/zemberek">Zemberek2</a>
     */
    private final class Zemberek2StemFilter extends TokenFilter {

        private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
        private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);

        public Zemberek2StemFilter(TokenStream input) {
            super(input);
        }

        /**
         * Selects the root of one analysis according to {@code aggregation}.
         * The array may be reordered in place by the sort.
         *
         * @param cozumler    non-empty array of analyses for a single token
         * @param aggregation name of the selection strategy
         * @return the content of the selected root
         * @throws IllegalArgumentException if {@code aggregation} is not a supported strategy
         */
        private String stem(Kelime[] cozumler, String aggregation) {
            // With a single analysis every strategy yields the same answer; skip sorting.
            if ("first".equals(aggregation) || cozumler.length == 1) {
                return cozumler[0].kok().icerik();
            }
            switch (aggregation) {
                case "frequency":
                    Arrays.sort(cozumler, FREQUENCY_COMPARATOR);
                    return cozumler[0].kok().icerik();
                case "maxLength":
                    // Comparator sorts descending by root length, so the longest root is first.
                    Arrays.sort(cozumler, ROOT_LENGTH_COMPARATOR);
                    return cozumler[0].kok().icerik();
                case "minLength":
                    Arrays.sort(cozumler, ROOT_LENGTH_COMPARATOR);
                    return cozumler[cozumler.length - 1].kok().icerik();
                case "maxMorpheme":
                    // Comparator sorts descending by ekler().size().
                    Arrays.sort(cozumler, ROOT_MORPHEME_COMPARATOR);
                    return cozumler[0].kok().icerik();
                case "minMorpheme":
                    Arrays.sort(cozumler, ROOT_MORPHEME_COMPARATOR);
                    return cozumler[cozumler.length - 1].kok().icerik();
                default:
                    throw new IllegalArgumentException("unknown strategy " + aggregation);
            }
        }

        @Override
        public boolean incrementToken() throws IOException {
            if (!input.incrementToken()) {
                return false;
            }
            // Tokens flagged as keywords are passed through unchanged.
            if (keywordAttribute.isKeyword()) {
                return true;
            }
            final String term = termAttribute.toString();
            final Kelime[] cozumler = zemberek.kelimeCozumle(term, CozumlemeSeviyesi.TUM_KOKLER);
            // No analysis available: leave the token as it is.
            if (cozumler == null || cozumler.length == 0) {
                return true;
            }
            final String s = stem(cozumler, strategy);
            // If not stemmed, don't waste the time adjusting the token.
            if ((s != null) && !s.equals(term)) {
                termAttribute.setEmpty().append(s);
            }
            return true;
        }
    }

    /**
     * Orders analyses by descending root length. Null elements are placed after
     * non-null ones and compare equal to each other, so the ordering is consistent.
     */
    private static final class RootLengthComparator implements Comparator<Kelime> {
        @Override
        public int compare(Kelime o1, Kelime o2) {
            if (o1 == null || o2 == null) {
                return compareNullness(o1, o2);
            }
            final Kok k1 = o1.kok();
            final Kok k2 = o2.kok();
            return Integer.compare(k2.icerik().length(), k1.icerik().length());
        }
    }

    /**
     * Orders analyses by descending {@code ekler().size()}. Null elements are placed after
     * non-null ones and compare equal to each other, so the ordering is consistent.
     */
    private static final class RootMorphemeComparator implements Comparator<Kelime> {
        @Override
        public int compare(Kelime o1, Kelime o2) {
            if (o1 == null || o2 == null) {
                return compareNullness(o1, o2);
            }
            return Integer.compare(o2.ekler().size(), o1.ekler().size());
        }
    }

    /**
     * Symmetric ordering for the case where at least one argument is null:
     * two nulls are equal, and a null sorts after a non-null value.
     */
    private static int compareNullness(Object o1, Object o2) {
        if (o1 == null) {
            return (o2 == null) ? 0 : 1;
        }
        return (o2 == null) ? -1 : 0;
    }

    /**
     * Small manual demo: stems a fixed Turkish phrase with the {@code frequency}
     * strategy and prints each resulting term to standard output.
     */
    public static void main(String[] args) throws IOException {
        Map<String, String> map = new HashMap<>();
        map.put("strategy", "frequency");
        Zemberek2StemFilterFactory factory = new Zemberek2StemFilterFactory(map);
        WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
        // try-with-resources guarantees the stream and reader are closed even on failure.
        try (StringReader reader = new StringReader("elması utansın ortaklar çekişme ile");
             TokenStream stream = factory.create(whitespaceTokenizer)) {
            whitespaceTokenizer.setReader(reader);
            CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String term = termAttribute.toString();
                System.out.println(term);
            }
            stream.end();
        }
    }
}