package org.apache.lucene.analysis.tr; import net.zemberek.erisim.Zemberek; import net.zemberek.islemler.KelimeKokFrekansKiyaslayici; import net.zemberek.islemler.cozumleme.CozumlemeSeviyesi; import net.zemberek.tr.yapi.TurkiyeTurkcesi; import net.zemberek.yapi.Kelime; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.util.AttributeSource; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Map; /** * Factory for {@link Zemberek2DeASCIIfyFilter}. */ public class Zemberek2DeASCIIfyFilterFactory extends TokenFilterFactory { private final Zemberek zemberek = new Zemberek(new TurkiyeTurkcesi()); static final String DEASCII_TOKEN_TYPE = "<DEASCII>"; public Zemberek2DeASCIIfyFilterFactory(Map<String, String> args) { super(args); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } } @Override public TokenStream create(TokenStream input) { return new Zemberek2DeASCIIfyFilter(input); } /** * DeASCIIfier based on <a href="https://code.google.com/p/zemberek">Zemberek2</a> * Modified from <a href="http://www.docjar.com/html/api/org/apache/lucene/wordnet/SynonymTokenFilter.java.html"> * org.apache.lucene.wordnet.SynonymTokenFilter</a> */ private final class Zemberek2DeASCIIfyFilter extends TokenFilter { private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private String[] stack = null; private int index = 0; private AttributeSource.State current = null; private int todo = 0; public Zemberek2DeASCIIfyFilter(TokenStream input) { super(input); } @Override public boolean incrementToken() throws IOException { while (todo > 0 && index < stack.length) { // pop from stack if (createToken(stack[index++], current)) { todo--; return true; } } if (!input.incrementToken()) return false; if (keywordAttribute.isKeyword()) return true; // stack = zemberek.asciidenTurkceye(termAttribute.toString()); Kelime[] kelimeler = zemberek.asciiToleransliCozumleyici().cozumle(termAttribute.toString(), CozumlemeSeviyesi.TUM_KOKLER); Arrays.sort(kelimeler, new KelimeKokFrekansKiyaslayici()); ArrayList<String> olusumlar = new ArrayList<>(kelimeler.length); for (Kelime kelime : kelimeler) { String olusum = kelime.icerikStr(); if (!olusumlar.contains(olusum)) olusumlar.add(olusum); } olusumlar.remove(termAttribute.toString()); stack = olusumlar.toArray(new String[olusumlar.size()]); index = 0; current = captureState(); todo = stack.length; return true; } private boolean createToken(String synonym, AttributeSource.State current) { restoreState(current); termAttribute.setEmpty().append(synonym); typeAtt.setType(DEASCII_TOKEN_TYPE); posIncrAtt.setPositionIncrement(0); return true; } @Override public void reset() throws IOException { super.reset(); stack = null; index = 0; current = null; todo = 0; } } public static void main(String[] args) throws IOException { StringReader reader = new StringReader("kus asisi ortaklar çekişme masali"); Map<String, String> map = new HashMap<>(); Zemberek2DeASCIIfyFilterFactory factory = new Zemberek2DeASCIIfyFilterFactory(map); WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer(); whitespaceTokenizer.setReader(reader); TokenStream stream = factory.create(whitespaceTokenizer); CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { String term = termAttribute.toString(); System.out.println(term); } stream.end(); reader.close(); } }