/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.morfologik;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRefBuilder;

import morfologik.stemming.Dictionary;
import morfologik.stemming.DictionaryLookup;
import morfologik.stemming.IStemmer;
import morfologik.stemming.WordData;
import morfologik.stemming.polish.PolishStemmer;

/**
 * {@link TokenFilter} that uses the Morfologik library to replace each input token with its
 * lemma(s) and to expose the matching morphosyntactic (POS) tags. The single-argument
 * constructor uses the Polish dictionary; a different dictionary can be supplied explicitly.
 *
 * <p>For a token with several lemmas, the first lemma is emitted in place of the token and
 * every further lemma is emitted as an extra token with a position increment of zero. Tokens
 * flagged through {@link KeywordAttribute}, and tokens for which no lemma is found, are passed
 * on with their term untouched and with the tags attribute cleared.</p>
 *
 * <p>The filter adds a {@link MorphosyntacticTagsAttribute}, which carries the morphosyntactic
 * annotations of the lemma currently being emitted. See the Morfologik documentation for
 * details.</p>
 *
 * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
 */
public class MorfologikFilter extends TokenFilter {

  /**
   * Splits a raw tag string into individual tags; the separators are {@code +} and {@code |}.
   */
  private static final Pattern LEMMA_SPLITTER = Pattern.compile("\\+|\\|");

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /** Reusable buffer that receives the lowercased copy of a term. */
  private final CharsRefBuilder scratch = new CharsRefBuilder();

  /** Attribute state captured for the token whose lemmas are currently being emitted. */
  private State current;

  /** The wrapped token stream, exactly as handed to the constructor. */
  private final TokenStream input;

  /** Stemmer used for all dictionary lookups. */
  private final IStemmer stemmer;

  /** Lemmas found for the most recently looked-up surface form. */
  private List<WordData> lemmaList;

  /** Pool of reusable tag buffers; a prefix of it is published through {@link #tagsAtt}. */
  private final ArrayList<StringBuilder> tagsList = new ArrayList<>();

  /** Index of the next entry of {@link #lemmaList} to emit. */
  private int lemmaListIndex;

  /**
   * Creates a filter backed by the default (Polish) dictionary.
   *
   * @param in input token stream.
   */
  public MorfologikFilter(final TokenStream in) {
    this(in, new PolishStemmer().getDictionary());
  }

  /**
   * Creates a filter backed by the given dictionary.
   *
   * @param in input token stream.
   * @param dict dictionary to use for stemming.
   */
  public MorfologikFilter(final TokenStream in, final Dictionary dict) {
    super(in);
    this.input = in;
    this.stemmer = new DictionaryLookup(dict);
    this.lemmaList = Collections.emptyList();
  }

  /**
   * Emits the next pending lemma: writes its stem into the term attribute, publishes its
   * tags (one buffer per tag) and advances {@link #lemmaListIndex}.
   */
  private void popNextLemma() {
    final WordData lemma = lemmaList.get(lemmaListIndex++);
    termAtt.setEmpty().append(lemma.getStem());

    final CharSequence rawTag = lemma.getTag();
    if (rawTag == null) {
      // No tag information for this lemma: publish an empty tag list.
      tagsAtt.setTags(Collections.<StringBuilder> emptyList());
      return;
    }

    final String[] parts = LEMMA_SPLITTER.split(rawTag.toString());

    // Make sure the pool holds one buffer per tag before filling them in.
    while (tagsList.size() < parts.length) {
      tagsList.add(new StringBuilder());
    }
    for (int idx = 0; idx < parts.length; idx++) {
      final StringBuilder slot = tagsList.get(idx);
      slot.setLength(0);
      slot.append(parts[idx]);
    }

    // Only the first parts.length buffers belong to this lemma.
    tagsAtt.setTags(tagsList.subList(0, parts.length));
  }

  /**
   * Looks up the given surface form of a token, storing the outcome in {@link #lemmaList}
   * and rewinding {@link #lemmaListIndex} to the start of that list.
   *
   * @param token surface form to look up.
   * @return {@code true} if at least one lemma was found.
   */
  private boolean lookupSurfaceForm(CharSequence token) {
    lemmaList = this.stemmer.lookup(token);
    lemmaListIndex = 0;
    return !lemmaList.isEmpty();
  }

  /**
   * Advances to the next token, which is either a lemma still pending for the previous input
   * token or a fresh token pulled from the wrapped stream.
   *
   * @return {@code true} if a token was produced, {@code false} once the input is exhausted.
   * @throws IOException propagated from the wrapped stream.
   */
  @Override
  public final boolean incrementToken() throws IOException {
    // Lemmas left over from the previous input token are stacked at the same position.
    if (lemmaListIndex < lemmaList.size()) {
      restoreState(current);
      posIncrAtt.setPositionIncrement(0);
      popNextLemma();
      return true;
    }

    if (!this.input.incrementToken()) {
      return false;
    }

    // Keyword tokens are never looked up; other tokens are tried as-is and then lowercased.
    final boolean hasLemmas = !keywordAttr.isKeyword()
        && (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)));

    if (hasLemmas) {
      // Snapshot the attributes before the term is overwritten by the first lemma.
      current = captureState();
      popNextLemma();
    } else {
      tagsAtt.clear();
    }
    return true;
  }

  /**
   * Produces a lowercased copy of the given character sequence in {@link #scratch}. The
   * argument itself is not modified.
   *
   * @param chs characters to lowercase.
   * @return the content of {@link #scratch} after lowercasing.
   */
  private CharSequence toLowercase(CharSequence chs) {
    final int length = chs.length();
    scratch.setLength(length);
    scratch.grow(length);

    final char[] out = scratch.chars();
    int pos = 0;
    while (pos < length) {
      final int lowered = Character.toLowerCase(Character.codePointAt(chs, pos));
      // The same index is used for reading and writing; it moves by the chars just written.
      pos += Character.toChars(lowered, out, pos);
    }
    return scratch.get();
  }

  /**
   * Discards any pending lemmas and pooled tag buffers, then resets the superclass.
   *
   * @throws IOException propagated from the superclass reset.
   */
  @Override
  public void reset() throws IOException {
    lemmaListIndex = 0;
    lemmaList = Collections.emptyList();
    tagsList.clear();
    super.reset();
  }
}