// -*- c-basic-offset: 2 -*- package org.apache.lucene.analysis.morfologik; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.*; import morfologik.stemming.*; import morfologik.stemming.PolishStemmer.DICTIONARY; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.util.*; /** * {@link TokenFilter} using Morfologik library. * * MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic * annotations for produced lemmas. See the Morfologik documentation for details. * * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a> */ public class MorfologikFilter extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private final CharsRef scratch = new CharsRef(0); private final CharacterUtils charUtils; private State current; private final TokenStream input; private final IStemmer stemmer; private List<WordData> lemmaList; private final ArrayList<StringBuilder> tagsList = new ArrayList<StringBuilder>(); private int lemmaListIndex; /** * Builds a filter for given PolishStemmer.DICTIONARY enum. * * @param in input token stream * @param dict PolishStemmer.DICTIONARY enum * @param version Lucene version compatibility for lowercasing. */ public MorfologikFilter(final TokenStream in, final DICTIONARY dict, final Version version) { super(in); this.input = in; this.stemmer = new PolishStemmer(dict); this.charUtils = CharacterUtils.getInstance(version); this.lemmaList = Collections.emptyList(); } private void popNextLemma() { // Collect all tags for the next unique lemma. CharSequence currentStem; int tags = 0; do { final WordData lemma = lemmaList.get(lemmaListIndex++); currentStem = lemma.getStem(); final CharSequence tag = lemma.getTag(); if (tag != null) { if (tagsList.size() <= tags) { tagsList.add(new StringBuilder()); } final StringBuilder buffer = tagsList.get(tags++); buffer.setLength(0); buffer.append(lemma.getTag()); } } while (lemmaListIndex < lemmaList.size() && equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem)); // Set the lemma's base form and tags as attributes. termAtt.setEmpty().append(currentStem); tagsAtt.setTags(tagsList.subList(0, tags)); } /** * Compare two char sequences for equality. Assumes non-null arguments. */ private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) { int len1 = s1.length(); int len2 = s2.length(); if (len1 != len2) return false; for (int i = len1; --i >= 0;) { if (s1.charAt(i) != s2.charAt(i)) { return false; } } return true; } /** * Lookup a given surface form of a token and update * {@link #lemmaList} and {@link #lemmaListIndex} accordingly. */ private boolean lookupSurfaceForm(CharSequence token) { lemmaList = this.stemmer.lookup(token); lemmaListIndex = 0; return lemmaList.size() > 0; } /** Retrieves the next token (possibly from the list of lemmas). */ @Override public final boolean incrementToken() throws IOException { if (lemmaListIndex < lemmaList.size()) { restoreState(current); posIncrAtt.setPositionIncrement(0); popNextLemma(); return true; } else if (this.input.incrementToken()) { if (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt))) { current = captureState(); popNextLemma(); } else { tagsAtt.clear(); } return true; } else { return false; } } /** * Convert to lowercase in-place. */ private CharSequence toLowercase(CharSequence chs) { final int length = scratch.length = chs.length(); scratch.grow(length); char buffer[] = scratch.chars; for (int i = 0; i < length;) { i += Character.toChars( Character.toLowerCase(charUtils.codePointAt(chs, i)), buffer, i); } return scratch; } /** Resets stems accumulator and hands over to superclass. */ @Override public void reset() throws IOException { lemmaListIndex = 0; lemmaList = Collections.emptyList(); tagsList.clear(); super.reset(); } }