// MorfologikFilter.java example
//
// Explorer
// solr-analytics-master
// - lucene
// - solr
// -*- c-basic-offset: 2 -*-
package org.apache.lucene.analysis.morfologik;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.*;

import morfologik.stemming.*;
import morfologik.stemming.PolishStemmer.DICTIONARY;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.*;

/**
 * {@link TokenFilter} using Morfologik library.
 *
 * <p>Each token read from the wrapped stream is looked up in the stemmer, first verbatim and,
 * failing that, in lowercased form. On a hit the term text is replaced with the lemma's stem and
 * the {@link MorphosyntacticTagsAttribute} receives the tags gathered for that stem; consecutive
 * dictionary entries sharing a stem are folded into a single token. Any further lemmas of the
 * same surface form are emitted afterwards with a position increment of zero. Tokens for which
 * the lookup returns nothing keep their term text and have their tags cleared.
 *
 * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
 */
public class MorfologikFilter extends TokenFilter {

  // Attribute registration order is kept as termAtt, tagsAtt, posIncrAtt.
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

  /** Reusable output buffer for {@link #toLowercase(CharSequence)}. */
  private final CharsRef scratch = new CharsRef(0);
  private final CharacterUtils charUtils;

  /** Attribute state captured for the surface form whose lemmas are currently being emitted. */
  private State current;
  private final TokenStream input;
  private final IStemmer stemmer;

  /** Result of the most recent stemmer lookup. */
  private List<WordData> lemmaList;
  /** Pool of tag buffers; only a prefix of it is handed to {@link #tagsAtt} at a time. */
  private final List<StringBuilder> tagsList = new ArrayList<StringBuilder>();

  /** Index of the next entry of {@link #lemmaList} that has not been emitted yet. */
  private int lemmaListIndex;

  /**
   * Creates a filter backed by a {@link PolishStemmer} for the given dictionary.
   *
   * @param in      input token stream
   * @param dict    PolishStemmer.DICTIONARY enum selecting the dictionary
   * @param version Lucene version used to obtain the {@link CharacterUtils} instance that
   *                reads code points while lowercasing
   */
  public MorfologikFilter(final TokenStream in, final DICTIONARY dict, final Version version) {
    super(in);
    this.input = in;
    this.stemmer = new PolishStemmer(dict);
    this.charUtils = CharacterUtils.getInstance(version);
    this.lemmaList = Collections.emptyList();
  }

  /**
   * Consumes the next run of {@link #lemmaList} entries that share one stem, then publishes
   * that stem as the term text and the non-null tags of the run as the tags attribute.
   * Callers must ensure {@link #lemmaListIndex} is within bounds.
   */
  private void popNextLemma() {
    int tagCount = 0;
    CharSequence stem;
    for (;;) {
      final WordData entry = lemmaList.get(lemmaListIndex++);
      stem = entry.getStem();
      final CharSequence tag = entry.getTag();
      if (tag != null) {
        // Grow the buffer pool lazily; existing buffers are recycled across tokens.
        if (tagCount >= tagsList.size()) {
          tagsList.add(new StringBuilder());
        }

        final StringBuilder slot = tagsList.get(tagCount++);
        slot.setLength(0);
        slot.append(entry.getTag());
      }

      // Stop when the list is exhausted or the following entry belongs to another stem.
      if (lemmaListIndex >= lemmaList.size()) {
        break;
      }
      if (!equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), stem)) {
        break;
      }
    }

    termAtt.setEmpty().append(stem);
    tagsAtt.setTags(tagsList.subList(0, tagCount));
  }

  /**
   * Tells whether two character sequences have the same length and the same characters.
   * Both arguments must be non-null.
   */
  private static boolean equalCharSequences(CharSequence s1, CharSequence s2) {
    final int n = s1.length();
    if (n != s2.length()) {
      return false;
    }
    int i = 0;
    while (i < n && s1.charAt(i) == s2.charAt(i)) {
      i++;
    }
    return i == n;
  }

  /**
   * Runs the stemmer on the given surface form, storing the result in {@link #lemmaList}
   * and rewinding {@link #lemmaListIndex} to its start.
   *
   * @return {@code true} if the lookup produced at least one entry
   */
  private boolean lookupSurfaceForm(CharSequence token) {
    lemmaList = this.stemmer.lookup(token);
    lemmaListIndex = 0;
    return !lemmaList.isEmpty();
  }

  /**
   * Advances to the next token: either a pending lemma of the previous surface form or the
   * next token pulled from the input.
   */
  @Override
  public final boolean incrementToken() throws IOException {
    // Lemmas of the previous surface form are still queued: stack the next one on top of it.
    if (lemmaListIndex < lemmaList.size()) {
      restoreState(current);
      posIncrAtt.setPositionIncrement(0);
      popNextLemma();
      return true;
    }

    if (!this.input.incrementToken()) {
      return false;
    }

    // Try the token as written; only if that yields nothing, try its lowercased form.
    final boolean found = lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt));
    if (found) {
      // Capture before the term is overwritten so later lemmas restore the original token.
      current = captureState();
      popNextLemma();
    } else {
      tagsAtt.clear();
    }
    return true;
  }

  /**
   * Writes the lowercased form of {@code chs} into the shared {@link #scratch} buffer and
   * returns that buffer. The argument itself is not modified; the result is only valid until
   * the next call.
   */
  private CharSequence toLowercase(CharSequence chs) {
    final int len = chs.length();
    scratch.length = len;
    scratch.grow(len);

    final char[] out = scratch.chars;
    int pos = 0;
    while (pos < len) {
      // Work on whole code points; toChars reports how many chars were written.
      final int lowered = Character.toLowerCase(charUtils.codePointAt(chs, pos));
      pos += Character.toChars(lowered, out, pos);
    }

    return scratch;
  }

  /** Discards pending lemmas and pooled tag buffers, then resets the superclass. */
  @Override
  public void reset() throws IOException {
    lemmaList = Collections.emptyList();
    lemmaListIndex = 0;
    tagsList.clear();
    super.reset();
  }
}