package org.gbif.checklistbank.lucene; import org.gbif.checklistbank.utils.SciNameNormalizer; import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; /** * Lucene filter that uses the SciNameNormalizer util to normalize scientific names. * By default it strongly normalizes all name tokens as generated by the Standard- or WhitespaceTokenizer. * * When using it with tokens containing whitespace and representing entire scientific names as generated by the KeywordTokenizer * one should set fullNameTokens=true to keep monomials & genera mostly untouched. */ public class ScientificNameNormalizerFilter extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final boolean fullNameTokens; private final boolean stemming; /** * Construct a token stream filtering the given input. */ public ScientificNameNormalizerFilter(TokenStream input) { this(input, false, true); } /** * @param fullNameTokens if true the name normalization is done on the entire token. If false just on the epithets, leaving the genus part untouched * @param stemming if true gender stemming of tokens is done */ public ScientificNameNormalizerFilter(TokenStream input, boolean fullNameTokens, boolean stemming) { super(input); this.fullNameTokens = fullNameTokens; this.stemming = stemming; } @Override public final boolean incrementToken() throws IOException { if (!input.incrementToken()) return false; String term = termAtt.toString(); if (SciNameNormalizer.hasContent(term)) { String normed = fullNameTokens ? SciNameNormalizer.normalize(term, stemming) : SciNameNormalizer.normalizeAll(term, stemming); termAtt.copyBuffer(normed.toCharArray(), 0, normed.length()); } return true; } }