/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.apache.lucene.analysis.core;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
*
* @author jluker
*
* Checks if the token is an acronym (is present in the HashMap) and
* adds the full name after the term (depends on the value of emitBoth)
*/
public final class AcronymTokenFilter extends TokenFilter {
public static final int ACRONYM_MIN_LENGTH = 2;
public static final float ACRONYM_UPPER_MIN_RATIO = 0.74999f;
// controls index-time vs. query-time behavior
private boolean emitBoth;
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private State currentState = null;
private String prefix;
private String tokenType;
public AcronymTokenFilter(TokenStream in, boolean emitBoth, String prefix, String tokenType) {
super(in);
this.emitBoth = emitBoth;
this.prefix = prefix;
this.tokenType = tokenType;
}
@Override
public boolean incrementToken() throws IOException {
// check if we got an acronym
if (this.currentState != null) {
restoreState(currentState);
if (this.tokenType != null) {
typeAtt.setType(this.tokenType);
}
posIncrAtt.setPositionIncrement(0);
currentState=null;
return true;
}
if (!input.incrementToken()) {
return false;
}
String origTerm = termAtt.toString();
if (termIsAcronym(origTerm)) {
if (prefix != null) {
termAtt.setEmpty().append(prefix+origTerm);
}
if (!emitBoth) {
if (this.tokenType != null) {
typeAtt.setType(this.tokenType);
}
return true;
}
currentState = captureState();
termAtt.setEmpty().append(origTerm);
}
return true;
}
/**
* Checks that the string is considered a valid acronoym. Usually all letters
* must be UPPERCASE (there is a minimum ration)
*
* @param term
* string to be checked
* @return true when the term has only UPPERCASE and digits
*/
public static boolean termIsAcronym(String term) {
if (term.length() < ACRONYM_MIN_LENGTH ) {
return false;
}
int u = 0;
int d = 0;
int l = term.length();
for (char c: term.toCharArray()) {
if (Character.isUpperCase(c)) {
u++;
}
else if (Character.isDigit(c)) {
d++;
}
}
if (d==l) {
return false;
}
if (u < ACRONYM_MIN_LENGTH)
return false;
if ( (((float) u+ ((float) d / 2.0)) / term.length()) < ACRONYM_UPPER_MIN_RATIO)
return false;
return true;
}
public void reset() throws IOException {
super.reset();
currentState=null;
}
}