package org.xbib.elasticsearch.index.analysis.sortform;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
import java.util.regex.Pattern;
/**
 * Token filter that deletes delimiter-enclosed segments from every term it passes on.
 *
 * <p>For each token produced by the wrapped stream, the term text is run through a fixed
 * list of regular expressions, and every match is replaced with the empty string. Each
 * expression matches an opening delimiter, the shortest possible run of characters up to
 * the closing delimiter, and any whitespace directly before or after the delimited segment.
 * The delimiter pairs are:
 * <ul>
 *   <li>{@code <<} ... {@code >>}</li>
 *   <li>{@code <} ... {@code >}</li>
 *   <li>U+0098 ... U+009C</li>
 *   <li>U+02BE ... U+02BB</li>
 *   <li>U+00AC ... U+00AC</li>
 * </ul>
 *
 * <p>NOTE(review): these delimiters look like the "non-sorting" markers used in library
 * catalogue data (hence "sortform") - confirm against the data source.
 */
public class SortformTokenFilter extends TokenFilter {

    /**
     * Expressions applied, in array order, to each term. The double-angle pattern is listed
     * ahead of the single-angle one, so {@code <<...>>} segments are consumed first.
     */
    private static final Pattern[] STRIP_PATTERNS = {
        Pattern.compile("\\s*<<.*?>>\\s*"),
        Pattern.compile("\\s*<.*?>\\s*"),
        Pattern.compile("\\s*\u0098.*?\u009C\\s*"),
        Pattern.compile("\\s*\u02BE.*?\u02BB\\s*"),
        Pattern.compile("\\s*\u00AC.*?\u00AC\\s*")
    };

    /** Term attribute shared with the wrapped stream; read and rewritten for every token. */
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    /**
     * Creates a filter on top of the given token stream.
     *
     * @param input the stream whose terms are to be stripped
     */
    protected SortformTokenFilter(TokenStream input) {
        super(input);
    }

    /**
     * Advances the wrapped stream by one token and rewrites its term with all
     * delimiter-enclosed segments removed.
     *
     * <p>The token is always passed on when the wrapped stream delivers one, even if
     * stripping leaves the term empty.
     *
     * @return {@code true} if a token was produced, {@code false} once the wrapped stream
     *         is exhausted
     * @throws IOException if the wrapped stream fails while advancing
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (input.incrementToken()) {
            final String stripped = stripDelimitedSegments(termAtt.toString());
            termAtt.setEmpty().append(stripped);
            return true;
        }
        return false;
    }

    /**
     * Runs the text through every stripping pattern in turn; each pattern operates on the
     * output of the previous one.
     */
    private static String stripDelimitedSegments(String text) {
        String result = text;
        for (int i = 0; i < STRIP_PATTERNS.length; i++) {
            result = STRIP_PATTERNS[i].matcher(result).replaceAll("");
        }
        return result;
    }

    /**
     * Considers any {@code SortformTokenFilter} (or subclass) instance equal to this one,
     * irrespective of the wrapped stream or the current term.
     *
     * @param object the object to compare with
     * @return {@code true} if {@code object} is a {@code SortformTokenFilter}
     */
    @Override
    public boolean equals(Object object) {
        return object instanceof SortformTokenFilter;
    }

    /**
     * Returns a constant, matching {@link #equals(Object)}, which treats all instances
     * as equal.
     *
     * @return always {@code 0}
     */
    @Override
    public int hashCode() {
        return 0;
    }
}