package com.tyndalehouse.step.core.data.filters;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

/**
 * Cleans up transliterations by removing any extra character.
 * <p>
 * For every token produced by the wrapped stream, this filter:
 * <ul>
 * <li>collapses runs of the same character into a single occurrence, and</li>
 * <li>drops the punctuation characters listed in {@link #isBetaSpecialChar(char)}.</li>
 * </ul>
 * The term is rewritten in place in the {@link TermAttribute} buffer.
 *
 * @author chrisburrell
 *
 */
public class BetaTransliterationCleaningFilter extends TokenFilter {
    private final TermAttribute termAtt;

    /**
     * @param input the token stream
     */
    public BetaTransliterationCleaningFilter(final TokenStream input) {
        super(input);
        this.termAtt = addAttribute(TermAttribute.class);
    }

    /**
     * Advances the wrapped stream and cleans the term of the token it produced.
     *
     * @return true if a token is available, false once the wrapped stream is exhausted
     * @throws IOException if the wrapped stream fails to advance
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!this.input.incrementToken()) {
            return false;
        }

        final char[] buffer = this.termAtt.termBuffer();

        // the buffer may be larger than the term, so only termLength() tells us how much is valid
        final int originalLength = this.termAtt.termLength();
        final int cleanedLength = clean(buffer, originalLength);

        // only touch the attribute when characters were actually removed
        if (cleanedLength != originalLength) {
            this.termAtt.setTermLength(cleanedLength);
        }
        return true;
    }

    /**
     * Compacts the first {@code length} characters of {@code buffer} in place, skipping any character that
     * equals the character immediately before it in the original term, and any character for which
     * {@link #isBetaSpecialChar(char)} returns true.
     * <p>
     * Repeats are detected against the previous <em>original</em> character, whether or not that character
     * was kept. The comparison starts against the NUL character, so a leading NUL is treated as a repeat
     * and dropped.
     *
     * @param buffer the characters of the term, modified in place
     * @param length the number of valid characters in {@code buffer}
     * @return the number of characters retained at the start of {@code buffer}
     */
    private static int clean(final char[] buffer, final int length) {
        int writePos = 0;
        char previous = 0x0;

        // writePos never overtakes readPos, so every character is read before it can be overwritten
        for (int readPos = 0; readPos < length; readPos++) {
            final char current = buffer[readPos];

            // skip two characters in a row
            if (current == previous) {
                continue;
            }
            previous = current;

            if (!isBetaSpecialChar(current)) {
                buffer[writePos++] = current;
            }
        }
        return writePos;
    }

    /**
     * Tells whether a character is one of the punctuation marks that are stripped from transliterations
     * (this caters for the beta code as well).
     *
     * @param c the character to examine
     * @return true if the character should be removed from the term
     */
    private static boolean isBetaSpecialChar(final char c) {
        switch (c) {
            case '-':
            case '\'':
            case '/':
            case '\\':
            case ')':
            case '(':
            case '=':
            case '*':
            case '+':
            case '|':
            case '&':
                return true;
            default:
                return false;
        }
    }
}