// TransliterationCleaningFilter.java example
//
// Explorer
// step-master
package com.tyndalehouse.step.core.data.filters;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

/**
 * Token filter that cleans up transliteration terms. For every token pulled from the wrapped stream it:
 * <ul>
 * <li>collapses a run of identical consecutive characters down to a single character, and</li>
 * <li>drops the characters {@code '-'}, {@code '*'} and {@code '\''}.</li>
 * </ul>
 * The repeat check compares each character against the previous <em>input</em> character, including one
 * that was itself dropped, so {@code "a-a"} becomes {@code "aa"} whereas {@code "aa"} becomes {@code "a"}.
 * 
 * @author chrisburrell
 * 
 */
public class TransliterationCleaningFilter extends TokenFilter {
    private final TermAttribute termAtt;

    /**
     * @param input the token stream
     */
    public TransliterationCleaningFilter(final TokenStream input) {
        super(input);
        this.termAtt = addAttribute(TermAttribute.class);
    }

    /**
     * Advances the wrapped stream by one token and cleans that token's term text in place.
     * 
     * @return {@code true} if a token was read (and cleaned), {@code false} if the wrapped stream is exhausted
     * @throws IOException if the wrapped stream fails to advance
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!this.input.incrementToken()) {
            return false;
        }

        final char[] buffer = this.termAtt.termBuffer();
        // only the first termLength() characters of the buffer are valid; the array itself may be larger
        final int length = this.termAtt.termLength();

        // Compact the term in place: the write index never overtakes the read index,
        // so no character is overwritten before it has been read.
        int cleanedLength = 0;
        // starts at NUL, so a NUL in first position is treated as a repeat and dropped
        char lastChar = 0x0;
        for (int i = 0; i < length; i++) {
            final char currentChar = buffer[i];

            // skip two characters in a row
            if (currentChar == lastChar) {
                continue;
            }
            lastChar = currentChar;

            // caters for the beta code as well
            if (isIgnored(currentChar)) {
                continue;
            }
            buffer[cleanedLength++] = currentChar;
        }

        // compare against the term length (not the buffer capacity) so untouched terms are left alone
        if (cleanedLength != length) {
            this.termAtt.setTermLength(cleanedLength);
        }
        return true;
    }

    /**
     * @param c the character to test
     * @return {@code true} if the character is one of those stripped from every term
     */
    private static boolean isIgnored(final char c) {
        return c == '-' || c == '*' || c == '\'';
    }
}