package folioxml.lucene.analysis.folio;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.IOException;

public abstract class LookAroundCharTokenizer extends Tokenizer {

    public LookAroundCharTokenizer() {
        super();
    }

    /**
     * Returns true iff a codepoint should be included in a token. This tokenizer
     * emits tokens from adjacent runs of codepoints that satisfy this predicate.
     * Codepoints for which this returns false define token boundaries and are not
     * included in tokens.
     * <p>
     * {@code previous} and {@code next} are -1 at the beginning and end of the
     * stream, respectively.
     */
    protected abstract boolean isTokenChar(int previous, int c, int next);

    /**
     * Called on each token character to normalize it before it is added to the
     * token. Subclasses may use this to, e.g., lowercase characters; return
     * {@code c} unchanged to perform no normalization.
     */
    protected abstract int normalize(int c);

    private static final int MAX_WORD_LEN = 1024;
    private static final int IO_BUFFER_SIZE = 4096;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    private int offset = 0, finalOffset = 0;
    private int cPrev = -1;
    private int c = -1;
    private int cNext = -1;

    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();
        int length = 0;
        int start = -1; // Set to the index of the first valid token char we find.
        char[] buffer = termAtt.buffer(); // Destination buffer.
        while (true) {
            // Advance the three-character window. No support for surrogates here!
            cPrev = c;
            c = cNext;
            cNext = input.read();
            offset++;
            if (length > 0 && c == -1) break; // End of input: emit whatever is buffered for this token.
            if (offset > 1 && c == -1) return false; // End of input with nothing buffered: no more tokens.
            if (offset > 1 && isTokenChar(cPrev, c, cNext)) { // It's a token char.
                if (length == 0) { // Start of token.
                    assert start == -1;
                    start = offset - 2;
                } else if (length >= buffer.length - 1) { // A supplementary character could run out of bounds,
                    buffer = termAtt.resizeBuffer(2 + length); // so make sure one fits in the buffer.
                }
                length += Character.toChars(normalize(c), buffer, length); // Buffer it, normalized.
                if (length >= MAX_WORD_LEN) break; // Buffer is full; check >= because a surrogate pair could skip past ==.
            } else if (length > 0) { // At a non-token char with buffered chars:
                break; // return them.
            }
        }
        termAtt.setLength(length);
        assert start != -1;
        offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length));
        return true;
    }

    @Override
    public void end() throws IOException {
        super.end();
        // Set the final offset.
        offsetAtt.setOffset(finalOffset, finalOffset);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        offset = 0;
        finalOffset = 0;
        cPrev = -1;
        c = -1;
        cNext = -1;
    }
}
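/*
 * A minimal sketch of a concrete subclass, for illustration only. This
 * hypothetical WhitespaceLookAroundTokenizer is not part of folioxml: it
 * treats every non-whitespace character as a token character (ignoring the
 * look-around context) and lowercases each character, roughly mimicking a
 * lowercasing whitespace tokenizer. A real subclass would typically use
 * previous/next to make context-sensitive decisions, e.g. keeping a period
 * only when it sits between two token characters.
 */
class WhitespaceLookAroundTokenizer extends LookAroundCharTokenizer {

    @Override
    protected boolean isTokenChar(int previous, int c, int next) {
        // The previous/next context is available but unused in this sketch.
        return !Character.isWhitespace(c);
    }

    @Override
    protected int normalize(int c) {
        // Lowercase every code point before it is buffered.
        return Character.toLowerCase(c);
    }
}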