ICUTokenizer.java example

Explorer

elasticsearch-analysis-german-master
- src
  - main
    - java
      - org
        xbib
        elasticsearch
        index
        analysis
        baseform
        BaseformTokenFilter.java
        BaseformTokenFilterFactory.java
        ConstantArcSizeFSA.java
        Dictionary.java
        FSA.java
        FSABuilder.java
        FSAFinalStatesIterator.java
        FSAFlags.java
        FSATraversal.java
        MatchResult.java
        StateVisitor.java
        combo
        BufferedReaderUnwrapper.java
        CachingTokenStream.java
        CharArrayReaderCloner.java
        ComboAnalysisBinderProcessor.java
        ComboAnalyzer.java
        ComboAnalyzerProvider.java
        ComboAnalyzerWrapper.java
        ComboTokenStream.java
        FilterReaderUnwrapper.java
        PositionedTokenStream.java
        ReaderCloneFactory.java
        ReaderClonerDefaultImpl.java
        ReusableStringReader.java
        ReusableStringReaderCloner.java
        ReusableTokenStreamComponents.java
        StringReaderCloner.java
        concat
        ConcatTokenFilter.java
        ConcatTokenFilterFactory.java
        decompound
        CompactPatriciaTrie.java
        DecompoundTokenFilter.java
        DecompoundTokenFilterFactory.java
        Decompounder.java
        Node.java
        german
        GermanAnalysisBinderProcessor.java
        GermanNormalizationFilterFactory.java
        hyphen
        HyphenAnalyzer.java
        HyphenAnalyzerProvider.java
        HyphenTokenFilter.java
        HyphenTokenFilterFactory.java
        HyphenTokenizer.java
        HyphenTokenizerFactory.java
        icu
        IcuAnalysisBinderProcessor.java
        IcuCollationKeyAnalyzer.java
        IcuCollationKeyAnalyzerProvider.java
        IcuCollationTokenizerFactory.java
        IcuFoldingTokenFilterFactory.java
        IcuNormalizerCharFilterFactory.java
        IcuNormalizerTokenFilterFactory.java
        IcuTokenizerFactory.java
        IcuTransformTokenFilterFactory.java
        segmentation
        BreakIteratorWrapper.java
        CharArrayIterator.java
        CompositeBreakIterator.java
        DefaultICUTokenizerConfig.java
        ICUTokenizer.java
        ICUTokenizerConfig.java
        ScriptIterator.java
        langdetect
        LangProfile.java
        LangdetectMapper.java
        LangdetectModule.java
        LangdetectService.java
        Language.java
        LanguageDetectionException.java
        NGram.java
        RegisterLangdetectType.java
        sortform
        SortformAnalyzerProvider.java
        SortformTokenFilter.java
        SortformTokenFilterFactory.java
        standardnumber
        StandardNumberAnalyzer.java
        StandardNumberAnalyzerProvider.java
        StandardNumberService.java
        StandardNumberTokenFilter.java
        StandardNumberTokenFilterFactory.java
        worddelimiter
        WordDelimiterFilter.java
        WordDelimiterFilter2.java
        WordDelimiterFilter2Factory.java
        WordDelimiterFilterFactory.java
        WordDelimiterFlags.java
        WordDelimiterIterator.java
        year
        GregorianYearTokenFilter.java
        GregorianYearTokenFilterFactory.java
        plugin
        analysis
        german
        AnalysisGermanPlugin.java
        Build.java
        standardnumber
        ARK.java
        AbstractStandardNumber.java
        DOI.java
        EAN.java
        GTIN.java
        IBAN.java
        ISAN.java
        ISBN.java
        ISMN.java
        ISNI.java
        ISSN.java
        ISTC.java
        ISWC.java
        ORCID.java
        PPN.java
        SICI.java
        StandardNumber.java
        UPC.java
        ZDB.java
        check
        Digit.java
        DihedralGroup.java
        LuhnMOD10.java
        iso7064
        MOD1110.java
        MOD112.java
        MOD3736.java
        MOD9710.java
  - test
    - java
      - org
        xbib
        elasticsearch
        index
        analysis
        AssertingIndexSearcher.java
        BaseTokenStreamTest.java
        HexDump.java
        LuceneTestCase.java
        MockAnalyzer.java
        MockTokenFilter.java
        MockTokenizer.java
        baseform
        BaseformTokenFilterTests.java
        combo
        ComboAnalyzerTests.java
        ComboTokenStreamTests.java
        ReaderContent.java
        ReusableStringReaderClonerTests.java
        SimpleComboAnalysisTests.java
        StringReaderClonerTests.java
        decompound
        DecompoundTokenFilterTests.java
        german
        GermanNormalizationTests.java
        SettingsTests.java
        hyphen
        HyphenTokenizerTests.java
        icu
        CollationTestBase.java
        ICUCollationKeyAnalyzerTests.java
        IcuAnalysisTests.java
        IcuCollationAnalyzerTests.java
        IcuTokenizerTests.java
        langdetect
        LangdetectMappingTests.java
        sortform
        SortFormTests.java
        worddelimiter
        WordDelimiterFilter2Tests.java

package org.xbib.elasticsearch.index.analysis.icu.segmentation;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeFactory;

import java.io.IOException;
import java.io.Reader;

/**
 * Breaks text into words according to UAX #29: Unicode Text Segmentation
 * (http://www.unicode.org/reports/tr29/)
 * <p>
 * Words are broken across script boundaries, then segmented according to
 * the BreakIterator and typing provided by the {@link IcuTokenizerConfig}
 * <p>
 * Input is pulled from the reader in chunks of {@code IOBUFFER} chars. Each
 * chunk is cut at the last whitespace character so that a word is not split
 * across two chunks; the remainder is carried over to the next chunk.
 *
 * @see IcuTokenizerConfig
 */
public final class IcuTokenizer extends Tokenizer {

    /**
     * size of the internal read buffer, in chars
     */
    private static final int IOBUFFER = 4096;

    private final char[] buffer = new char[IOBUFFER];
    /**
     * true length of text in the buffer
     */
    private int length = 0;
    /**
     * length in buffer that can be evaluated safely, up to a safe end point
     */
    private int usableLength = 0;
    /**
     * accumulated offset of previous buffers for this reader, for offsetAtt
     */
    private int offset = 0;

    private final CompositeBreakIterator breaker; /* tokenizes a char[] of text */
    private final IcuTokenizerConfig config;
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final ScriptAttribute scriptAtt = addAttribute(ScriptAttribute.class);

    /**
     * Construct a new IcuTokenizer that breaks text into words from the given
     * Reader.
     * <p>
     * The default script-specific handling is used.
     * <p>
     * The default attribute factory is used.
     *
     * @param input Reader containing text to tokenize.
     * @see DefaultIcuTokenizerConfig
     */
    public IcuTokenizer(Reader input) {
        this(input, new DefaultIcuTokenizerConfig(true));
    }

    /**
     * Construct a new IcuTokenizer that breaks text into words from the given
     * Reader, using a tailored BreakIterator configuration.
     * <p>
     * The default attribute factory is used.
     *
     * @param input  Reader containing text to tokenize.
     * @param config Tailored BreakIterator configuration
     */
    public IcuTokenizer(Reader input, IcuTokenizerConfig config) {
        this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, input, config);
    }

    /**
     * Construct a new IcuTokenizer that breaks text into words from the given
     * Reader, using a tailored BreakIterator configuration.
     *
     * @param factory AttributeFactory to use
     * @param input   Reader containing text to tokenize.
     * @param config  Tailored BreakIterator configuration
     */
    public IcuTokenizer(AttributeFactory factory, Reader input, IcuTokenizerConfig config) {
        super(factory, input);
        this.config = config;
        breaker = new CompositeBreakIterator(config);
    }

    /**
     * Advances to the next token, refilling the buffer from the reader
     * whenever the current buffer contents are exhausted.
     *
     * @return true if a token was produced, false once the input is exhausted
     * @throws IOException If there is a low-level I/O error.
     */
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        if (length == 0) {
            // nothing buffered yet (first call after reset, or previous refill came back empty)
            refill();
        }
        while (!incrementTokenBuffer()) {
            refill();
            if (length <= 0) {
                // no more chars to read
                return false;
            }
        }
        return true;
    }

    /**
     * Resets the tokenizer state: clears the break iterator text and zeroes
     * the buffer bookkeeping (length, usableLength, offset).
     *
     * @throws IOException If there is a low-level I/O error.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        breaker.setText(buffer, 0, 0);
        length = usableLength = offset = 0;
    }

    /**
     * Sets the final offset to the total number of chars consumed: the
     * accumulated offset of earlier buffers plus the length of the current one.
     *
     * @throws IOException If there is a low-level I/O error.
     */
    @Override
    public void end() throws IOException {
        super.end();
        // guard against a negative length so the final offset never moves backwards
        final int finalOffset = (length < 0) ? offset : offset + length;
        offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
    }

    /*
     * This tokenizes text based upon the longest matching rule, and because of
     * this, isn't friendly to a Reader.
     *
     * Text is read from the input stream in chunks of IOBUFFER (4096) chars.
     * Within a chunk, the last unambiguous break point is found (in this
     * implementation: a white space character). Any remaining characters
     * represent possible partial words, so they are moved to the front of the
     * next chunk.
     *
     * There is the possibility that there are no unambiguous break points
     * within an entire chunk of text (binary data). So there is a maximum word
     * limit of IOBUFFER chars since it will not try to grow the buffer in this
     * case.
     */

    /**
     * Returns the last unambiguous break position in the text.
     *
     * @return index just past the last whitespace character in the buffer,
     * or -1 if the buffer contains no whitespace
     */
    private int findSafeEnd() {
        for (int i = length - 1; i >= 0; i--) {
            if (UCharacter.isWhitespace(buffer[i])) {
                return i + 1;
            }
        }
        return -1;
    }

    /**
     * Refill the buffer, accumulating the offset and setting usableLength to the
     * last unambiguous break position
     *
     * @throws IOException If there is a low-level I/O error.
     */
    private void refill() throws IOException {
        offset += usableLength;
        // chars past the previous safe end were not tokenized yet; move them to the front
        int leftover = length - usableLength;
        System.arraycopy(buffer, usableLength, buffer, 0, leftover);
        int requested = buffer.length - leftover;
        int returned = read(input, buffer, leftover, requested);
        length = returned + leftover;
        if (returned < requested) {
            // reader has been emptied, process the rest
            usableLength = length;
        } else {
            // still more data to be read, find a safe-stopping place
            usableLength = findSafeEnd();
            if (usableLength < 0) {
                // more than IOBUFFER of text without space, gonna possibly truncate tokens
                usableLength = length;
            }
        }
        breaker.setText(buffer, 0, Math.max(0, usableLength));
    }

    /**
     * commons-io's readFully, but without bugs if offset != 0
     * <p>
     * Keeps reading until {@code length} chars have been stored or the reader
     * reports end of stream.
     *
     * @param input  reader to pull chars from
     * @param buffer destination array
     * @param offset index in {@code buffer} at which to start storing
     * @param length maximum number of chars to read
     * @return number of chars actually read; less than {@code length} only at end of stream
     * @throws IOException If there is a low-level I/O error.
     */
    private static int read(Reader input, char[] buffer, int offset, int length) throws IOException {
        assert length >= 0 : "length must not be negative: " + length;
        int remaining = length;
        while (remaining > 0) {
            int location = length - remaining;
            int count = input.read(buffer, offset + location, remaining);
            if (-1 == count) { // EOF
                break;
            }
            remaining -= count;
        }
        return length - remaining;
    }

    /**
     * Produces the next token from the current buffer contents, populating the
     * term, offset, type and script attributes.
     *
     * @return true if there is a token from the buffer, or false if it is
     * exhausted.
     */
    private boolean incrementTokenBuffer() {
        int start = breaker.current();
        if (start == BreakIterator.DONE) {
            return false; // BreakIterator exhausted
        }
        // find the next set of boundaries, skipping over non-tokens (rule status 0)
        int end = breaker.next();
        // Test 'end' (the value next() just returned), not 'start': once next()
        // reports DONE there is no further segment, regardless of which rule
        // status the iterator reports afterwards.
        while (end != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
            start = end;
            end = breaker.next();
        }
        if (end == BreakIterator.DONE) {
            return false; // BreakIterator exhausted
        }
        termAtt.copyBuffer(buffer, start, end - start);
        // positions are buffer-relative; add the offset of all previously consumed buffers
        offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));
        typeAtt.setType(config.getType(breaker.getScriptCode(), breaker.getRuleStatus()));
        scriptAtt.setCode(breaker.getScriptCode());
        return true;
    }
}