TextOffsetTokenStream.java example

Explorer
lux-master
- src
package lux.index.analysis;

import java.io.IOException;

import lux.xml.Offsets;
import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XdmNodeKind;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

/**
 * <p>This TokenStream records the offsets (character positions in the original text) of every token.
 * It records the start offset of each text node, and whenever there is a difference between the 
 * length of the serialized XML and the length of the text, it records the offset just after the 
 * discrepancy.  For example, if a character entity (like {@code &amp;}) occurs in the XML, it is translated
 * to {@code &} in the text, and a character offset is recorded for the character just following the {@code &}.
 * </p>
 */
public abstract class TextOffsetTokenStream extends XmlTokenStreamBase {

    /** Index of the next text-node entry to read from {@link #offsets}. */
    private int iText;

    /** Index of the next delta entry to read from {@link #offsets}. */
    private int iDelta;

    /** Recorded text locations and deltas; when null, no offset correction is installed. */
    private final Offsets offsets;

    /** The character stream wrapping the text most recently passed to {@link #resetTokenizer}. */
    private CharSequenceStream charSequenceStream;

    /**
     * Creates a token stream whose offset cursors start at the first recorded text node and
     * the first recorded delta.
     *
     * @param fieldName passed through to the superclass constructor
     * @param analyzer passed through to the superclass constructor
     * @param wrapped passed through to the superclass constructor
     * @param doc not used by this constructor; kept so existing callers keep compiling
     * @param offsets the recorded text locations and deltas, or null to disable offset correction
     * @param processor passed through to the superclass constructor
     */
    public TextOffsetTokenStream(String fieldName, Analyzer analyzer, TokenStream wrapped, XdmNode doc, Offsets offsets, Processor processor) {
        super(fieldName, analyzer, wrapped, processor);
        this.offsets = offsets;
        iText = 0;
        iDelta = 0;
    }

    /**
     * Points the tokenizer at a new piece of text.  When offsets are available the text is
     * wrapped in an {@link OffsetCharFilter} that is primed by {@link #updateOffsets};
     * otherwise the plain character stream is used.
     *
     * @param text the text to tokenize
     * @return the result of advancing the wrapped token stream once, or false if resetting or
     * advancing it throws an IOException
     */
    @Override
    protected boolean resetTokenizer(CharSequence text) {
        charSequenceStream = new CharSequenceStream(text);
        if (offsets != null) {
            OffsetCharFilter offsetCharFilter = new OffsetCharFilter(charSequenceStream);
            charStream = offsetCharFilter;
            updateOffsets(offsetCharFilter, text.length());
        } else {
            charStream = charSequenceStream;
        }
        try {
            reset();
            // NOTE(review): this is what the code did before refactoring; the original author
            // wondered whether this should call incrementToken() instead.
            return incrementWrappedTokenStream();
        } catch (IOException e) {
            // NOTE(review): the failure is reported to the caller as "no token" and the
            // exception itself is dropped; confirm that this best-effort behavior is intended.
            return false;
        }
    }

    /**
     * Registers with the filter the offset corrections that apply to the current text node and
     * advances the text and delta cursors past the entries that were consumed.  Does nothing
     * unless the current node is a text node and offsets are available.
     *
     * @param offsetCharFilter the filter that receives the (offset in text, location) pairs
     * @param length the length of the text being tokenized
     */
    private void updateOffsets (OffsetCharFilter offsetCharFilter, int length) {
        if (curNode.getNodeKind() != XdmNodeKind.TEXT || offsets == null) {
            return;
        }
        int location = offsets.getTextLocation(iText++); // location in the original XML
        offsetCharFilter.addOffset(0, location);
        // Skip over any deltas preceding this text.  The bounds check comes first so that a
        // delta slot at or beyond getDeltaCount() is never read.
        while (iDelta < offsets.getDeltaCount() && offsets.getDeltaLocation(iDelta) < location) {
            ++iDelta;
        }
        // Apply all the deltas occurring within this text.
        while (iDelta < offsets.getDeltaCount()) {
            // accumulate the deltas
            location += offsets.getDelta(iDelta);
            // calculate the offset within this text (not the original XML-encoded text) where the delta is
            int dOff = offsets.getDeltaLocation(iDelta) - location;
            if (dOff > length) {
                // This delta lies beyond the current text; iDelta is left pointing at it so
                // that the next call examines it again.
                break;
            }
            // the offset at dOff is the difference between the original position and dOff
            offsetCharFilter.addOffset(dOff, location);
            ++iDelta;
        }
    }
}

/*
 * This Source Code Form is subject to the terms of the Mozilla Public License,
 * v. 2.0. If a copy of the MPL was not distributed with this file, You can
 * obtain one at http://mozilla.org/MPL/2.0/.
 */