package lux.index.analysis;
import java.io.IOException;
import lux.xml.Offsets;
import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XdmNodeKind;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
/**
* <p>This TokenStream records the offsets (character positions in the original text) of every token.
* It records the start offset of each text node, and whenever there is a difference between the
* length of the serialized XML and the length of the text, it records the offset just after the
* discrepancy. For example, if a character entity (like &amp;amp;) occurs in the XML, it is translated
* to "&amp;" in the text, and a character offset is recorded for the character just following the "&amp;".
* </p>
*/
public abstract class TextOffsetTokenStream extends XmlTokenStreamBase {

    /** Index of the next text-node location to read from {@link #offsets}. */
    private int iText;

    /** Index of the next delta in {@link #offsets} that has not yet been consumed. */
    private int iDelta;

    /** Recorded offsets; when null, no offset correction is installed. */
    private final Offsets offsets;

    /** Character stream wrapping the text most recently passed to {@link #resetTokenizer}. */
    private CharSequenceStream charSequenceStream;

    /**
     * Creates a token stream that consumes text and delta locations from {@code offsets},
     * starting at the first recorded text location and the first recorded delta.
     *
     * @param fieldName passed through to the superclass constructor
     * @param analyzer passed through to the superclass constructor
     * @param wrapped passed through to the superclass constructor
     * @param doc not used by this constructor; kept so the signature stays compatible
     * @param offsets the recorded offsets, or null to disable offset correction
     * @param processor passed through to the superclass constructor
     */
    public TextOffsetTokenStream(String fieldName, Analyzer analyzer, TokenStream wrapped, XdmNode doc, Offsets offsets, Processor processor) {
        super(fieldName, analyzer, wrapped, processor);
        this.offsets = offsets;
        // iText and iDelta start at 0 (the Java default for int fields).
    }

    /**
     * Points the character stream at {@code text}, installing an offset-correcting filter
     * when offsets are available, then resets this stream and advances the wrapped stream.
     *
     * @param text the text to tokenize
     * @return the result of {@code incrementWrappedTokenStream()}, or false if an
     *         {@link IOException} was thrown while resetting or advancing
     */
    @Override
    protected boolean resetTokenizer(CharSequence text) {
        charSequenceStream = new CharSequenceStream(text);
        if (offsets != null) {
            OffsetCharFilter offsetCharFilter = new OffsetCharFilter(charSequenceStream);
            charStream = offsetCharFilter;
            updateOffsets(offsetCharFilter, text.length());
        } else {
            charStream = charSequenceStream;
        }
        try {
            reset();
            // NOTE(review): this is the pre-refactoring behavior; the original author
            // questioned whether this should call incrementToken() instead.
            return incrementWrappedTokenStream();
        } catch (IOException e) {
            // Deliberately reported to the caller as "no token"; the exception is not propagated.
            return false;
        }
    }

    /**
     * Registers offset corrections for the current node's text with {@code offsetCharFilter}.
     * Does nothing unless the current node is a text node and offsets were supplied.
     *
     * <p>Delta locations are only read at indexes below {@code offsets.getDeltaCount()}, so
     * no entry past the last recorded delta is ever accessed.</p>
     *
     * @param offsetCharFilter the filter that receives the (offset, correction) pairs
     * @param length the length of the text being tokenized
     */
    private void updateOffsets(OffsetCharFilter offsetCharFilter, int length) {
        if (curNode.getNodeKind() != XdmNodeKind.TEXT || offsets == null) {
            return;
        }
        // location of this text in the original XML
        int location = offsets.getTextLocation(iText++);
        offsetCharFilter.addOffset(0, location);

        // skip over any deltas preceding this text
        while (iDelta < offsets.getDeltaCount() && offsets.getDeltaLocation(iDelta) < location) {
            ++iDelta;
        }

        // apply all the deltas occurring within this text
        while (iDelta < offsets.getDeltaCount()) {
            int deltaLocation = offsets.getDeltaLocation(iDelta);
            // accumulate the deltas
            location += offsets.getDelta(iDelta);
            // offset of the delta within this text (not within the original XML-encoded text)
            int dOff = deltaLocation - location;
            if (dOff > length) {
                // this delta lies beyond the current text; leave it for a later text node
                break;
            }
            // from dOff onward, the correction is the accumulated location
            offsetCharFilter.addOffset(dOff, location);
            ++iDelta;
        }
    }
}
/*
* This Source Code Form is subject to the terms of the Mozilla Public License,
* v. 2.0. If a copy of the MPL was not distributed with this file, You can
* obtain one at http://mozilla.org/MPL/2.0/.
*/