package lux.xml;
import java.util.Arrays;
import lux.index.analysis.XmlTextTokenStream;
/**
* This class stores the offsets of text nodes and character entities in serialized XML. It is
* populated by {@link SaxonDocBuilder} and consumed by {@link XmlTextTokenStream}.
*/
public final class Offsets {
private int iOffset;
private int iDelta;
private int[] textOffsets;
private int[] deltaLocations;
private short[] deltas;
public Offsets () {
textOffsets = new int[1024];
deltas = new short[1024];
deltaLocations = new int[1024];
reset ();
}
public void reset () {
iOffset = iDelta = 0;
}
// store the character offsets of every character reference or entity in the document text,
// along with the difference between the length of the entity reference and its replacement text.
// eg, for an & appearing at position 100, we would store (100, 4), since len('&')=5, and len ('&')=1.
public void addDelta(int deltaLocation, short delta) {
if (iDelta >= deltas.length) {
deltas = Arrays.copyOf(deltas, deltas.length + 1024);
deltaLocations = Arrays.copyOf(deltaLocations, deltaLocations.length + 1024);
}
deltaLocations[iDelta] = deltaLocation;
deltas[iDelta++] = delta;
}
// store the character offsets of all of the text nodes in the document: According to StAX javadocs,
// these will either be bytes or they will be characters, depending on whether the parser
// was fed a byte stream or a character stream! However in practice we seem to get character
// offsets in both cases??
public void addOffset(int characterOffset) {
// StAX documentation claims this may be a byte offset when fed a byte stream, but
// that doesn't seem to be the case?
if (iOffset >= textOffsets.length) {
textOffsets = Arrays.copyOf(textOffsets, textOffsets.length + 1024);
}
textOffsets[iOffset++] = characterOffset;
}
/**
* @param i the index of the text node
* @return the character location in the input character stream of i'th text node.
*/
public int getTextLocation (int i) {
return textOffsets[i];
}
/**
* A delta is stored whenever the number of characters in the output token is not the same
* as the number in the input character stream.
* @param i the index of the delta
* @return the character location in the input character stream of the i'th delta.
*/
public int getDeltaLocation (int i) {
return deltaLocations[i];
}
/**
* @return the number of deltas found in the input stream
*/
public int getDeltaCount() {
return iDelta;
}
/**
* @param i the index of the delta
* @return the value of the i'th delta
*/
public int getDelta(int i) {
return deltas[i];
}
}
/*
* This Source Code Form is subject to the terms of the Mozilla Public License,
* v. 2.0. If a copy of the MPL was not distributed with this file, You can
* obtain one at http://mozilla.org/MPL/2.0/.
*/