/*
* Copyright 2008
* Richard Eckart de Castilho
* Institut für Sprach- und Literaturwissenschaft
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.api.transform.alignment;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.Set;
/**
* Allows to stack strings on top of each other and modifying each of them propagating changes up to
* the top while leaving lower levels unchanged.
*
* This class is not synchronized internally.
*
*/
public class AlignedString
implements Iterable<AlignedString.DataSegment>
{
// private static final Log _log = LogFactory.getLog(AlignedString.class);
private final AlignedString _underlying;
private final Set<AlignedString> _changeListeners;
protected final AnchorSegment _first;
protected final AnchorSegment _last;
private boolean _stringDirty = true;
private boolean _startDirty = true;
private String _content = null;
{
_first = new AnchorSegment(null, null);
_last = new AnchorSegment(null, null);
_changeListeners = new WeakHashSet<AlignedString>();
}
private AlignedString()
{
_underlying = null;
}
public AlignedString(final String base)
{
this(createBase(base));
}
private static AlignedString createBase(final String base)
{
final AlignedString d = new AlignedString();
d._first._next = d.new BaseSegment(d._first, d._last, base);
d._last._prev = d._first._next;
return d;
}
public AlignedString(final AlignedString underlying)
{
_underlying = underlying;
_underlying.addChangeListener(this);
_first._next = new ObliqueSegment(_first, _last, _underlying.getAnchor(0),
_underlying.getAnchor(_underlying.length()));
_last._prev = _first._next;
}
private void addChangeListener(final AlignedString l)
{
_changeListeners.add(l);
}
public void fireChange()
{
// if (!_startDirty) {
// System.out.println("startDirty true");
// }
_stringDirty = true;
_startDirty = true;
for (final AlignedString a : _changeListeners) {
a._stringDirty = true;
a._startDirty = true;
}
}
/**
* For the given interval on the current data, get the corresponding interval in the wrapped
* data.
*
* @param i
* an interval.
* @return the resolved interval.
*/
public Interval resolve(final Interval i)
{
if (_underlying == null) {
return i;
}
final DataSegment startSeg = getSegmentAt(i.getStart(), true);
// Subtract one here in order to get the segment that includes the
// last character still in the interval. Otherwise we will have the
// situation that startSeg may point into segment A, then follows a
// deleted segment B and endSeg will point into C. The resolved interval
// will thus contain A+B instead of only A.
final DataSegment endSeg = getSegmentAt(
(i.getStart() != i.getEnd()) ? i.getEnd() - 1 : i.getEnd(), true);
// For start find oblique segment here or to left.
// If none start is start of first segment.
DataSegment cursor = startSeg;
int start;
while (true) {
if (cursor == null) {
// If there is nothing start at the beginning
start = _underlying._first.getStart();
break;
}
else if (cursor instanceof ObliqueSegment) {
final ObliqueSegment oseg = (ObliqueSegment) cursor;
if (cursor == startSeg) {
// Calculate offset relative to the position of the
// ObliqueSegment if we did not need to move
final int pos = i.getStart() - oseg.getStart();
start = oseg._start.getPosition() + pos;
break;
}
else {
// If we had to move, use the end position of the
// ObliqueSegment.
start = oseg._end.getPosition();
break;
}
}
else {
cursor = cursor.getPrevious();
}
}
// For end find oblique segment here or to right.
// If none end is end of last segment of underlying data
cursor = endSeg;
int end;
while (true) {
if (cursor == null) {
// Maybe we should be in the start segment and just missed it
// Try to recover in this case instead of expanding to the
// end of the underlying data.
if ((startSeg instanceof ObliqueSegment) && (startSeg.length() >= i.getLength())) {
final ObliqueSegment oseg = (ObliqueSegment) startSeg;
final int pos = i.getEnd() - oseg.getStart();
end = oseg._start.getPosition() + pos;
}
else {
end = _underlying._last.getEnd();
}
break;
}
else if (cursor instanceof ObliqueSegment) {
final ObliqueSegment oseg = (ObliqueSegment) cursor;
if (cursor == endSeg) {
// Calculate offset relative to the position of the
// ObliqueSegment if we did not need to move
final int pos = i.getEnd() - oseg.getStart();
end = oseg._start.getPosition() + pos;
break;
}
else {
// If we had to move, use the end position of the
// ObliqueSegment
end = oseg._start.getPosition();
break;
}
}
else {
cursor = cursor.getNext();
}
}
if (end < start) {
throw new IllegalStateException("BUG: End [" + end
+ "] of resolved interval before start [" + start + "]!");
}
return new ImmutableInterval(start, end);
}
/**
* For the given interval on the underlying data, get the corresponding interval on this level.
*
* Example:
* 11 11 111 11
* 012 345 6789 01 23 456 78
* AD |111|22ZZ2|3333|44|55|YYY|55|
*
* UL |111|XX|22|ZZ|2|XXXXX|3333|XX|44|XXXX|5555|XXXX|
* 012 34 56 78 9 11111 1111 12 22 2222 2223 3333
* 01234 5678 90 12 3456 7890 1234
*
* As you can see there is a YYY inserted in the AD. Otherwise some parts of the UL (marked "X")
* have been removed in the AD. Also an ZZ part has been added to UL
*
* Calling this method with start=22 end=30 should return [12, 18] as this is the interval 5
* from UL plus the "Y" that has been inserted in AD.
*
* Generally:
* - if the start is within a deleted region, then find the next oblique segment in
* AD to the right and return its start position.
* - if the end is within a deleted region, then
* find the next oblique segment in AD to the left and return its end position.
*
* Anchors are always in UL. They are referenced from the ObliqueSegments in AD.
*
* @param i
* the interval on the underlying data.
* @return the corresponding interval in the view.
*/
public ImmutableInterval inverseResolve(final ImmutableInterval i)
{
if (_underlying == null) {
return i;
}
// Find the oblique segment which includes the interval start or the
// next segment to the right if the region which includes the start
// has been deleted.
int start = -1;
AbstractDataSegment seg = _first;
for (; seg != null; seg = seg.getNext()) {
if (seg.isAnchor()) {
continue;
}
final Interval ulpos = resolve(new ImmutableInterval(seg.getStart(), seg.getEnd()));
final int ulend = ulpos.getEnd();
if (ulend <= i.getStart()) {
// If the end position of the current segment in the underlying
// data is left of our seek position, go directly to the next
// one.
continue;
}
final int ulstart = ulpos.getStart();
if ((ulstart <= i.getStart()) && (i.getStart() < ulend)) {
// So the seek pos is within this interval. Calculate offset
// and from that the position in AD.
start = seg.getStart() + (i.getStart() - ulstart);
break;
}
// At this point we have found all segments left of the start.
// If the current segment does not contain the start, then we need
// to return the start position of the next segment we
// encounter.
if (ulstart >= i.getEnd()) {
seg = seg.getNext();
}
break;
}
if (start == -1) {
if (seg != null) {
start = seg.getStart();
}
else {
// If there is nothing more, return the end.
start = _last.getPosition();
}
}
// Now we search for the end. We leave seg as is, no need to seek again.
int end = -1;
ObliqueSegment last = null;
for (; seg != null; seg = seg.getNext()) {
if (seg instanceof ObliqueSegment) {
final ObliqueSegment oseg = (ObliqueSegment) seg;
final int ulend = oseg._end.getPosition();
if (ulend <= i.getEnd()) {
// If the end position of the current segment in the underlying
// data is left of our seek position, go directly to the next
// one.
last = oseg;
continue;
}
final int ulstart = oseg._start.getPosition();
if ((ulstart <= i.getEnd()) && (i.getEnd() < ulend)) {
// So the seek pos is within this interval. Calculate offset
// and from that the position in AD.
end = seg.getStart() + (i.getEnd() - ulstart);
break;
}
// Processed everything left of the seek position. Bail out.
break;
}
// If the current segment is not an ObliqueSegment, we cannot
// determine its position in the underlying data. Go on to find
// an ObliqueSegment.
}
// At this point we have found all segments left of the seek pos
// but no segment containing it. In that case we need to return
// the end position of the last segment we encountered.
if (end == -1) {
if (last != null) {
end = last.getEnd();
}
else {
// If we found nothing, return start as end (empty interval)
end = start;
}
}
return new ImmutableInterval(start, end);
}
/**
* Get data segment currently at the given position. Anchors are never returned.
*
* @param position
* a position.
* @return the non-anchor segment at the given position.
*/
public AbstractDataSegment getSegmentAt(final int position)
{
return getSegmentAt(position, false);
}
/**
* Fetch the segment that includes the designated offset position. The segment is searched from
* left to right. If the position is at the the start of a segment and the parameter
* {@code includeAnchors} is {@code true}, then the anchor is returned instead of the segment or
* if the position is right one beyond the end of the data, then the end boundary anchor is
* returned.
*
* @param position
* the offset.
* @param includeAnchors
* whether or not to include anchors.
*/
private AbstractDataSegment getSegmentAt(final int position, final boolean includeAnchors)
{
if (position < 0) {
throw new IndexOutOfBoundsException("Negative position not allowed: [" + position + "]");
}
final AbstractDataSegment first = _first;
if (first == null) {
throw new IndexOutOfBoundsException("No data");
}
AbstractDataSegment seg = first._next;
int pEnd = seg.length();
while ((seg != null) && (position > (pEnd - 1))) {
seg = seg.getNext();
if (seg != null) {
pEnd += seg.length();
}
}
if (seg == null) {
if ((includeAnchors) && (pEnd == position)) {
return _last;
}
else {
throw new IndexOutOfBoundsException("Index [" + position + "] not in range [0-"
+ pEnd + "], [" + (position - pEnd) + "] off");
}
}
// If we can directly hit an anchor, return the anchor
if (includeAnchors) {
if (seg._prev.isAnchor() && (((Anchor) seg._prev).getPosition() == position)) {
return seg._prev;
}
}
return seg;
}
public AbstractDataSegment getFirst()
{
if (_first._next != _last) {
return _first._next;
}
else {
return null;
}
}
public AbstractDataSegment getLast()
{
if (_last._prev != _first) {
return _last._prev;
}
else {
return null;
}
}
/**
* Get an iterator over the internal data segments.
*/
@Override
public Iterator<DataSegment> iterator()
{
return new DataSegmentIterator((AbstractDataSegment) getFirst());
}
/**
* Gets total length of data.
*
* @return length of the data.
*/
public int length()
{
int length = 0;
for (final DataSegment s : this) {
length += s.length();
}
return length;
}
/**
* Fetch data
*
* @return the data.
*/
public String get()
{
if (_stringDirty) {
// FIXME: inefficient!
final StringBuilder sb = new StringBuilder();
for (final DataSegment s : this) {
sb.append(s.get());
}
_content = sb.toString();
_stringDirty = false;
}
return _content;
}
public void updateCaches()
{
if (_underlying != null) {
_underlying.updateCaches();
}
get();
if (_startDirty) {
int length = 0;
AbstractDataSegment seg = _first;
while (seg != null) {
seg._cachedStart = length;
length += seg.length();
seg = seg._next;
}
_startDirty = false;
System.out.println("startDirty false");
}
}
/**
* Fetch data
*
* @param start
* the start offset.
* @param end
* the end offset.
* @return the data.
*/
public String get(final int start, final int end)
{
return get().substring(start, end);
}
/**
* Inserts s at given position.
*
* @param pos
* position at which to insert.
* @param s
* string to insert.
*/
public void insert(final int pos, final String s)
{
if (s.length() == 0) {
return;
}
// Split up segment.
final AbstractDataSegment prefix;
final AbstractDataSegment suffix;
if (pos == 0) {
// When inserting at position 0, it's clear where to insert / no splitting
prefix = _first;
suffix = _first._next;
}
else {
prefix = getSegmentAt(pos);
suffix = prefix.split(pos);
}
// Insert segment
final BaseSegment seg = new BaseSegment(prefix, suffix, s);
prefix._next = seg;
suffix._prev = seg;
// // Drop useless segments
// dropSuperflourous(prefix);
// dropSuperflourous(suffix);
fireChange();
}
/**
* If the given segment is a zero-length base segment, then it can be dropped. Zero-length
* oblique segments need to be retained because through them we can know e.g. if we extended a
* word. In the following example the empty oblique segments allows us to do an inverse resolve
* from "hyphen- ated" to "hyphenated".
*
* Underlying:
* (B:0[This is a hyphen]16)(A:16)(B:16[- ]18)(A:18)(B:18[ated]22)(A:22)(B:22[ sentence]31)
*
* Wrapping:
* (O:0[This is a hyphen]16)(B:16[ated]20)(O:20[]20)(O:20[ sentence]29)
*/
private void dropSuperflourous(final AbstractDataSegment seg)
{
if ((seg instanceof BaseSegment) &&
// !seg.isAnchor() &&
(seg.length() == 0)) {
seg._prev._next = seg._next;
seg._next._prev = seg._prev;
}
}
/**
* Deletes data.
*
* @param start
* the start offset.
* @param end
* the end offset+1.
*/
public void delete(final int start, final int end)
{
replace(start, end, null);
}
/**
* Replaces data.
*
* @param start
* the start offset.
* @param end
* the end offset+1.
* @param d the data used to replace the current data.
*/
public void replace(final int start, final int end, final String d)
{
if (start == end) {
insert(start, d);
return;
}
// if (_log.isDebugEnabled()) {
// _log.debug("pre delete("+start+","+end+") - ["+get(start,
// end)+"] - "+dataSegmentsToString());
// }
final AbstractDataSegment segAtStart = getSegmentAt(start);
final AbstractDataSegment segAtEnd = getSegmentAt(end - 1, end > 1);
AbstractDataSegment prefix;
AbstractDataSegment suffix;
if (segAtStart == segAtEnd) {
// simple case: start and end within same segment
prefix = segAtStart;
prefix.split(start);
suffix = prefix._next;
suffix = suffix.split(end);
if (d == null || d.length() == 0) {
prefix._next = suffix;
suffix._prev = prefix;
}
else {
final BaseSegment s = new BaseSegment(prefix, suffix, d);
prefix._next = s;
suffix._prev = s;
}
}
else {
if (d == null || d.length() == 0) {
AbstractDataSegment s = segAtStart;
while (s != segAtEnd) {
if (s.isAnchor()) {
throw new UnsupportedOperationException(
"Unable to replace text containing anchors.");
}
s = s._next;
}
}
// complicated case
// note: there may be anchors in the middle that we need to preserve
segAtStart.split(start);
prefix = segAtStart;
suffix = segAtEnd.split(end);
if (d == null || d.length() == 0) {
AbstractDataSegment s = prefix._next;
while (s != suffix) {
if (s.isAnchor()) {
// anchors need to be preserved
}
else {
// non-anchors need to be removed
s._prev._next = s._next;
s._next._prev = s._prev;
}
s = s._next;
}
}
else {
final BaseSegment s = new BaseSegment(prefix, suffix, d);
prefix._next = s;
suffix._prev = s;
}
}
// Drop useless segments
dropSuperflourous(prefix);
dropSuperflourous(suffix);
// if (_log.isDebugEnabled()) {
// _log.debug("post delete("+start+","+end+") - "+dataSegmentsToString());
// }
fireChange();
}
/**
* Get an anchor at the specified position. Breaks up the segment at the given point if
* necessary. If there already is an anchor, it is reused.
*
* @param pos
* a position.
* @return an anchor.
*/
public Anchor getAnchor(final int pos)
{
if (pos == 0) {
return _first;
}
// Split up segment
final AbstractDataSegment prefix = getSegmentAt(pos, true);
if (prefix.isAnchor()) {
return (Anchor) prefix;
}
else {
final AbstractDataSegment suffix = prefix.split(pos);
// Insert segment
final AnchorSegment seg = new AnchorSegment(prefix, suffix);
prefix._next = seg;
suffix._prev = seg;
// Drop useless segments
dropSuperflourous(prefix);
dropSuperflourous(suffix);
return seg;
}
}
/**
* Get all the anchors.
*
* @return all the anchors.
*/
public Collection<Anchor> getAnchors()
{
final ArrayList<Anchor> anchors = new ArrayList<Anchor>();
for (AbstractDataSegment s = _first; s != null; s = s._next) {
if (s.isAnchor()) {
anchors.add((Anchor) s);
}
}
anchors.trimToSize();
return anchors;
}
/**
* Get all the segments.
*
* @return all the segments.
*/
public Collection<DataSegment> getSegments()
{
final ArrayList<DataSegment> segments = new ArrayList<DataSegment>();
for (AbstractDataSegment s = _first; s != null; s = s._next) {
if (!s.isAnchor()) {
segments.add(s);
}
}
segments.trimToSize();
return segments;
}
/**
* Create a string representation of the segments. This is for debugging purposes.
*
* @return string representation of the segments.
*/
public String dataSegmentsToString()
{
final StringBuilder sb = new StringBuilder();
sb.append(">>");
for (final DataSegment s : this) {
sb.append(s.toString());
}
sb.append("<< ");
// Collection<Anchor> anchors = this.getAnchors();
// sb.append("A[");
// sb.append(anchors.size());
// sb.append("]{");
// for (Anchor a : anchors) {
// sb.append("(");
// sb.append(a.getPosition());
// sb.append(")");
// }
// sb.append("} ");
//
// Collection<DataSegment> segments = this.getSegments();
// sb.append("S[");
// sb.append(segments.size());
// sb.append("]{");
// for (DataSegment s : segments) {
// sb.append("(");
// sb.append(s.getClass().getSimpleName().charAt(0));
// sb.append(":");
// sb.append(s.getStart());
// sb.append("..");
// sb.append(s.getEnd());
// sb.append(")");
// }
// sb.append("} ");
AbstractDataSegment s = _first;
while (s != null) {
sb.append("(");
sb.append(s.getClass().getSimpleName().charAt(0));
sb.append(":");
sb.append(s.getStart());
if (!s.isAnchor()) {
sb.append("[" + s.get() + "]");
sb.append(s.getEnd());
}
sb.append(")");
s = s._next;
}
return sb.toString();
}
@Override
public String toString()
{
return dataSegmentsToString();
}
interface DataSegment
{
/**
* Fetch data
*/
String get();
/**
* Gets total length of data.
*/
int length();
DataSegment getPrevious();
DataSegment getNext();
int getStart();
int getEnd();
}
interface Anchor
{
/**
* Get the current position of the anchor.
*/
int getPosition();
DataSegment getNext();
}
/**
* Base class for the data segments
*
*/
abstract class AbstractDataSegment
implements AlignedString.DataSegment
{
protected AbstractDataSegment _prev;
protected AbstractDataSegment _next;
protected int _cachedStart = -1;
public AbstractDataSegment(final AbstractDataSegment prev, final AbstractDataSegment next)
{
_prev = prev;
_next = next;
}
public abstract AbstractDataSegment split(int position);
@Override
public DataSegment getPrevious()
{
AbstractDataSegment s = _prev;
// We skip the Anchors with have a zero length.
while ((s != null) && s.isAnchor()) {
s = s._prev;
}
return s;
}
@Override
public AbstractDataSegment getNext()
{
AbstractDataSegment s = _next;
// We skip the Anchors with have a zero length.
while ((s != null) && s.isAnchor()) {
s = s._next;
}
return s;
}
@Override
public int getStart()
{
if (_startDirty || _cachedStart == -1) {
int pos = 0;
AbstractDataSegment seg = this._prev;
while (seg != null) {
pos += seg.length();
seg = seg._prev;
}
return pos;
}
else {
return _cachedStart;
}
}
@Override
public int getEnd()
{
return getStart() + length();
}
/**
* True if the segment is virtual (not data relevant)
*/
public abstract boolean isAnchor();
}
/**
* A segment that is not contained in the underlying data
*
*/
class BaseSegment
extends AbstractDataSegment
{
private String _data;
public BaseSegment(final AbstractDataSegment prev, final AbstractDataSegment next,
final String data)
{
super(prev, next);
_data = data;
}
@Override
public String get()
{
return _data;
}
@Override
public int length()
{
return _data.length();
}
@Override
public AbstractDataSegment split(final int position)
{
// Calculate positions
final int pos = position - getStart();
// Create new segment
final BaseSegment suffix = new BaseSegment(this, _next, _data.substring(pos,
_data.length()));
// Change current segment
_data = _data.substring(0, pos);
// Insert new segment
_next._prev = suffix;
_next = suffix;
return suffix;
}
@Override
public boolean isAnchor()
{
return false;
}
@Override
public String toString()
{
return "{" + _data + "}";
}
}
/**
* A data segment that accesses the underlying data.
*
*/
class ObliqueSegment
extends AbstractDataSegment
{
private final Anchor _start;
private Anchor _end;
public ObliqueSegment(final AbstractDataSegment prev, final AbstractDataSegment next,
final Anchor start, final Anchor end)
{
super(prev, next);
_start = start;
_end = end;
}
@Override
public String get()
{
final StringBuilder sb = new StringBuilder();
for (DataSegment s = _start.getNext(); s != _end.getNext(); s = s.getNext()) {
sb.append(s.get());
}
return sb.toString();
}
@Override
public int length()
{
int length = 0;
for (DataSegment s = _start.getNext(); s != _end.getNext(); s = s.getNext()) {
length += s.length();
}
return length;
}
@Override
public AbstractDataSegment split(final int position)
{
// Calculate positions and get anchor
final int pos = position - getStart();
final Anchor splitAnchor = _underlying.getAnchor(_start.getPosition() + pos);
// Create new segment
final ObliqueSegment suffix = new ObliqueSegment(this, _next, splitAnchor, _end);
// Change current segment
_end = splitAnchor;
// Insert new segment
_next._prev = suffix;
_next = suffix;
return suffix;
}
@Override
public boolean isAnchor()
{
return false;
}
@Override
public String toString()
{
return "[" + get() + "]";
}
}
/**
* Segment serving as an anchor for higher level data.
*
*/
class AnchorSegment
extends AbstractDataSegment
implements AlignedString.Anchor
{
public AnchorSegment(final AbstractDataSegment prev, final AbstractDataSegment next)
{
super(prev, next);
}
@Override
public String get()
{
return "";
}
@Override
public int length()
{
return 0;
}
@Override
public AbstractDataSegment split(final int position)
{
// Normally anchors are unsplittable (they have a width of zero),
// but an attempt to split an anchor will return the anchor itself
// if the split position is exactly the anchor position.
if (position == getPosition()) {
return this;
}
else {
throw new IndexOutOfBoundsException("Split position [" + position
+ "] does not match anchor position [" + getPosition() + "]");
}
}
@Override
public boolean isAnchor()
{
return true;
}
@Override
public int getPosition()
{
return getStart();
}
@Override
public String toString()
{
final StringBuilder sb = new StringBuilder();
final DataSegment prev = getPrevious();
final DataSegment next = getNext();
if (prev != null) {
sb.append(prev.get() + "<");
}
sb.append(getPosition());
if (next != null) {
sb.append(">" + next.get());
}
return sb.toString();
}
}
}
/**
* DataSegment iterator.
*
*/
class DataSegmentIterator
implements Iterator<AlignedString.DataSegment>
{
private final boolean _includeAll;
private AlignedString.AbstractDataSegment _next = null;
public DataSegmentIterator(final AlignedString.AbstractDataSegment first)
{
_next = first;
_includeAll = false;
}
public DataSegmentIterator(final AlignedString.AbstractDataSegment first,
final boolean includeAll)
{
_next = first;
_includeAll = includeAll;
}
@Override
public boolean hasNext()
{
return _next != null;
}
@Override
public AlignedString.DataSegment next()
{
final AlignedString.DataSegment result = _next;
if (_includeAll) {
_next = _next._next;
}
else {
_next = _next.getNext();
}
return result;
}
@Override
public void remove()
{
throw new UnsupportedOperationException();
}
}