package no.trank.openpipe.lemmatizer.util;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2005-2007 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
/**
 * A ternary search tree with path compression that maps non-empty character sequences to values.
 *
 * <p>Every node stores a non-empty run of characters (its <em>path</em>). All characters of the path
 * except the last one must be matched exactly; the last character is the branching point, where the
 * search continues to the left (smaller), to the right (greater) or to the middle (equal) subtree.
 * A node whose value is non-<code>null</code> marks the end of a stored key.
 *
 * <p>The tree can be written to and read from a stream, see {@link #write(OutputStream)} and
 * {@link #read(InputStream)}. Instances keep mutable state without any synchronization.
 *
 * @param <V> the type of the values stored in the tree.
 */
public class TernarySearchTree<V extends TreeValue> {
    /**
     * The version of the serialized format, stored as the last byte of {@link #HEADER}.
     */
    private static final byte VERSION = (byte) 1;
    /**
     * The byte sequence expected at the very beginning of a serialized tree.
     */
    private static final byte[] HEADER = new byte[]{(byte) 'o', (byte) '|', VERSION};
    /**
     * The parameter passed to the zeta coding used for path lengths.
     */
    private static final int LEN_ZK = 1;
    /**
     * The parameter passed to the zeta coding used for path characters.
     */
    private static final int CHAR_ZK = 3;
    /**
     * The root of the tree.
     */
    private Node<V> root;
    /**
     * The number of keys stored in the tree.
     */
    private int size;
    /**
     * The factory used to create values when reading a serialized tree, and to read/write
     * the value specific part of the header.
     */
    private final TreeValueFactory<V> factory;
    /**
     * True if the last {@link #put(CharSequence, TreeValue)} added a new key to the tree.
     */
    private boolean modified;

    /**
     * Creates a new empty ternary search tree.
     *
     * @param factory the factory for the values of this tree.
     * @throws NullPointerException if <code>factory</code> is <code>null</code>.
     */
    public TernarySearchTree(TreeValueFactory<V> factory) {
        if (factory == null) {
            throw new NullPointerException("TreeValueFactory cannot be null");
        }
        this.factory = factory;
    }

    /**
     * Creates a new ternary search tree and populates it with the given entries.
     *
     * @param c the entries (key and value) to be put into the tree.
     * @param factory the factory for the values of this tree.
     * @throws NullPointerException if <code>factory</code> is <code>null</code>.
     */
    public TernarySearchTree(final Iterable<TreeEntry<V>> c, TreeValueFactory<V> factory) {
        this(factory);
        for (final TreeEntry<V> entry : c) {
            put(entry.getKey(), entry.getValue());
        }
    }

    /**
     * Returns the value associated with the given character sequence.
     *
     * @param s the key to look up.
     * @return the value stored for <code>s</code>, or <code>null</code> if <code>s</code> is not
     * a key of this tree (the empty sequence is never a key).
     */
    public V get(final CharSequence s) {
        final int l = s.length();
        Node<V> e = root;
        int offset = 0;
        while (e != null) {
            final char[] path = e.path;
            final int last = path.length - 1;
            // All characters but the last one of the path must match exactly.
            for (int i = 0; i < last; i++) {
                if (offset + i == l || s.charAt(offset + i) != path[i]) {
                    return null;
                }
            }
            offset += last;
            if (offset == l) {
                // The key ends strictly inside the path of this node.
                return null;
            }
            // The last character of the path decides in which subtree to continue.
            final char c = s.charAt(offset);
            if (c < path[last]) {
                e = e.left;
            } else if (c > path[last]) {
                e = e.right;
            } else {
                offset++;
                if (offset == l) {
                    return e.value;
                }
                e = e.middle;
            }
        }
        return null;
    }

    /**
     * Checks whether a non-<code>null</code> value is stored for the given character sequence.
     *
     * @param s the key to look up.
     * @return <code>true</code> if {@link #get(CharSequence)} returns a non-<code>null</code> value.
     */
    public boolean contains(CharSequence s) {
        return get(s) != null;
    }

    /**
     * Associates the given value with the given character sequence, replacing any previous value.
     *
     * <p>The value is expected to be non-<code>null</code>: a key stored with a <code>null</code>
     * value cannot be told apart from an absent key by {@link #get(CharSequence)}.
     *
     * @param s the key; must not be empty, as every node needs at least one character.
     * @param value the value to associate with <code>s</code>.
     * @return <code>true</code> if <code>s</code> was not a key of the tree before this call,
     * <code>false</code> if only the value of an existing key was replaced.
     * @throws IllegalArgumentException if <code>s</code> is empty.
     */
    public boolean put(final CharSequence s, final V value) {
        final int length = s.length();
        if (length == 0) {
            // An empty key would create a node with an empty path, which breaks every later lookup.
            throw new IllegalArgumentException("Cannot store an empty character sequence");
        }
        modified = false;
        root = addRec(s, 0, length, root, value);
        return modified;
    }

    /**
     * Inserts the given character sequence, starting at the given position, in the given subtree.
     *
     * @param s the character sequence containing the characters to be inserted.
     * @param offset the first character to be inserted.
     * @param length the number of characters to be inserted; must be positive.
     * @param e the subtree in which the characters should be inserted, or <code>null</code> if
     * a new node should be created.
     * @param value the value to store for the inserted sequence.
     * @return the new node at the top of the subtree.
     */
    private Node<V> addRec(final CharSequence s, final int offset, final int length, final Node<V> e, final V value) {
        if (e == null) {
            // We create a new node containing all the characters and return it.
            modified = true;
            size++;
            return new Node<V>(s, offset, length, value);
        }
        /* We start scanning the path contained in the current node, up to
         * the last character excluded. If we find a mismatch, or if we exhaust our
         * characters, we must fork this node. Note that path keeps referring to the
         * original array even after e.removePathPrefix() replaced the array of e. */
        final char[] path = e.path;
        final int last = path.length - 1;
        for (int i = 0; i < last; i++) {
            final char c = s.charAt(offset + i);
            if (c != path[i]) {
                /* We fork, keeping just the first i + 1 characters in the new node (this is
                 * necessary as at least one character must be present in every node). The old
                 * node keeps the remaining characters and becomes the middle child; the rest of
                 * the sequence goes into a fresh node on the left or on the right. */
                final Node<V> fork = new Node<V>(path, 0, i + 1, null);
                fork.middle = e;
                e.removePathPrefix(i + 1);
                final Node<V> added = addRec(s, offset + i, length - i, null, value);
                if (c < path[i]) {
                    fork.left = added;
                } else {
                    fork.right = added;
                }
                return fork;
            }
            if (i == length - 1) {
                /* We exhausted the character sequence. We fork in the middle,
                 * keeping length characters in a new node carrying the value. The old
                 * node keeps the remaining characters and becomes the middle child. */
                final Node<V> fork = new Node<V>(s, offset, length, value);
                fork.middle = e;
                e.removePathPrefix(length);
                size++;
                modified = true;
                return fork;
            }
        }
        /* We are positioned on the last character of the path. In this case our
         * behaviour is different, as if we must fork we must not perform any
         * splitting. Moreover, if we exhaust the characters we either found
         * the sequence in the tree, or we just have to mark the node. */
        final char c = s.charAt(offset + last);
        if (c < path[last]) {
            e.left = addRec(s, offset + last, length - last, e.left, value);
        } else if (c > path[last]) {
            e.right = addRec(s, offset + last, length - last, e.right, value);
        } else if (last == length - 1) {
            // This is the node. It only counts as a new key if it did not carry a value yet.
            modified = e.value == null;
            if (modified) {
                size++;
            }
            e.value = value;
        } else {
            // We add a node in the middle, completing the sequence.
            e.middle = addRec(s, offset + last + 1, length - last - 1, e.middle, value);
        }
        return e;
    }

    /**
     * Returns the number of keys stored in this tree.
     *
     * @return the number of keys.
     */
    public int size() {
        return size;
    }

    /**
     * Replaces the content of this tree with a tree read from the given stream.
     *
     * <p>The stream must start with the header bytes, followed by a bit stream containing the
     * serial version UID of the value factory, the size, the factory specific header and the nodes.
     *
     * @param in the stream to read from.
     * @throws IOException if the header or the serial version UID does not match, or if reading fails.
     */
    public void read(InputStream in) throws IOException {
        readHeader(in);
        final InputBitStream inB = new InputBitStream(in);
        final long serialVersionUID = inB.readLongNibble();
        if (serialVersionUID != factory.getSerialVersionUID()) {
            throw new IOException("serialVersionUID missmatch, read " + serialVersionUID + " exptected " +
                    factory.getSerialVersionUID());
        }
        size = inB.readNibble();
        factory.readHeader(inB);
        root = readNode(inB);
    }

    /**
     * Reads a subtree written by {@link #writeNode(Node, OutputBitStream)}.
     *
     * @param in the bit stream to read from.
     * @return the node at the top of the subtree, or <code>null</code> if a path length of zero was read.
     * @throws IOException if reading fails.
     */
    private Node<V> readNode(InputBitStream in) throws IOException {
        final int len = in.readZeta(LEN_ZK);
        if (len <= 0) {
            // A path length of zero encodes a missing subtree.
            return null;
        }
        final char[] path = new char[len];
        for (int i = 0; i < len; i++) {
            // Characters are stored as their distance from '0'.
            path[i] = (char) (in.readZeta(CHAR_ZK) + '0');
        }
        final V value;
        if (in.readBit() == 1) {
            value = factory.newValue();
            value.read(in);
        } else {
            value = null;
        }
        final Node<V> node = new Node<V>(path, value);
        node.left = readNode(in);
        node.middle = readNode(in);
        node.right = readNode(in);
        return node;
    }

    /**
     * Reads and verifies the header bytes at the beginning of a serialized tree.
     *
     * @param in the stream to read from.
     * @throws IOException if the stream ends before the header is complete, if the bytes read
     * differ from {@link #HEADER}, or if reading fails.
     */
    private static void readHeader(InputStream in) throws IOException {
        final byte[] buf = new byte[HEADER.length];
        // A single read() may legally return fewer bytes than requested, so we read until
        // the buffer is full or the stream ends.
        int len = 0;
        while (len < buf.length) {
            final int count = in.read(buf, len, buf.length - len);
            if (count < 0) {
                break;
            }
            len += count;
        }
        if (len != buf.length) {
            throw new IOException("Could not read header from stream, got " + len + " bytes expected " + HEADER.length);
        } else if (!Arrays.equals(HEADER, buf)) {
            throw new IOException("Could not read header from stream, got " + Arrays.toString(buf) + " expected " +
                    Arrays.toString(HEADER));
        }
    }

    /**
     * Writes this tree to the given stream in the format understood by {@link #read(InputStream)}.
     *
     * <p>The bit stream wrapping <code>out</code> is flushed, but <code>out</code> is not closed.
     *
     * @param out the stream to write to.
     * @throws IOException if writing fails.
     */
    public void write(OutputStream out) throws IOException {
        out.write(HEADER);
        final OutputBitStream outB = new OutputBitStream(out);
        try {
            outB.writeLongNibble(factory.getSerialVersionUID());
            outB.writeNibble(size);
            factory.writeHeader(outB);
            writeNode(root, outB);
        } finally {
            outB.flush();
        }
    }

    /**
     * Writes a subtree: the path length, the path characters, a bit telling whether a value
     * follows, the value (if any) and then the left, middle and right subtrees.
     *
     * @param node the node at the top of the subtree, or <code>null</code> for a missing subtree.
     * @param out the bit stream to write to.
     * @throws IOException if writing fails.
     */
    private void writeNode(final Node<V> node, final OutputBitStream out) throws IOException {
        if (node == null) {
            // A path length of zero encodes a missing subtree.
            out.writeZeta(0, LEN_ZK);
            return;
        }
        final char[] path = node.path;
        out.writeZeta(path.length, LEN_ZK);
        for (final char c : path) {
            // Characters are stored as their distance from '0'.
            // NOTE(review): a character below '0' gives a negative argument here; confirm how
            // OutputBitStream.writeZeta() handles that and whether such keys can occur.
            out.writeZeta(c - '0', CHAR_ZK);
        }
        final boolean hasValue = node.value != null;
        out.writeBit(hasValue);
        if (hasValue) {
            node.value.write(out);
        }
        writeNode(node.left, out);
        writeNode(node.middle, out);
        writeNode(node.right, out);
    }

    /**
     * A node of the tree.
     */
    private static final class Node<V extends TreeValue> {
        /**
         * A pointer to the left subtree.
         */
        private Node<V> left;
        /**
         * A pointer to the middle subtree.
         */
        private Node<V> middle;
        /**
         * A pointer to the right subtree.
         */
        private Node<V> right;
        /**
         * The nonempty path compressed at this node.
         */
        private char[] path;
        /**
         * The value of the key ending at this node, or <code>null</code> if no key ends here.
         */
        private V value;

        /**
         * Creates a new node containing a path specified by a character-sequence fragment.
         *
         * @param s a character sequence containing the path of the node.
         * @param offset the starting character of the path.
         * @param length the length of the path.
         * @param value the value of this node.
         */
        public Node(final CharSequence s, final int offset, final int length, final V value) {
            this.value = value;
            path = new char[length];
            MutableString.getChars(s, offset, offset + length, path, 0);
        }

        /**
         * Creates a new node containing a path copied from a character-array fragment.
         *
         * @param a a character array containing the path of the node.
         * @param offset the starting character of the path.
         * @param length the length of the path.
         * @param value the value of this node.
         */
        public Node(final char[] a, final int offset, final int length, final V value) {
            this.value = value;
            path = new char[length];
            System.arraycopy(a, offset, path, 0, length);
        }

        /**
         * Creates a new node using the given character array as its path (the array is not copied).
         *
         * @param path a character array containing the path of the node.
         * @param value the value of this node.
         */
        public Node(char[] path, V value) {
            this.path = path;
            this.value = value;
        }

        /**
         * Removes a prefix from the path of this node. The path is replaced by a new array.
         *
         * @param length the length of the prefix to be removed
         */
        public void removePathPrefix(final int length) {
            final char[] a = new char[path.length - length];
            System.arraycopy(path, length, a, 0, a.length);
            path = a;
        }
    }
}