package edu.umd.hooka;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
import java.nio.*;
import java.util.TreeSet;
/**
* Represents a string of one or more words that may be a word,
* phrase, sentence, or unit larger than a sentence.
*
* @author chris
*/
public class Phrase implements WritableComparable, Cloneable {

    /** Shared stand-in for a missing word array, so a null array can be treated as empty. */
    private static final int[] NO_WORDS = new int[0];

    /** Language tag, stored as a single byte. */
    byte _language;

    /** Vocabulary ids of the words, in order; may be null, which is treated as an empty phrase. */
    int[] _words;

    /**
     * Returns a copy of this phrase with its own copy of the word array.
     *
     * @return a new phrase with the same language and words
     */
    @Override
    public Object clone() {
        try {
            // super.clone() keeps the runtime type, so subclasses clone correctly too.
            Phrase copy = (Phrase) super.clone();
            if (_words != null) {
                copy._words = _words.clone();
            }
            return copy;
        } catch (CloneNotSupportedException e) {
            // Cannot happen: this class implements Cloneable.
            throw new AssertionError(e);
        }
    }

    /** Creates a phrase with language 0 and no words. */
    public Phrase() {}

    /**
     * Creates a phrase over the given word ids.
     *
     * @param p word ids; the array is stored as-is, not copied
     * @param lang language tag, narrowed to a byte
     */
    public Phrase(int[] p, int lang) {
        _language = (byte) lang;
        _words = p;
    }

    /** Returns the word array, substituting an empty array when none is set. */
    private int[] wordsOrEmpty() {
        return (_words == null) ? NO_WORDS : _words;
    }

    /**
     * @return the number of words, or 0 when no word array is set
     */
    public int size() {
        return wordsOrEmpty().length;
    }

    /**
     * Two phrases are equal when they have the same language and the same
     * sequence of word ids. A missing word array equals an empty one, which
     * matches {@link #hashCode()}, {@link #compareTo(Object)} and the
     * serialized form (a zero-length phrase is read back without an array).
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof Phrase)) {
            return false;
        }
        Phrase that = (Phrase) o;
        return this._language == that._language
            && java.util.Arrays.equals(this.wordsOrEmpty(), that.wordsOrEmpty());
    }

    /**
     * @return a debug form: {@code [L=<language> <id> <id> ...]}
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("[L=").append(_language);
        for (int word : wordsOrEmpty()) {
            sb.append(' ').append(word);
        }
        sb.append(']');
        return sb.toString();
    }

    /**
     * Orders phrases by language (higher language value first), then
     * element-wise by word id, then by length (shorter first). A missing word
     * array is ordered exactly like an empty one.
     *
     * @param o the phrase to compare with; must be a {@code Phrase}
     * @return negative, zero or positive as this phrase sorts before, with or after {@code o}
     * @throws ClassCastException if {@code o} is not a {@code Phrase}
     */
    public int compareTo(Object o) {
        Phrase that = (Phrase) o;
        if (_language != that._language) {
            // Byte operands: the difference cannot overflow.
            return that._language - this._language;
        }
        int[] mine = this.wordsOrEmpty();
        int[] theirs = that.wordsOrEmpty();
        int shared = Math.min(mine.length, theirs.length);
        for (int i = 0; i < shared; i++) {
            if (mine[i] != theirs[i]) {
                // Explicit comparison: subtracting arbitrary ints can overflow and flip the sign.
                return (mine[i] < theirs[i]) ? -1 : 1;
            }
        }
        // Lengths are non-negative, so this difference cannot overflow.
        return mine.length - theirs.length;
    }

    /**
     * @return a hash of the language and word ids; a missing and an empty word array hash alike
     */
    @Override
    public int hashCode() {
        int hc = (int) _language + 73;
        for (int word : wordsOrEmpty()) {
            hc = (31 * hc) + word;
        }
        return hc;
    }

    /**
     * Renders the phrase through a vocabulary.
     *
     * @param voc vocabulary used to look up each word id
     * @return the looked-up words joined by single spaces; empty when there are no words
     */
    public String toString(Vocab voc) {
        StringBuilder sb = new StringBuilder();
        int[] words = wordsOrEmpty();
        for (int i = 0; i < words.length; ++i) {
            if (i != 0) {
                sb.append(' ');
            }
            sb.append(voc.get(words[i]));
        }
        return sb.toString();
    }

    /**
     * @return the internal word array (not a copy); may be null
     */
    public int[] getWords() { return _words; }

    /**
     * @return the distinct word ids in ascending order; empty when there are no words
     */
    public TreeSet<Integer> getWordsWithoutDuplicates() {
        TreeSet<Integer> vals = new TreeSet<Integer>();
        for (int word : wordsOrEmpty()) {
            vals.add(Integer.valueOf(word));
        }
        return vals;
    }

    /** @return the language tag */
    public byte getLanguage() { return _language; }

    /**
     * @param l language tag, narrowed to a byte
     */
    public void setLanguage(int l) { _language = (byte) l; }

    /**
     * Copies a contiguous range of words into a new phrase of the same language.
     *
     * @param start index of the first word to copy
     * @param end index of the last word to copy (inclusive)
     * @return a new phrase holding words {@code start..end}
     */
    public Phrase getSubPhrase(int start, int end) {
        int count = end - start + 1;
        Phrase res = new Phrase();
        res._language = _language;
        res._words = new int[count];
        System.arraycopy(_words, start, res._words, 0, count);
        return res;
    }

    /**
     * Builds a phrase from whitespace-separated text, mapping every token to
     * an id via {@code voc.addOrGet}.
     *
     * Leading and trailing whitespace is ignored, so it does not produce an
     * empty-string token; a blank sentence yields a phrase with zero words.
     *
     * @param lang language tag, narrowed to a byte
     * @param sentence text to tokenize on runs of whitespace
     * @param voc vocabulary that supplies (or assigns) the id of each token
     * @return the new phrase
     */
    public static Phrase fromString(int lang, String sentence, Vocab voc) {
        Phrase s = new Phrase();
        s._language = (byte) lang;
        String trimmed = sentence.trim();
        if (trimmed.length() == 0) {
            // Splitting "" would give one empty token; there are no words here.
            s._words = new int[0];
            return s;
        }
        String[] tokens = trimmed.split("\\s+");
        s._words = new int[tokens.length];
        for (int i = 0; i < tokens.length; i++) {
            s._words[i] = voc.addOrGet(tokens[i]);
        }
        return s;
    }

    /**
     * Reads the form produced by {@link #write(DataOutput)}: one language
     * byte, the payload length in bytes as an int, then that many bytes of
     * big-endian ints. A zero length leaves this phrase without a word array.
     *
     * @param in source to read from
     * @throws IOException if reading fails, or the length is negative or not a multiple of 4
     */
    public void readFields(DataInput in) throws IOException {
        _language = in.readByte();
        int bbLen = in.readInt();
        if (bbLen == 0) {
            _words = null;
            return;
        }
        // Reject corrupt lengths up front instead of failing with an unchecked
        // exception or silently dropping trailing bytes.
        if (bbLen < 0 || (bbLen % 4) != 0) {
            throw new IOException("Invalid Phrase payload length: " + bbLen);
        }
        ByteBuffer bb = ByteBuffer.allocate(bbLen);
        in.readFully(bb.array());
        _words = new int[bbLen / 4];
        bb.asIntBuffer().get(_words);
    }

    /**
     * @param words word ids; the array is stored as-is, not copied
     */
    public void setWords(int[] words) {
        this._words = words;
    }

    /**
     * Writes one language byte, the payload length in bytes as an int, then
     * the word ids as big-endian ints. A missing or empty word array is
     * written as length 0 with no payload.
     *
     * @param out sink to write to
     * @throws IOException if writing fails
     */
    public void write(DataOutput out) throws IOException {
        out.writeByte(_language);
        int bbLen = wordsOrEmpty().length * 4;
        out.writeInt(bbLen);
        if (bbLen == 0) {
            return;
        }
        ByteBuffer bb = ByteBuffer.allocate(bbLen);
        bb.asIntBuffer().put(_words);
        out.write(bb.array());
    }
}