/*
	Copyright (c) 2009-2011
		Speech Group at Informatik 5, Univ. Erlangen-Nuremberg, GERMANY
		Korbinian Riedhammer
		Tobias Bocklet

	This file is part of the Java Speech Toolkit (JSTK).

	The JSTK is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	The JSTK is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with the JSTK. If not, see <http://www.gnu.org/licenses/>.
*/
package de.fau.cs.jstk.arch;

import java.io.IOException;

/**
 * A Tokenization is the word and the respective token sequence.
 * <p>
 * Identity ({@link #equals(Object)}, {@link #hashCode()}) and ordering
 * ({@link #compareTo(Tokenization)}) are based on the word only; the token
 * sequence is ignored by all three.
 *
 * @author sikoried
 */
public final class Tokenization implements Comparable<Tokenization> {
	/** Shared empty sequence used for word-only instances. */
	private static final String [] nullseq = new String [0];

	/** The actual word */
	public String word;

	/** The token sequence using tokens */
	public String [] sequence;

	/**
	 * Generate a phony Tokenization with only the word but no actual
	 * tokenization. This can be used to search within the Tokenizer.
	 *
	 * @param word the word; the sequence is set to an empty array
	 */
	public Tokenization(String word) {
		this.word = word;
		this.sequence = nullseq;
	}

	/**
	 * Create a new lexicon entry using given word and token sequence.
	 *
	 * @param word the word
	 * @param sequence the token sequence; the array is stored as-is (not copied)
	 */
	public Tokenization(String word, String [] sequence) {
		this.word = word;
		this.sequence = sequence;
	}

	/**
	 * Create a new lexicon entry using the given line containing something
	 * like "word tok1 [tok2 ...]". The line is trimmed and split at runs of
	 * whitespace; the first field becomes the word, the remaining fields
	 * become the token sequence.
	 *
	 * @param line the lexicon line to parse
	 * @param alphabet used to validate every token via {@code isValidToken}
	 * @throws IOException if a token is rejected by the alphabet
	 */
	public Tokenization(String line, Alphabet alphabet) throws IOException {
		String [] split = line.trim().split("\\s+");

		this.word = split[0];
		this.sequence = new String [split.length - 1];

		for (int i = 1; i < split.length; ++i) {
			if (!alphabet.isValidToken(split[i])) {
				throw new IOException("Unknown token '" + split[i] + "'");
			}
			sequence[i - 1] = split[i];
		}
	}

	/**
	 * Lexical sort by word. A {@code null} word sorts before any non-null
	 * word, which keeps the ordering consistent with {@link #equals(Object)}.
	 *
	 * @param e the Tokenization to compare against
	 * @return negative, zero or positive as this word is less than, equal to
	 *         or greater than the other word
	 * @throws NullPointerException if {@code e} is {@code null}
	 */
	@Override
	public int compareTo(Tokenization e) {
		String other = e.word;
		if (word == null) {
			return (other == null) ? 0 : -1;
		}
		if (other == null) {
			return 1;
		}
		return word.compareTo(other);
	}

	/**
	 * Two Tokenizations are equal if the respective word is the same.
	 * Two {@code null} words are considered equal.
	 */
	@Override
	public boolean equals(Object o) {
		if (this == o) {
			return true;
		}
		if (!(o instanceof Tokenization)) {
			return false;
		}
		String other = ((Tokenization) o).word;
		// null-safe: the word is a public field and may have been set to null
		return (word == null) ? (other == null) : word.equals(other);
	}

	/**
	 * @return the word followed by each token, separated by single spaces
	 */
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		sb.append(word);
		for (String t : sequence) {
			sb.append(' ').append(t);
		}
		return sb.toString();
	}

	/**
	 * Hash of the word only, matching {@link #equals(Object)}; 0 for a
	 * {@code null} word.
	 */
	@Override
	public int hashCode() {
		return (word == null) ? 0 : word.hashCode();
	}
}