/**
* Copyright 2000-2009 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.fst;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
*
* This is a particular Trie whose Symbols are Pairs of Strings, the first of which is interpreted as an input symbol and the
* second as an output symbol. The transducer obtained by trie minimization can be written to a file in the transducer format used
* by MARY.
*
* To get the transducer representation first compute the minimization of the trie, then write the transducer to disk.
*
* See main method for example usage.
*
* @author benjaminroth
*
*/
public class TransducerTrie extends Trie<StringPair> {

    /** Number of low-order bits of an arc word that hold the arc offset of the target node. */
    static int ARCOFFSET_BITS = 20;

    /** Number of bits of one arc word as announced in the file header (arcs are written with writeInt). */
    static int OVERALL_BITS = 32;// for example

    /** Bits left for the label id: everything except the arc offset and the single "last arc" flag bit. */
    static int LABELID_BITS = OVERALL_BITS - (ARCOFFSET_BITS + 1);

    /**
     * Largest label offset that {@link DataOutputStream#writeShort(int)} can store without losing high-order bits.
     * NOTE(review): a reader that uses a signed 16-bit read would only cope with offsets up to 0x7FFF - confirm against
     * the lookup code.
     */
    private static final int MAX_LABEL_OFFSET = 0xFFFF;

    /**
     * Writes the minimized trie to <code>out</code> in the binary transducer format.
     * <p>
     * Layout, in writing order: length of the encoding name (int), encoding name in UTF-8, {@link #OVERALL_BITS} (int),
     * {@link #ARCOFFSET_BITS} (int), number of arcs (int), the start arc, the arcs of all representative nodes (one int
     * each), number of label pairs including the two empty ones (int), the offsets of all label strings (one short each),
     * and finally the zero-terminated label strings in the given encoding.
     * <p>
     * All checks are carried out before the first byte is written, so a failed check leaves <code>out</code> untouched.
     *
     * @param out
     *            stream the transducer is written to; it is neither flushed nor closed here
     * @param encoding
     *            name of the charset used to encode the label strings
     * @throws IllegalStateException
     *             if the minimization has not been computed yet, or if a non-final node without outgoing arcs is found
     * @throws IOException
     *             if the number of arcs, the number of labels or a label offset does not fit into the format, if the
     *             encoding is not supported, or if writing fails
     */
    public void writeFST(DataOutputStream out, String encoding) throws IOException {
        if (null == this.reprs)
            throw new IllegalStateException("Cannot write transducer: first compute minimization of trie.");

        // ---- validation phase: nothing is written until everything is known to fit ----

        int[] arcOffsets = computeFstArcOffsets();
        // the last entry is the total number of arcs (including the start arc)
        int maxAO = arcOffsets[arcOffsets.length - 1];
        // shift to right by the number of available bits and look if something remains
        if ((maxAO >> ARCOFFSET_BITS) != 0) {
            throw new IOException("Cannot write transducer: too many arcs to be encoded in binary fst format (would need "
                    + bitsNeeded(maxAO) + " bits, have " + ARCOFFSET_BITS + ")");
        }

        int maxLID = this.labels.size() + 2;
        if ((maxLID >> LABELID_BITS) != 0) {
            throw new IOException("Cannot write transducer: too many arc-labels to be encoded in binary fst format (would need "
                    + bitsNeeded(maxLID) + " bits, have " + LABELID_BITS + ")");
        }

        if (!Charset.isSupported(encoding))
            throw new IOException("Cannot write transducer: encoding not supported.");

        // encode every label string exactly once; entry 2*i is the input symbol, 2*i+1 the output symbol of label i
        byte[][] encodedLabels = encodeFstLabels(encoding);

        // The first four bytes of the string area belong to the two empty pairs, so real labels start at offset 4.
        // Each string occupies its encoded length plus one stop byte.
        int[] labelOffsets = new int[encodedLabels.length];
        int nextOffset = 4;
        for (int i = 0; i < encodedLabels.length; i++) {
            if (nextOffset > MAX_LABEL_OFFSET) {
                throw new IOException("Cannot write transducer: label strings too long to be encoded in binary fst format"
                        + " (label offset " + nextOffset + " does not fit into 16 bits)");
            }
            labelOffsets[i] = nextOffset;
            nextOffset += encodedLabels[i].length + 1;
        }

        // ---- writing phase ----

        // encoding name in UTF-8, preceded by its length
        byte[] encodingName = encoding.getBytes("UTF-8");
        out.writeInt(encodingName.length);
        out.write(encodingName);
        // write overall bits
        out.writeInt(OVERALL_BITS);
        // write bits used for encoding arc_offsets
        out.writeInt(ARCOFFSET_BITS);
        // write number of arcs
        out.writeInt(maxAO);

        // the highest bit of an arc word marks the last arc of a node
        final int lastArcFlag = 1 << (OVERALL_BITS - 1);

        // starting arc: points to the arcs of the root, carries label id 1 (an empty pair) and is the only arc of its node
        out.writeInt(arcOffsets[root.getId()] | 1 << ARCOFFSET_BITS | lastArcFlag);

        // arcs of all representative nodes; real label ids are shifted by two because ids 0 and 1 are the empty pairs
        for (TrieNode repr : rlist) {
            List<Integer> arcVals = new ArrayList<Integer>();
            for (Integer labelId : repr.getArcMap().keySet()) {
                int targetId = repr.getArcMap().get(labelId).getId();
                arcVals.add(arcOffsets[targetId] | (labelId + 2) << ARCOFFSET_BITS);
            }
            if (repr.isFinal) {
                // the added "final arc": label id 0, target is the node itself, always the last arc of the node
                arcVals.add(arcOffsets[repr.getId()] | lastArcFlag);
            } else {
                // mark last of the arcs as "last"; computeFstArcOffsets() guarantees there is at least one
                int last = arcVals.size() - 1;
                arcVals.set(last, arcVals.get(last) | lastArcFlag);
            }
            for (Integer val : arcVals) {
                out.writeInt(val);
            }
        }

        // write number of pairs
        out.writeInt(labels.size() + 2);
        // offsets of the four empty strings that make up the two empty pairs
        out.writeShort(0);
        out.writeShort(1);
        out.writeShort(2);
        out.writeShort(3);
        // write pair offsets
        for (int i = 0; i < labelOffsets.length; i++) {
            out.writeShort(labelOffsets[i]);
        }

        // write first two pairs: just empty symbols, i.e. four stop bytes
        out.writeByte(0);
        out.writeByte(0);
        out.writeByte(0);
        out.writeByte(0);
        // write pairs, every string followed by a stop byte
        for (int i = 0; i < encodedLabels.length; i++) {
            out.write(encodedLabels[i]);
            out.writeByte(0);
        }
    }

    /**
     * Computes, for every representative node, the index of its first arc in the arc table. Index 0 is taken by the
     * additional start arc, so the first node starts at 1. A final node gets one extra slot for its "final arc".
     *
     * @return array with one entry per node plus a trailing entry holding the total number of arcs
     * @throws IllegalStateException
     *             if a node is neither final nor has outgoing arcs; such a node has no arc that could be written
     */
    private int[] computeFstArcOffsets() {
        int[] arcOffsets = new int[this.reprs.size() + 1];
        // first has offset one (consider additional start arc)
        arcOffsets[0] = 1;
        for (int i = 0; i < rlist.size(); i++) {
            TrieNode node = rlist.get(i);
            int arcCount = node.getArcMap().size();
            // if final, consider the added "final arc"
            if (node.isFinal) {
                arcCount += 1;
            }
            if (arcCount == 0) {
                throw new IllegalStateException(
                        "Cannot write transducer: found a non-final node without outgoing arcs (is the trie empty?).");
            }
            arcOffsets[i + 1] = arcOffsets[i] + arcCount;
        }
        return arcOffsets;
    }

    /**
     * Encodes the input and output string of every label.
     *
     * @param encoding
     *            name of the charset to use
     * @return array of length <code>2 * labels.size()</code>; entry <code>2*i</code> is the encoded input symbol and
     *         entry <code>2*i+1</code> the encoded output symbol of label <code>i</code>
     * @throws IOException
     *             if the encoding is not supported
     */
    private byte[][] encodeFstLabels(String encoding) throws IOException {
        byte[][] encoded = new byte[this.labels.size() * 2][];
        for (int i = 0; i < labels.size(); i++) {
            StringPair ioSym = labels.get(i);
            encoded[i * 2] = ioSym.getString1().getBytes(encoding);
            encoded[i * 2 + 1] = ioSym.getString2().getBytes(encoding);
        }
        return encoded;
    }

    /**
     * Number of bits needed to represent <code>value</code> as an unsigned binary number, e.g. 3 for 4..7 and 4 for 8.
     *
     * @param value
     *            the number to represent
     * @return position of the highest set bit plus one; 0 for 0
     */
    private static int bitsNeeded(int value) {
        return Integer.SIZE - Integer.numberOfLeadingZeros(value);
    }

    /**
     * Example usage: reads <code>lexicon.txt</code> from a directory, aligns it, enters the alignments into a trie,
     * minimizes the trie, writes it to <code>lexicon.fst</code> in the same directory and looks up a few test words.
     *
     * @param args
     *            optional; <code>args[0]</code> is the directory containing <code>lexicon.txt</code>. Without arguments
     *            the original author's directory is used.
     * @throws IOException
     *             if the lexicon cannot be read or the transducer cannot be written or loaded
     */
    public static void main(String[] args) throws IOException {
        // example usage
        String path = args.length > 0 ? args[0] : "/Users/benjaminroth/Desktop/mary/fst/german/";
        // specify location of lexicon you want to encode
        File lexiconFile = new File(path, "lexicon.txt");
        // specify location of output
        String fstLocation = new File(path, "lexicon.fst").getPath();

        // initialize trainer
        AlignerTrainer at = new AlignerTrainer(false, true);

        BufferedReader lexReader = new BufferedReader(new InputStreamReader(new FileInputStream(lexiconFile), "ISO-8859-1"));
        try {
            System.out.println("reading lexicon...");
            // read lexicon for training
            at.readLexicon(lexReader, "\\\\");
        } finally {
            // release the file handle even if reading fails
            lexReader.close();
        }
        System.out.println("...done!");

        System.out.println("aligning...");
        long start = System.currentTimeMillis();
        // make some alignment iterations
        for (int i = 0; i < 4; i++) {
            System.out.println(" iteration " + (i + 1));
            at.alignIteration();
        }
        long time = System.currentTimeMillis() - start;
        System.out.println("...done!");
        System.out.println("alignment took " + time + "ms");

        TransducerTrie t = new TransducerTrie();
        System.out.println("entering alignments in trie...");
        for (int i = 0; i < at.lexiconSize(); i++) {
            t.add(at.getAlignment(i));
            t.add(at.getInfoAlignment(i));
        }
        System.out.println("...done!");

        System.out.println("minimizing trie...");
        t.computeMinimization();
        System.out.println("...done!");

        System.out.println("writing transducer to disk...");
        DataOutputStream os = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(new File(fstLocation))));
        try {
            t.writeFST(os, "UTF-8");
            os.flush();
        } finally {
            // close the file even if writing fails
            os.close();
        }
        System.out.println("...done!");

        System.out.println("looking up test words...");
        FSTLookup fst = new FSTLookup(fstLocation);
        String[] testWords = { "Fahrrad", "fahren", "Umwelt", "schonen", "abgerechnet", "abgerechnet(A)", "absorbieren",
                "absorbieren(WV1b)", "übersetzen" };
        for (int i = 0; i < testWords.length; i++) {
            String word = testWords[i];
            System.out.println(" " + word + " -> " + Arrays.toString(fst.lookup(word)));
        }
        System.out.println("...done!");
    }
}