/**
 * Copyright 2003-2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.fst;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;

/**
 * An implementation of a finite state transducer. This class does nothing but load and represent the FST. It is used by other
 * classes doing something reasonable with it.
 * <p>
 * Two on-disk layouts are understood:
 * <ul>
 * <li>the current format, which starts with a header (length-prefixed encoding name, overall bit count, arc offset bit
 * count) followed by the body;</li>
 * <li>the headerless legacy format, which consists of the body only and needs the encoding to be supplied by the caller.</li>
 * </ul>
 * The body is: arc count, one 32-bit word per arc, pair count, two 16-bit offsets per pair, and then a table of
 * zero-terminated strings that extends to the end of the stream.
 *
 * @author Andreas Eisele
 */
public class FST {
    /** Number of low-order bits of an arc word that hold the target. */
    private static final int TARGET_BITS = 20;

    /** Mask selecting the target from an arc word (the low 20 bits, i.e. 1048575). */
    private static final int TARGET_MASK = (1 << TARGET_BITS) - 1;

    /** Mask selecting the label once the target bits have been shifted out (11 bits, i.e. 2047). */
    private static final int LABEL_MASK = (1 << 11) - 1;

    /** Only value of the header's "overall bits" field that this loader accepts. */
    private static final int SUPPORTED_OVERALL_BITS = 32;

    /**
     * Upper bound for the encoding name length found in the header. Real charset names are far shorter; the bound keeps a
     * legacy or corrupt file from triggering a huge allocation.
     */
    private static final int MAX_ENCODING_NAME_LENGTH = 256;

    private static final String BAD_ENCODING_MESSAGE = "Encoding of FST file not correctly specified. Maybe file in old format.";

    // The following variables are package-readable, so that they can be
    // directly accessed by all classes in this package.

    /** Per arc: the low 20 bits of the arc word. */
    int[] targets;

    /** Per arc: bits 20 to 30 of the arc word. */
    short[] labels;

    /** Per arc: whether the top bit (bit 31) of the arc word is set. */
    boolean[] isLast;

    /** Two 16-bit values per pair, in file order. NOTE(review): presumably indices into {@link #bytes} - confirm with users. */
    short[] offsets;

    /** The raw table of zero-terminated strings that follows the offsets in the file. */
    byte[] bytes;

    /**
     * Same length as {@link #bytes}. For every position at which a zero-terminated string starts, holds the index of that
     * string in {@link #strings}; all other positions are 0.
     */
    int[] mapping;

    /** The decoded zero-terminated strings of {@link #bytes}, in the order in which they occur. */
    ArrayList<String> strings = new ArrayList<String>();

    /**
     * Load the fst from the given file. Assumes header.
     *
     * @param fileName
     *            the name of the file from which to load the FST.
     * @throws IOException
     *             if the FST cannot be loaded from the given file.
     */
    public FST(String fileName) throws IOException {
        FileInputStream fis = new FileInputStream(fileName);
        try {
            load(fis);
        } finally {
            fis.close();
        }
    }

    /**
     * Load the fst from the given input stream. Assumes header. The stream is read up to its end and is closed after a
     * successful load.
     *
     * @param inStream
     *            the stream to read the FST from.
     * @throws IOException
     *             if the FST cannot be loaded from the stream.
     */
    public FST(InputStream inStream) throws IOException {
        load(inStream);
    }

    /**
     * Initialise the finite state transducer. Loads from headerless legacy file format.
     *
     * @param fileName
     *            the name of the file from which to load the FST.
     * @param encoding
     *            the name of the encoding used in the file (e.g., UTF-8 or ISO-8859-1).
     * @throws IOException
     *             if the FST cannot be loaded from the given file.
     * @throws UnsupportedEncodingException
     *             if the encoding is not supported.
     */
    public FST(String fileName, String encoding) throws IOException, UnsupportedEncodingException {
        this(fileName, encoding, false);
    }

    /**
     * Initialise the finite state transducer. Loads from headerless legacy file format. This constructor will assume that
     * the file uses the system default encoding.
     *
     * @param fileName
     *            the name of the file from which to load the FST.
     * @param verbose
     *            whether to write a report to stderr after loading.
     * @throws IOException
     *             if the FST cannot be loaded from the given file.
     */
    public FST(String fileName, boolean verbose) throws IOException {
        this(fileName, null, verbose);
    }

    /**
     * Initialise the finite state transducer. This constructor is to be used for old FST-files where the encoding was not
     * yet specified in the header.
     *
     * @param fileName
     *            the name of the file from which to load the FST.
     * @param encoding
     *            the name of the encoding used in the file (e.g., UTF-8 or ISO-8859-1); <code>null</code> selects the
     *            system default encoding.
     * @param verbose
     *            whether to write a report to stderr after loading.
     * @throws IOException
     *             if the FST cannot be loaded from the given file.
     * @throws UnsupportedEncodingException
     *             if the encoding is not supported.
     */
    public FST(String fileName, String encoding, boolean verbose) throws IOException, UnsupportedEncodingException {
        FileInputStream fis = new FileInputStream(fileName);
        try {
            loadHeaderless(fis, encoding, verbose);
        } finally {
            fis.close();
        }
    }

    /**
     * Load the fst from the given input stream. Assumes headerless legacy file format. The stream is read up to its end and
     * is closed after a successful load.
     *
     * @param inStream
     *            the stream to read the FST from.
     * @param encoding
     *            the name of the encoding used in the stream; <code>null</code> selects the system default encoding.
     * @throws IOException
     *             if the FST cannot be loaded from the stream.
     * @throws UnsupportedEncodingException
     *             if the encoding is not supported.
     */
    public FST(InputStream inStream, String encoding) throws IOException, UnsupportedEncodingException {
        loadHeaderless(inStream, encoding, false);
    }

    /**
     * Read an FST in the current format: header first, then the body.
     *
     * @param inStream
     *            the stream to read from; closed after a successful load.
     * @throws IOException
     *             if the stream ends prematurely, the header does not name a usable encoding, the bit allocation is not the
     *             standard 32/20 one, or the body is corrupt.
     * @throws UnsupportedEncodingException
     *             if the strings cannot be decoded with the encoding named in the header.
     */
    private void load(InputStream inStream) throws IOException, UnsupportedEncodingException {
        DataInputStream in = new DataInputStream(new BufferedInputStream(inStream));

        // A headerless legacy file starts with its arc count, which then shows up here as an implausible
        // "encoding name length"; reject it before allocating anything.
        int encLen = in.readInt();
        if (encLen <= 0 || encLen > MAX_ENCODING_NAME_LENGTH) {
            throw new IOException(BAD_ENCODING_MESSAGE);
        }
        byte[] encBytes = new byte[encLen];
        // readFully, because a plain read() is allowed to deliver fewer bytes than requested.
        in.readFully(encBytes);
        String encoding = new String(encBytes, "UTF-8");
        if (!isUsableCharsetName(encoding)) {
            throw new IOException(BAD_ENCODING_MESSAGE);
        }

        int overallBits = in.readInt();
        int arcOffBits = in.readInt();
        // todo: allow for more flexibility
        if (overallBits != SUPPORTED_OVERALL_BITS || arcOffBits != TARGET_BITS) {
            throw new IOException("Cannot handle non-standard bit allocation for label and arc id's.");
        }

        // Header size: three ints plus the encoding name.
        readBody(in, 12 + encLen, false);
        in.close();
        createMapping(mapping, bytes, encoding);
    }

    /**
     * Read an FST in the headerless legacy format, i.e. the body only.
     *
     * @param inStream
     *            the stream to read from; closed after a successful load.
     * @param encoding
     *            the name of the encoding of the strings, or <code>null</code> for the system default encoding.
     * @param verbose
     *            whether to write a report to stderr after loading.
     * @throws IOException
     *             if the stream ends prematurely or the body is corrupt.
     * @throws UnsupportedEncodingException
     *             if the strings cannot be decoded with the given encoding.
     */
    private void loadHeaderless(InputStream inStream, String encoding, boolean verbose) throws IOException,
            UnsupportedEncodingException {
        DataInputStream in = new DataInputStream(new BufferedInputStream(inStream));
        readBody(in, 0, verbose);
        in.close();
        createMapping(mapping, bytes, encoding);
    }

    /**
     * Tell whether the given name denotes a charset available in this JVM.
     *
     * @param name
     *            the candidate charset name.
     * @return true if the charset is supported; false if it is unknown or the name is not even a legal charset name.
     */
    private static boolean isUsableCharsetName(String name) {
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException e) {
            // Charset.isSupported throws IllegalCharsetNameException (a subclass) instead of returning false
            // when the name contains illegal characters.
            return false;
        }
    }

    /**
     * Read the part of the file that both formats share and fill {@link #targets}, {@link #labels}, {@link #isLast},
     * {@link #offsets}, {@link #bytes} and {@link #mapping} (the latter is only allocated here).
     *
     * @param in
     *            the stream, positioned at the arc count.
     * @param headerSize
     *            the number of bytes consumed before the body; only used for the verbose report.
     * @param verbose
     *            whether to write a report to stderr.
     * @throws IOException
     *             if the stream ends prematurely or contains a negative or oversized count.
     */
    private void readBody(DataInputStream in, int headerSize, boolean verbose) throws IOException {
        int nArcs = in.readInt();
        if (nArcs < 0) {
            throw new IOException("Corrupt FST file: negative number of arcs (" + nArcs + ").");
        }
        targets = new int[nArcs];
        labels = new short[nArcs];
        isLast = new boolean[nArcs];
        for (int i = 0; i < nArcs; i++) {
            int thisArc = in.readInt();
            targets[i] = thisArc & TARGET_MASK;
            labels[i] = (short) ((thisArc >> TARGET_BITS) & LABEL_MASK);
            // The top bit of the word is the sign bit.
            isLast[i] = thisArc < 0;
        }

        int nPairs = in.readInt();
        if (nPairs < 0 || nPairs > Integer.MAX_VALUE / 2) {
            throw new IOException("Corrupt FST file: invalid number of pairs (" + nPairs + ").");
        }
        offsets = new short[2 * nPairs];
        for (int i = 0; i < offsets.length; i++) {
            offsets[i] = in.readShort();
        }

        // Everything up to the end of the stream is the string table. Its size is deliberately not derived from
        // InputStream.available(), which is only an estimate and under-reports for many kinds of streams.
        bytes = readRemaining(in);
        mapping = new int[bytes.length];

        if (verbose) {
            long fileSize = headerSize + 8L + 4L * ((long) nPairs + nArcs) + bytes.length;
            System.err.println("FST (" + fileSize + " Bytes, " + nArcs + " Arcs, " + nPairs + " Labels)" + " loaded");
        }
    }

    /**
     * Read the given stream up to its end.
     *
     * @param in
     *            the stream to drain.
     * @return all bytes that were left in the stream; possibly empty, never null.
     * @throws IOException
     *             if reading fails.
     */
    private static byte[] readRemaining(InputStream in) throws IOException {
        // available() serves as a capacity hint only; correctness does not depend on it.
        ByteArrayOutputStream collected = new ByteArrayOutputStream(Math.max(32, in.available()));
        byte[] chunk = new byte[8192];
        int count;
        while ((count = in.read(chunk)) != -1) {
            collected.write(chunk, 0, count);
        }
        return collected.toByteArray();
    }

    /**
     * Split the string table into its zero-terminated strings. Each string is appended to {@link #strings}, and its index in
     * that list is stored in <code>mapping</code> at the position where the string starts in <code>bytes</code>. Bytes after
     * the last zero byte are ignored.
     *
     * @param mapping
     *            the array to fill; must be at least as long as <code>bytes</code>.
     * @param bytes
     *            the string table.
     * @param encoding
     *            the name of the encoding of the strings, or <code>null</code> for the system default encoding.
     * @throws UnsupportedEncodingException
     *             if a non-empty string has to be decoded and the encoding is not supported.
     */
    private void createMapping(int[] mapping, byte[] bytes, String encoding) throws UnsupportedEncodingException {
        if (mapping.length > 0) {
            mapping[0] = 0;
        }
        int last0 = -1; // position of the most recent terminator
        for (int i = 0; i < bytes.length; i++) {
            if (bytes[i] != 0) {
                continue;
            }
            int start = last0 + 1;
            int len = i - start;
            String str;
            if (len == 0) {
                str = "";
            } else if (encoding != null) {
                str = new String(bytes, start, len, encoding);
            } else {
                str = new String(bytes, start, len);
            }
            strings.add(str);
            mapping[start] = strings.size() - 1;
            last0 = i;
        }
    }
}