/** * * Copyright 2014 Universität Hamburg. * Portions Copyright 1999-2012 Carnegie Mellon University. * Portions Copyright 2002 Sun Microsystems, Inc. * Portions Copyright 2002 Mitsubishi Electric Research Laboratories. * All Rights Reserved. Use is subject to license terms. * * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. * */ package edu.cmu.sphinx.fst.sequitur; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.Iterator; import java.util.List; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; import javax.xml.bind.Unmarshaller; import javax.xml.bind.annotation.XmlAttribute; import javax.xml.bind.annotation.XmlElement; import javax.xml.bind.annotation.XmlMixed; import javax.xml.bind.annotation.XmlRootElement; import edu.cmu.sphinx.fst.Fst; import edu.cmu.sphinx.fst.semiring.Semiring; import edu.cmu.sphinx.fst.semiring.TropicalSemiring; /** * Converter for an Fst in Sequitur G2P's XML to Sphinx binary OpenFst format. * * Sequitur G2P (http://www-i6.informatik.rwth-aachen.de/web/Software/g2p.html) * provides easy-to-build G2P training facilities. Its binary models can be * converted to an XML FSA-format using fsa.py which is provided with Sequitur. * * This program reads the XML and constructs a @link{edu.cmu.sphinx.fst.Fst}, * which is then serialized into the Sphinx binary OpenFst format (but could * also be used directly). * * NOTICE: Sequitur's fsa.py does not in all cases construct valid XML, * specifically it fails to encode XML character entities &, <, and > * if these were part of the training material. If in doubt, please check for * and replace them in the alphabet portion of the XML prior to using this * converter. * * Implementation details: - we add a state for <s> to the end of both * symbol alphabets - we increment all state IDs in the states and in the arcs - * we add a new zero'th state which transitions via <s>:<s> to the * (new) first state * * @author Johannes Twiefel, Timo Baumann */ public class SequiturImport { @XmlRootElement(name = "fsa") public static class FSA { @XmlAttribute String semiring; @XmlAttribute int initial; // first real state @XmlElement(name = "input-alphabet") Alphabet inputAlphabet; @XmlElement(name = "output-alphabet") Alphabet outputAlphabet; @XmlElement(name = "state") List<State> states; transient List<edu.cmu.sphinx.fst.State> openFstStates; transient Semiring ring = new TropicalSemiring(); public void afterUnmarshal(Unmarshaller unmarshaller, Object parent) { // might also work with other formats, but we have never seen any // other format assert "tropical".equals(semiring); // add a new initial state that transduces <s>:<s> and transitions // to the first real state State initialState = new State(); initialState.id = 0; Arc initialArc = new Arc(); initialArc.in = inputAlphabet.symbols.size() - 1; initialArc.out = outputAlphabet.symbols.size() - 1; initialArc.target = initial + 1; initialArc.weight = ring.one(); initialState.arcs = Collections.<Arc> singletonList(initialArc); states.add(initialState); // sort the states (to ascertain that initialState is the first // element) Collections.<State> sort(states, new Comparator<State>() { public int compare(State s1, State s2) { return s1.id - s2.id; } }); } /** * convert our object to the Sphinx OpenFst data structure * * @return an edu.cmu.sphinx.fst.Fst built from the XML */ public Fst toFst() { Fst openFst = new Fst(ring); openFst.setIsyms(inputAlphabet.toSymbols()); openFst.setOsyms(outputAlphabet.toSymbols()); openFstStates = new ArrayList<edu.cmu.sphinx.fst.State>( states.size()); for (State state : states) { edu.cmu.sphinx.fst.State openFstState = state .toUnconnectedOpenFstState(); openFst.addState(openFstState); assert openFstState.getId() == state.id; openFstStates.add(openFstState); } openFst.setStart(openFstStates.get(0)); // second pass (now that all openFst states are created) to add all // the openFst arcs for (State state : states) { state.connectStates(openFstStates); } return openFst; } } public static class Alphabet { @XmlElement(name = "symbol") List<Symbol> symbols; public void afterUnmarshal(Unmarshaller unmarshaller, Object parent) { Iterator<Symbol> it = symbols.iterator(); while (it.hasNext()) { if (it.next().content.matches("__\\d+__")) it.remove(); } for (int i = 0; i < symbols.size(); i++) { assert symbols.get(i).index != null; assert symbols.get(i).index == i; symbols.get(i).index = null; } Symbol s = new Symbol(); s.content = "<s>"; symbols.add(s); } String[] toSymbols() { String[] out = new String[symbols.size()]; for (int i = 0; i < out.length; i++) { out[i] = symbols.get(i).content; } return out; } } public static class Symbol { @XmlAttribute Integer index; @XmlMixed List<String> contentList; transient String content; public void afterUnmarshal(Unmarshaller unmarshaller, Object parent) { assert contentList != null : "Error with symbol " + index; assert contentList.size() == 1 : "Error with symbol " + index; this.content = contentList.get(0); if (content.equals("__term__")) { content = "</s>"; } else if (content.matches("__.+__")) { content = "<eps>"; } } } public static class State { @XmlAttribute int id; @XmlElement(name = "final") Object finalState; @XmlElement Float weight; @XmlElement(name = "arc") List<Arc> arcs; public void afterUnmarshal(Unmarshaller unmarshaller, Object parent) { id++; // increment state ID (because we add a new initial state 0 // transitioning via <s>:<s>) } /** * @return a first approximation State which does not yet incoroporate * arcs */ public edu.cmu.sphinx.fst.State toUnconnectedOpenFstState() { return new edu.cmu.sphinx.fst.State(weight != null ? weight : 0.f); } /** * add arcs to the state now that all states are available as possible * targets * @param openFstStates source states */ public void connectStates(List<edu.cmu.sphinx.fst.State> openFstStates) { if (arcs != null) for (Arc arc : arcs) { edu.cmu.sphinx.fst.Arc openFstArc = arc .toOpenFstArc(openFstStates); openFstStates.get(id).addArc(openFstArc); } } } public static class Arc { @XmlAttribute int target; @XmlElement int in; // automatically set to 0 (which corresponds to epsilon) if not // set in XML @XmlElement int out; // automatically set to 0 (which corresponds to epsilon) if not // set in XML @XmlElement float weight; public void afterUnmarshal(Unmarshaller unmarshaller, Object parent) { target++; // increment state ID (because we add a new initial state // 0 transitioning via <s>:<s>) } public edu.cmu.sphinx.fst.Arc toOpenFstArc( List<edu.cmu.sphinx.fst.State> openFstStates) { return new edu.cmu.sphinx.fst.Arc(in, out, weight, openFstStates.get(target)); } } /** * Load a Sequitur FSA in XML format and store it in Sphinx' OpenFst * binary/serialized format. * * @param args * filename of input file, filename of output file * @throws JAXBException * indicating that XML could not be read * @throws IOException * indicating that file-handling does not work */ public static void main(String... args) throws JAXBException, IOException { JAXBContext context = JAXBContext.newInstance(FSA.class); Unmarshaller unmarshaller = context.createUnmarshaller(); FSA fsa = (FSA) unmarshaller.unmarshal(new File(args[0])); edu.cmu.sphinx.fst.Fst fst = fsa.toFst(); fst.saveModel(args[1]); System.out .println("The Sequitur G2P XML-formatted FST " + args[0] + " has been converted to Sphinx' OpenFst binary format in the file " + args[1]); // uncomment this to test your model: // edu.cmu.sphinx.linguist.g2p.G2PConverter d = new // edu.cmu.sphinx.linguist.g2p.G2PConverter(args[1]); // List<edu.cmu.sphinx.linguist.g2p.Path> path = // d.phoneticize("wahnsinn", 5); // System.err.println(path); } }