/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.modules;

import java.io.InputStream;

import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.fst.FSTLookup;
import marytts.server.MaryProperties;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.traversal.NodeIterator;
import org.w3c.dom.traversal.TreeWalker;

/**
 * Minimalistic part-of-speech tagger, using only function word tags as marked in the Transcription GUI.
 * <p>
 * Every token receives a <code>pos</code> attribute with one of three values: <code>$PUNCT</code> for punctuation,
 * <code>function</code> for words found in the optional function-word FST, and <code>content</code> for everything else.
 *
 * @author Sathish Pammi
 * @author Marc Schröder
 */
public class MinimalisticPosTagger extends InternalModule {
	/** Property key prefix ending in <code>"partsofspeech."</code>; the keys "fst" and "punctuation" are appended to it. */
	private String propertyPrefix;
	/** Function-word lookup; stays <code>null</code> when no FST stream is configured. */
	private FSTLookup posFST = null;
	/** Characters regarded as punctuation; set in {@link #startup()}. */
	private String punctuationList;

	/**
	 * Constructor which can be directly called from init info in the config file. Different languages can call this code with
	 * different settings.
	 *
	 * @param locale
	 *            a locale string, e.g. "en"
	 * @param propertyPrefix
	 *            prefix of the property keys to read; a trailing "." is appended if missing, followed by
	 *            "partsofspeech."
	 * @throws Exception
	 *             if the superclass constructor or the locale conversion fails
	 */
	public MinimalisticPosTagger(String locale, String propertyPrefix) throws Exception {
		// NOTE(review): the module name "OpenNLPPosTagger" looks like a copy-paste leftover from another tagger.
		// It is kept unchanged because other code may refer to the module by this name -- confirm before renaming.
		super("OpenNLPPosTagger", MaryDataType.WORDS, MaryDataType.PARTSOFSPEECH, MaryUtils.string2locale(locale));
		if (!propertyPrefix.endsWith(".")) {
			propertyPrefix = propertyPrefix + ".";
		}
		this.propertyPrefix = propertyPrefix + "partsofspeech.";
	}

	/**
	 * Start the module: load the optional function-word FST and the punctuation list from the properties
	 * <code>&lt;prefix&gt;fst</code> and <code>&lt;prefix&gt;punctuation</code>. The punctuation list defaults to
	 * <code>",.?!;"</code>.
	 *
	 * @throws Exception
	 *             if the superclass startup or loading the FST fails
	 */
	@Override
	public void startup() throws Exception {
		super.startup();
		InputStream posFSTStream = MaryProperties.getStream(propertyPrefix + "fst");
		if (posFSTStream != null) {
			// NOTE(review): the stream is never closed in this class; presumably FSTLookup consumes and closes it --
			// confirm, and close it here if it does not.
			posFST = new FSTLookup(posFSTStream, MaryProperties.getProperty(propertyPrefix + "fst"));
		}
		punctuationList = MaryProperties.getProperty(propertyPrefix + "punctuation", ",.?!;");
	}

	/**
	 * Set the <code>pos</code> attribute on every token of every sentence in the document of <code>d</code>.
	 * The document is modified in place and attached to the returned data object.
	 *
	 * @param d
	 *            input data whose document contains the sentences and tokens to tag
	 * @return a new MaryData of this module's output type and the locale of <code>d</code>, holding the tagged document
	 * @throws Exception
	 *             if tagging fails
	 */
	public MaryData process(MaryData d) throws Exception {
		Document doc = d.getDocument();
		NodeIterator sentenceIt = MaryDomUtils.createNodeIterator(doc, doc, MaryXML.SENTENCE);
		Element sentence;
		while ((sentence = (Element) sentenceIt.nextNode()) != null) {
			TreeWalker tokenIt = MaryDomUtils.createTreeWalker(sentence, MaryXML.TOKEN);
			Element t;
			while ((t = (Element) tokenIt.nextNode()) != null) {
				t.setAttribute("pos", determinePos(MaryDomUtils.tokenText(t)));
			}
		}
		MaryData output = new MaryData(outputType(), d.getLocale());
		output.setDocument(doc);
		return output;
	}

	/**
	 * Decide the part-of-speech tag for a single token text.
	 *
	 * @param tokenText
	 *            the text of the token
	 * @return "$PUNCT", "function" or "content"
	 */
	private String determinePos(String tokenText) {
		// String.contains("") is always true, so without the emptiness guard a token with no text
		// would wrongly be tagged as punctuation. Note that contains() is a substring test, so a
		// multi-character token matches only if it occurs contiguously in the punctuation list.
		if (!tokenText.isEmpty() && punctuationList.contains(tokenText)) {
			return "$PUNCT";
		}
		if (posFST != null) {
			String[] result = posFST.lookup(tokenText);
			// Any hit in the FST marks the token as a function word.
			if (result.length != 0) {
				return "function";
			}
		}
		return "content";
	}
}