/**
 * Copyright 2002 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.language.it.preprocess;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import marytts.util.MaryUtils;

import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

/**
 * An expansion pattern implementation for specialChar patterns.
 * <p>
 * The class keeps a small table that maps a special character (for example <code>$</code>) to its spoken form and to two
 * flags: whether a token is split at that character, and whether the character is pronounced when it is left over as a
 * token of its own. All regular expressions and helper strings exposed by this class are derived from that table when an
 * instance is created.
 *
 * @author Marc Schröder
 */
public class SpecialCharEP extends ExpansionPattern {

    /** Regular expression that matches nothing; used in place of an (illegal) empty character class. */
    private static final String NEVER_MATCHING_REGEX = "(?!)";

    private final String[] _knownTypes = { "specialChar" };

    /**
     * Every subclass has its own list knownTypes, an internal string representation of known types. These are possible
     * values of the <code>type</code> attribute to the <code>say-as</code> element, as defined in MaryXML.dtd. If there is
     * more than one known type, the first type (<code>knownTypes[0]</code>) is expected to be the most general one, of
     * which the others are specialisations.
     */
    private final List<String> knownTypes = Arrays.asList(_knownTypes);

    /**
     * Returns the type names handled by this pattern.
     *
     * @return a fixed-size list view containing the single entry <code>"specialChar"</code>
     */
    public List knownTypes() {
        return knownTypes;
    }

    /** Helper class for the specialCharNames map */
    class SCEntry {
        /** The expanded form of this special character. */
        protected String expand;

        /**
         * Determines whether a symbol, when found in a token, will cause the token to be split into its parts.
         */
        protected boolean splitAt;

        /**
         * Determines whether it will be pronounced when found as a single token after all other expansion patterns have
         * been applied.
         */
        protected boolean pronounce;

        protected SCEntry(String expand, boolean splitAt, boolean pronounce) {
            this.expand = expand;
            this.splitAt = splitAt;
            this.pronounce = pronounce;
        }
    }

    /**
     * Builds the table of special characters known to this pattern. Only needed to fill specialCharNames.
     *
     * @return a new map from the special character (as a one-character string) to its {@link SCEntry}
     */
    private Map<String, SCEntry> createSpecialCharNames() {
        // A HashMap is kept on purpose: the order of the characters in the derived strings follows the map's
        // iteration order, and switching the map type could change that order.
        Map<String, SCEntry> m = new HashMap<String, SCEntry>();
        m.put("$", new SCEntry("dollaro", false, true));
        m.put("@", new SCEntry("chiocciola", true, true));

        // Disabled entries:
        // m.put("'", new SCEntry("apostrofoXXX", false, true));
        // m.put(":", new SCEntry("duepuntiXXX", true, true));

        // Disabled reference table (the expansions look German, so presumably carried over from a German variant
        // of this class -- TODO confirm):
        // m.put(",", new SCEntry("Komma", false, false));
        // m.put("\\", new SCEntry("Backslash['bEk-slES]", true, true));
        // m.put("!", new SCEntry("Ausrufezeichen", false, false));
        // m.put("#", new SCEntry("Numerical[nu:-'mE-rI_k@l]", true, true));
        // m.put("$", new SCEntry("Dollar", false, true));
        // m.put(new Character((char)167).toString(), new SCEntry("Paragraph", true, true));
        // m.put("%", new SCEntry("Prozent", false, true));
        // m.put(new Character((char)8364).toString(), new SCEntry("Euro", false, true));
        // m.put("&", new SCEntry("und", true, true));
        // m.put("'", new SCEntry("Hochkomma", true, false));
        // m.put("*", new SCEntry("Stern", true, true));
        // m.put("+", new SCEntry("plus", true, true));
        // m.put("-", new SCEntry("Bindestrich", false, false));
        // m.put("/", new SCEntry("Slash['slES]", true, false));
        // m.put("=", new SCEntry("gleich", true, true));
        // m.put("?", new SCEntry("Fragezeichen", true, false));
        // m.put("^", new SCEntry("Dach", true, false));
        // m.put("_", new SCEntry("Unterstrich", true, false));
        // m.put("`", new SCEntry("Hochkomma", true, false));
        // m.put("{", new SCEntry("geschweifte Klammer auf", true, false));
        // m.put("|", new SCEntry("senkrechter Strich", true, false));
        // m.put("}", new SCEntry("geschweifte Klammer zu", true, false));
        // m.put("~", new SCEntry("Tilde", true, true));
        // m.put("(", new SCEntry("Klammer auf", true, false));
        // m.put(")", new SCEntry("Klammer zu", true, false));
        // m.put("[", new SCEntry("eckige Klammer auf", true, false));
        // m.put("]", new SCEntry("eckige Klammer zu", true, false));
        // m.put("@", new SCEntry("at['Et]", false, true));
        // m.put(":", new SCEntry("Doppelpunkt", false, false));
        // m.put(";", new SCEntry("Semikolon", true, false));
        // m.put("\"", new SCEntry("Anführungszeichen", true, false));
        // m.put("<", new SCEntry("kleiner als", true, true));
        // m.put(">", new SCEntry("größer als", true, true));
        // m.put(".", new SCEntry("Punkt", false, false));
        return m;
    }

    // NOTE: the fields below are initialised in textual order and each one depends on the ones before it
    // (the table first, then the strings derived from it, then the compiled patterns). Do not reorder them.

    private final Map<String, SCEntry> specialCharNames = createSpecialCharNames();

    /** Regular expression (a character class) matching any one of the known special characters. */
    protected final String sMatchingChars = createMatchingChars();
    // protected final String sMatchingChars =
    // "[\\,\\\\\\!\\#\\$\\%\\&\\'\\*\\+\\-\\/\\=\\?\\^\\_\\`\\{\\|\\}\\~\\(\\)\\[\\]\\@\\:\\;\\\"\\<\\>\\.]";

    /** All known special characters, concatenated without any escaping. */
    protected final String sMatchingCharsSimpleString = createMatchingCharsSimpleString();
    // protected final String sMatchingCharsSimpleString = ",\\!#$%&'*+-/=?^_`{|}~()[]@:;\"<>.";

    private final String sSplitAtChars = createSplitAtChars();
    private final String sSplitAtCharsSimpleString = createSplitAtCharsSimpleString();

    /**
     * Only needed to fill sMatchingChars from specialCharNames.
     *
     * @return a regular expression matching any one known special character
     */
    private String createMatchingChars() {
        return toCharacterClass(collectSymbols(false));
    }

    /**
     * Only needed to fill sMatchingCharsSimpleString from specialCharNames.
     *
     * @return all known special characters, concatenated
     */
    private String createMatchingCharsSimpleString() {
        return concatenate(collectSymbols(false));
    }

    /**
     * Only needed to fill sSplitAtChars from specialCharNames.
     *
     * @return a regular expression matching any one special character whose entry has <code>splitAt</code> set
     */
    private String createSplitAtChars() {
        return toCharacterClass(collectSymbols(true));
    }

    /**
     * Only needed to fill sSplitAtCharsSimpleString from specialCharNames.
     *
     * @return all special characters whose entry has <code>splitAt</code> set, concatenated
     */
    private String createSplitAtCharsSimpleString() {
        return concatenate(collectSymbols(true));
    }

    /**
     * Collects the keys of specialCharNames in the map's iteration order.
     *
     * @param splitAtOnly
     *            if true, only keys whose entry has <code>splitAt</code> set are returned; if false, all keys
     * @return the selected keys; possibly empty, never null
     */
    private List<String> collectSymbols(boolean splitAtOnly) {
        List<String> symbols = new ArrayList<String>();
        for (Map.Entry<String, SCEntry> mapping : specialCharNames.entrySet()) {
            if (!splitAtOnly || mapping.getValue().splitAt) {
                symbols.add(mapping.getKey());
            }
        }
        return symbols;
    }

    /**
     * Turns a list of symbols into a regular-expression character class, escaping every symbol with a backslash.
     *
     * @param symbols
     *            the symbols to include
     * @return <code>[\a\b...]</code> for a non-empty list; a never-matching expression for an empty list, because
     *         <code>[]</code> is rejected by {@link Pattern#compile(String)}
     */
    private static String toCharacterClass(List<String> symbols) {
        if (symbols.isEmpty()) {
            return NEVER_MATCHING_REGEX;
        }
        StringBuilder sb = new StringBuilder("[");
        for (String symbol : symbols) {
            // Blind backslash-escaping is only safe for non-alphabetic symbols: in java.util.regex a backslash in
            // front of a letter is either an escaped construct with a different meaning or an error.
            sb.append("\\").append(symbol);
        }
        sb.append("]");
        return sb.toString();
    }

    /**
     * Concatenates the given symbols without separator or escaping.
     *
     * @param symbols
     *            the symbols to join
     * @return the concatenation; the empty string for an empty list
     */
    private static String concatenate(List<String> symbols) {
        StringBuilder sb = new StringBuilder();
        for (String symbol : symbols) {
            sb.append(symbol);
        }
        return sb.toString();
    }

    private final Pattern reMatchingChars = Pattern.compile(sMatchingChars);

    /**
     * A regular expression matching any one of the known special characters.
     *
     * @return reMatchingChars
     */
    public Pattern reMatchingChars() {
        return reMatchingChars;
    }

    private final Pattern reSplitAtChars = Pattern.compile(sSplitAtChars);

    /**
     * A regular expression matching the characters at which a token should be split into parts before any preprocessing
     * patterns are applied.
     *
     * @return reSplitAtChars
     */
    protected Pattern getRESplitAtChars() {
        return reSplitAtChars;
    }

    /**
     * A string containing the characters at which a token should be split into parts before any preprocessing patterns
     * are applied.
     *
     * @return sSplitAtCharsSimpleString
     */
    protected String splitAtChars() {
        return sSplitAtCharsSimpleString;
    }

    /**
     * Every subclass has its own logger. The important point is that if several threads are accessing the variable at the
     * same time, the logger needs to be thread-safe or it will produce rubbish. The logger is currently not used within
     * this class.
     */
    private final Logger logger = MaryUtils.getLogger("SpecialCharEP");

    public SpecialCharEP() {
        super();
    }

    /**
     * Tells whether this pattern can handle <code>s</code> for the given type; simply delegates to
     * {@link #match(String, int)}.
     *
     * @param s
     *            the string to test
     * @param type
     *            index of the type to test against
     * @return the result of <code>match(s, type)</code>
     */
    protected int canDealWith(String s, int type) {
        return match(s, type);
    }

    /**
     * Matches <code>s</code> against the given type.
     *
     * @param s
     *            the string to test
     * @param type
     *            index of the type to test against; only 0 is handled
     * @return 0 if <code>type</code> is 0 and <code>s</code> is a special character, -1 otherwise
     */
    protected int match(String s, int type) {
        switch (type) {
        case 0:
            if (matchSpecialChar(s))
                return 0;
            break;
        }
        return -1;
    }

    /**
     * Expands <code>s</code> and, if the expansion produced anything, replaces <code>tokens</code> with it via
     * <code>replaceTokens</code>.
     *
     * @param tokens
     *            the tokens being expanded; the first element must be an {@link Element}, whose owner document is used
     *            for the new tokens
     * @param s
     *            the text to expand
     * @param type
     *            expected to be one of the return values of {@link #match(String, int)}
     * @return the expansion; an empty list if <code>s</code> is not to be pronounced; <code>null</code> if
     *         <code>type</code> is not 0
     * @throws NullPointerException
     *             if <code>tokens</code> is null
     * @throws IllegalArgumentException
     *             if <code>tokens</code> is empty
     */
    protected List expand(List tokens, String s, int type) {
        if (tokens == null)
            throw new NullPointerException("Received null argument");
        if (tokens.isEmpty())
            throw new IllegalArgumentException("Received empty list");
        Document doc = ((Element) tokens.get(0)).getOwnerDocument();
        // we expect type to be one of the return values of match():
        List expanded = null;
        switch (type) {
        case 0:
            expanded = expandSpecialChar(doc, s);
            break;
        }
        if (expanded != null && !expanded.isEmpty())
            replaceTokens(tokens, expanded);
        return expanded;
    }

    /**
     * Tell whether String <code>s</code> is a specialChar, i.e. consists of exactly one known special character.
     *
     * @param s
     *            the string; may be null
     * @return false if <code>s</code> is null (consistent with {@link #doPronounce(String)} and
     *         {@link #expandSpecialChar(String)}, which also accept null), otherwise
     *         <code>reMatchingChars.matcher(s).matches()</code>
     */
    public boolean matchSpecialChar(String s) {
        return s != null && reMatchingChars.matcher(s).matches();
    }

    /**
     * Tells whether the given special character is to be pronounced.
     *
     * @param specialChar
     *            the special character to look up
     * @return the <code>pronounce</code> flag of its entry, or false if there is no entry for it
     */
    protected boolean doPronounce(String specialChar) {
        SCEntry entry = specialCharNames.get(specialChar);
        return entry != null && entry.pronounce;
    }

    /**
     * Creates the tokens for the spoken form of special character <code>s</code>.
     *
     * @param doc
     *            the document passed on to <code>makeNewTokens</code>
     * @param s
     *            the special character
     * @return a new list with the tokens created by <code>makeNewTokens</code>, or an empty list if <code>s</code> is
     *         not to be pronounced
     */
    protected List expandSpecialChar(Document doc, String s) {
        // The list stays raw: presumably the inherited token API (makeNewTokens/replaceTokens, not visible in this
        // file) is declared with raw lists as well.
        List exp = new ArrayList();
        if (doPronounce(s)) {
            String specialCharName = expandSpecialChar(s);
            exp.addAll(makeNewTokens(doc, specialCharName, true, s));
        }
        // if not to be pronounced, return an empty list
        return exp;
    }

    /**
     * Looks up the spoken form of special character <code>s</code>.
     *
     * @param s
     *            the special character
     * @return the expanded form, or null if there is no entry for <code>s</code>
     */
    protected String expandSpecialChar(String s) {
        SCEntry entry = specialCharNames.get(s);
        return entry == null ? null : entry.expand;
    }
}