/**
* Copyright 2002 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.language.de.preprocess;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
* An expansion pattern implementation for specialChar patterns.
*
* @author Marc Schröder
*/
public class SpecialCharEP extends ExpansionPattern {
private final String[] _knownTypes = { "specialChar" };
/**
* Every subclass has its own list knownTypes, an internal string representation of known types. These are possible values of
* the <code>type</code> attribute to the <code>say-as</code> element, as defined in MaryXML.dtd. If there is more than one
* known type, the first type (<code>knownTypes[0]</code>) is expected to be the most general one, of which the others are
* specializations.
*/
private final List<String> knownTypes = Arrays.asList(_knownTypes);
public List<String> knownTypes() {
return knownTypes;
}
/** Helper class for the specialCharNames map */
class SCEntry {
/** The expanded form of this special character. */
protected String expand;
/**
* Determines whether a symbol, when found in a token, will cause the token to be split into its parts.
*/
protected boolean splitAt;
/**
* Determines whether it will be pronounced when found as a single token after all other expansion patterns have been
* applied.
*/
protected boolean pronounce;
protected SCEntry(String expand, boolean splitAt, boolean pronounce) {
this.expand = expand;
this.splitAt = splitAt;
this.pronounce = pronounce;
}
}
/** Only needed to fill specialCharNames */
private Map<String, SCEntry> createSpecialCharNames() {
HashMap<String, SCEntry> m = new HashMap<String, SCEntry>();
m.put(",", new SCEntry("Komma", false, false));
m.put("\\", new SCEntry("Backslash['bEk-slES]", true, true));
m.put("!", new SCEntry("Ausrufezeichen", false, false));
m.put("#", new SCEntry("Numerical[nu:-'mE-rI_k@l]", true, true));
m.put("$", new SCEntry("Dollar", false, true));
m.put(new Character((char) 167).toString(), new SCEntry("Paragraph", true, true));
m.put("%", new SCEntry("Prozent", false, true));
m.put(new Character((char) 8364).toString(), new SCEntry("Euro", false, true));
m.put("&", new SCEntry("und", true, true));
m.put("'", new SCEntry("Hochkomma", true, false));
m.put("*", new SCEntry("Stern", true, true));
m.put("+", new SCEntry("plus", true, true));
m.put("-", new SCEntry("Bindestrich", false, false));
m.put("/", new SCEntry("Slash['slES]", true, false));
m.put("=", new SCEntry("gleich", true, true));
m.put("?", new SCEntry("Fragezeichen", true, false));
m.put("^", new SCEntry("Dach", true, false));
m.put("_", new SCEntry("Unterstrich", true, false));
m.put("`", new SCEntry("Hochkomma", true, false));
m.put("{", new SCEntry("geschweifte Klammer auf", true, false));
m.put("|", new SCEntry("senkrechter Strich", true, false));
m.put("}", new SCEntry("geschweifte Klammer zu", true, false));
m.put("~", new SCEntry("Tilde", true, true));
m.put("(", new SCEntry("Klammer auf", true, false));
m.put(")", new SCEntry("Klammer zu", true, false));
m.put("[", new SCEntry("eckige Klammer auf", true, false));
m.put("]", new SCEntry("eckige Klammer zu", true, false));
m.put("@", new SCEntry("at['Et]", false, true));
m.put(":", new SCEntry("Doppelpunkt", false, false));
m.put(";", new SCEntry("Semikolon", true, false));
m.put("\"", new SCEntry("Anführungszeichen", true, false));
m.put("<", new SCEntry("kleiner als", true, true));
m.put(">", new SCEntry("größer als", true, true));
m.put(".", new SCEntry("Punkt", false, false));
return m;
};
private final Map<String, SCEntry> specialCharNames = createSpecialCharNames();
protected final String sMatchingChars = createMatchingChars();
// protected final String sMatchingChars =
// "[\\,\\\\\\!\\#\\$\\%\\&\\'\\*\\+\\-\\/\\=\\?\\^\\_\\`\\{\\|\\}\\~\\(\\)\\[\\]\\@\\:\\;\\\"\\<\\>\\.]";
protected final String sMatchingCharsSimpleString = createMatchingCharsSimpleString();
// protected final String sMatchingCharsSimpleString = ",\\!#$%&'*+-/=?^_`{|}~()[]@:;\"<>.";
private final String sSplitAtChars = createSplitAtChars();
private final String sSplitAtCharsSimpleString = createSplitAtCharsSimpleString();
/**
* Only needed to fill sMatchingChars from specialCharNames
*
* @return sb.toString
*/
private String createMatchingChars() {
StringBuilder sb = new StringBuilder("[");
for (Iterator<String> it = specialCharNames.keySet().iterator(); it.hasNext();) {
sb.append("\\" + (String) it.next());
}
sb.append("]");
return sb.toString();
}
/**
* Only needed to fill sMatchingCharsSimpleString from _specialCharNames[]
*
* @return sb.toString
*/
private String createMatchingCharsSimpleString() {
StringBuilder sb = new StringBuilder();
for (Iterator<String> it = specialCharNames.keySet().iterator(); it.hasNext();) {
sb.append((String) it.next());
}
return sb.toString();
}
/**
* Only needed to fill sSplitAtChars from _specialCharNames[]
*
* @return sb.toString
*/
private String createSplitAtChars() {
StringBuilder sb = new StringBuilder("[");
for (Iterator<String> it = specialCharNames.keySet().iterator(); it.hasNext();) {
String sc = (String) it.next();
if (((SCEntry) specialCharNames.get(sc)).splitAt) {
sb.append("\\" + sc);
}
}
sb.append("]");
return sb.toString();
}
/**
* Only needed to fill sSplitAtCharsSimpleString from _specialCharNames[]
*
* @return sb.toString
*/
private String createSplitAtCharsSimpleString() {
StringBuilder sb = new StringBuilder();
for (Iterator<String> it = specialCharNames.keySet().iterator(); it.hasNext();) {
String sc = (String) it.next();
if (((SCEntry) specialCharNames.get(sc)).splitAt) {
sb.append(sc);
}
}
return sb.toString();
}
private final Pattern reMatchingChars = Pattern.compile(sMatchingChars);
public Pattern reMatchingChars() {
return reMatchingChars;
}
private final Pattern reSplitAtChars = Pattern.compile(sSplitAtChars);
/**
* A regular expression matching the characters at which a token should be split into parts before any preprocessing patterns
* are applied.
*
* @return reSplitAtChars
*/
protected Pattern getRESplitAtChars() {
return reSplitAtChars;
}
/**
* A string containing the characters at which a token should be split into parts before any preprocessing patterns are
* applied.
*
* @return sSplitAtCharsSimpleString
*/
protected String splitAtChars() {
return sSplitAtCharsSimpleString;
}
/**
* Every subclass has its own logger. The important point is that if several threads are accessing the variable at the same
* time, the logger needs to be thread-safe or it will produce rubbish.
*/
// private Logger logger = MaryUtils.getLogger("SpecialCharEP");
public SpecialCharEP() {
super();
}
protected int canDealWith(String s, int type) {
return match(s, type);
}
protected int match(String s, int type) {
switch (type) {
case 0:
if (matchSpecialChar(s))
return 0;
break;
}
return -1;
}
protected List<Element> expand(List<Element> tokens, String s, int type) {
if (tokens == null)
throw new NullPointerException("Received null argument");
if (tokens.isEmpty())
throw new IllegalArgumentException("Received empty list");
Document doc = ((Element) tokens.get(0)).getOwnerDocument();
// we expect type to be one of the return values of match():
List<Element> expanded = null;
switch (type) {
case 0:
expanded = expandSpecialChar(doc, s);
break;
}
if (expanded != null && !expanded.isEmpty())
replaceTokens(tokens, expanded);
return expanded;
}
/**
* Tell whether String <code>s</code> is a specialChar.
*
* @param s
* s
* @return reMatchingChars.matcher(s).matches
*/
public boolean matchSpecialChar(String s) {
return reMatchingChars.matcher(s).matches();
}
protected boolean doPronounce(String specialChar) {
SCEntry entry = (SCEntry) specialCharNames.get(specialChar);
if (entry == null)
return false;
return entry.pronounce;
}
protected List<Element> expandSpecialChar(Document doc, String s) {
ArrayList<Element> exp = new ArrayList<Element>();
if (doPronounce(s)) {
String specialCharName = expandSpecialChar(s);
exp.addAll(makeNewTokens(doc, specialCharName, true, s));
} // if not to be pronounced, return an empty list
return exp;
}
protected String expandSpecialChar(String s) {
SCEntry entry = (SCEntry) specialCharNames.get(s);
if (entry == null)
return null;
return entry.expand;
}
}