/**
 * Copyright 2002 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.language.de.preprocess;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import marytts.datatypes.MaryXML;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

/**
 * An expansion pattern implementation for abbreviation patterns.
 * <p>
 * Abbreviations are first looked up in a dictionary loaded once from the class path resource <code>abbrev.dat</code>; tokens
 * without a dictionary entry are expanded by rule (pronounced as a number, spelled out, or left as is).
 *
 * @author Marc Schröder
 */
public class AbbrevEP extends ExpansionPattern {
    private final String[] _knownTypes = { "acronym" };
    private final List<String> knownTypes = Arrays.asList(_knownTypes);

    public List<String> knownTypes() {
        return knownTypes;
    }

    /**
     * Abbreviation dictionary: key is the abbreviation, value is a two-element array holding the inflection information
     * (index 0) and the expanded form (index 1). Filled once by {@link #loadAbbrevDict()}.
     */
    private static final Map<String, String[]> abbrevDict = new HashMap<String, String[]>();

    // We don't use sMatchingChars here, but override isCandidate().
    private final Pattern reMatchingChars = null;

    public Pattern reMatchingChars() {
        return reMatchingChars;
    }

    private static final Logger logger = MaryUtils.getLogger("AbbrevEP");

    // Patterns used while parsing abbrev.dat; compiled once instead of once per line.
    // They must be declared before the static initializer below, which uses them.
    private static final Pattern COMMENT_LINE = Pattern.compile("^\\#");
    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    static {
        try {
            loadAbbrevDict();
        } catch (IOException e) {
            // Also covers FileNotFoundException (a subclass of IOException).
            // The class stays usable with an empty dictionary; only rule-based expansion is then available.
            logger.warn("Could not load abbreviation file", e);
        }
    }

    public AbbrevEP() {
        super();
    }

    /**
     * A token is a candidate if its text looks like an abbreviation, consists of digits only, or is a single dot.
     *
     * @param t
     *            the token element to inspect
     * @return true if this pattern should consider the token
     */
    @Override
    protected boolean isCandidate(Element t) {
        String str = MaryDomUtils.tokenText(t);
        return isAbbrev(str) || REPattern.onlyDigits.matcher(str).find() || ".".equals(str);
    }

    protected int canDealWith(String s, int type) {
        return match(s, type);
    }

    /**
     * @param s
     *            the string to test
     * @param type
     *            the type to report on success
     * @return <code>type</code> if <code>s</code> is longer than one character and looks like an abbreviation, -1 otherwise
     */
    protected int match(String s, int type) {
        if (s.length() > 1 && isAbbrev(s)) {
            return type;
        }
        return -1;
    }

    /**
     * Decide whether <code>s</code> looks like an abbreviation: it matches one of the letter-dot, non-initial-capital or
     * only-consonants patterns, or it is a dictionary key either as is or with a dot appended.
     *
     * @param s
     *            the string to test
     * @return true if any of the criteria applies
     */
    protected boolean isAbbrev(String s) {
        boolean isLetterDot = REPattern.letterDot.matcher(s).find();
        boolean isNonInitialCapital = REPattern.nonInitialCapital.matcher(s).find();
        boolean isOnlyConsonants = REPattern.onlyConsonants.matcher(s).find();
        boolean isInDict = abbrevDict.containsKey(s) || abbrevDict.containsKey(s + ".");
        return isLetterDot || isNonInitialCapital || isOnlyConsonants || isInDict;
    }

    /**
     * Expand abbreviations and eventually replace them by <code>mtu</code> structures (for multi-token abbreviations).
     *
     * @param tokens
     *            the tokens forming the abbreviation; must be neither null nor empty
     * @param s
     *            the string representation of the tokens (not used by this implementation)
     * @param type
     *            expected to be one of the return values of match() (not used by this implementation)
     * @return the newly created tokens, which have replaced <code>tokens</code>
     * @throws NullPointerException
     *             if <code>tokens</code> is null
     * @throws IllegalArgumentException
     *             if <code>tokens</code> is empty
     */
    protected List<Element> expand(List<Element> tokens, String s, int type) {
        if (tokens == null) {
            throw new NullPointerException("Received null argument");
        }
        if (tokens.isEmpty()) {
            throw new IllegalArgumentException("Received empty list");
        }
        // we expect type to be one of the return values of match():
        List<Element> expanded = expandAbbrev(tokens);
        replaceTokens(tokens, expanded);
        return expanded;
    }

    /**
     * Expand the abbreviation list <code>abbrTokens</code>. First, try to find longest entries in database, then shorter.
     * If no entry was found, expand by rule. Return the list of newly created, but not yet attached tokens.
     *
     * @param abbrTokens
     *            the tokens to expand; the list itself is not modified
     * @return the expanded tokens
     */
    private List<Element> expandAbbrev(List<Element> abbrTokens) {
        List<Element> exp = new ArrayList<Element>();
        List<Element> abbr = new ArrayList<Element>(abbrTokens);
        List<Element> match = new ArrayList<Element>(abbr);

        // At sentence start, maybe need to lowercase first char before matching
        boolean tryLowerCase = MaryDomUtils.isFirstOfItsKindIn(abbr.get(0), MaryXML.SENTENCE)
                && REPattern.initialCapitalLetter.matcher(MaryDomUtils.tokenText(abbr.get(0))).find();

        // Longest match first: concatenate all remaining tokens, look the result up,
        // and drop the last token until something is found or nothing is left.
        String key = null;
        StringBuilder sb = new StringBuilder();
        while (!match.isEmpty()) {
            sb.setLength(0);
            for (Element t : match) {
                sb.append(MaryDomUtils.tokenText(t));
            }
            key = findDictionaryKey(sb.toString(), tryLowerCase);
            if (key != null) {
                break; // OK, found a match
            }
            match.remove(match.size() - 1); // remove last in list
        }

        if (key != null) { // found an abbrevDict entry
            exp.addAll(dictionaryExpandAbbrev(match, key));
            abbr.removeAll(match);
            logger.debug("Have found abbreviation in dictionary: `" + key + "'");
        } else { // no abbrevDict entry - expand one token by rule
            Element token = abbr.get(0);
            String text = MaryDomUtils.tokenText(token);
            // Only digits? Pronounce as an integer.
            if (REPattern.onlyDigits.matcher(text).find()) {
                if (Pattern.matches(NumberEP.sInteger, text)) {
                    logger.debug("Expanding as integer: `" + text + "'");
                    exp.addAll(makeNewTokens(token.getOwnerDocument(), number.expandInteger(text), true, text));
                } else {
                    logger.debug("Expanding as digits: `" + text + "'");
                    exp.addAll(makeNewTokens(token.getOwnerDocument(), number.expandDigits(text), true, text));
                }
            } else if (text.length() > 1) {
                logger.debug("Expanding one token by rule: `" + text + "'");
                // Slow down the mtu containing this token (or the token
                // itself), so the spelling is understandable.
                slowDown(token);
                // And now expand:
                exp.addAll(ruleExpandAbbrev(token));
            } else { // only single character
                // Need to copy the character into a new XML document,
                // otherwise replaceTokens will kill it.
                exp.addAll(makeNewTokens(token.getOwnerDocument(), text));
            }
            abbr.remove(0);
        }

        // Whatever was not consumed above is expanded recursively.
        if (!abbr.isEmpty()) {
            exp.addAll(expandAbbrev(abbr));
        }

        // isDebugEnabled() is also true for more verbose levels (TRACE, ALL),
        // unlike an equality test against Level.DEBUG.
        if (logger.isDebugEnabled()) {
            StringBuilder logBuf = new StringBuilder();
            for (Element elt : exp) {
                if (elt.getTagName().equals(MaryXML.TOKEN)) {
                    logBuf.append(MaryDomUtils.tokenText(elt));
                } else {
                    logBuf.append(elt.getTagName());
                }
                logBuf.append(" ");
            }
            logger.debug("Expanded abbreviation: " + logBuf.toString());
        }
        return exp;
    }

    /**
     * Find the dictionary key under which <code>text</code> is listed. Candidates are tried in this order:
     * <ol>
     * <li><code>text</code> as is;</li>
     * <li>if <code>tryLowerCase</code>: <code>text</code> with its first character lowercased;</li>
     * <li>the last candidate tried so far, with a dot appended;</li>
     * <li>if <code>tryLowerCase</code>: the unmodified <code>text</code> with a dot appended. This covers dictionary
     * entries such as <code>Abb.</code> for a sentence-initial token <code>Abb</code>, which isAbbrev() accepts.</li>
     * </ol>
     *
     * @param text
     *            the concatenated token text to look up
     * @param tryLowerCase
     *            whether variants with a lowercased first character should be tried
     * @return the matching dictionary key, or null if none of the candidates is in the dictionary
     */
    private static String findDictionaryKey(String text, boolean tryLowerCase) {
        if (dictionaryContains(text)) {
            return text;
        }
        String lowered = null;
        if (tryLowerCase && text.length() > 0) {
            lowered = Character.toLowerCase(text.charAt(0)) + text.substring(1);
            if (dictionaryContains(lowered)) {
                return lowered;
            }
        }
        // Try to append a dot:
        String dotted = (lowered != null ? lowered : text) + ".";
        if (dictionaryContains(dotted)) {
            return dotted;
        }
        if (lowered != null) {
            // Tried last so that it never takes precedence over the candidates above.
            String dottedOriginalCase = text + ".";
            if (dictionaryContains(dottedOriginalCase)) {
                return dottedOriginalCase;
            }
        }
        return null;
    }

    /** Log the lookup at debug level and report whether <code>candidate</code> is a dictionary key. */
    private static boolean dictionaryContains(String candidate) {
        logger.debug("Looking up abbreviation in dictionary: `" + candidate + "'");
        return abbrevDict.containsKey(candidate);
    }

    /**
     * Expand a recognised abbreviation from the dictionary. <code>match</code> is the list of token elements forming the
     * abbreviation; <code>abbrev</code> is a string representation of that abbreviation. Tokens for the expanded form are
     * created, but not yet attached to the dom tree.
     *
     * @param match
     *            the token elements forming the abbreviation; must not be empty
     * @param abbrev
     *            the dictionary key of the abbreviation; must be present in the dictionary
     * @return the tokens of the expanded form
     */
    private List<Element> dictionaryExpandAbbrev(List<Element> match, String abbrev) {
        Document doc = match.get(0).getOwnerDocument();
        List<Element> exp = new ArrayList<Element>();
        String[] value = abbrevDict.get(abbrev);
        String flex = value[0]; // inflection info
        String graph = value[1]; // expanded form, possibly with pronunciation

        // For Sentence-initial abbreviation, make sure the expanded
        // form starts with a capital letter.
        // NOTE(review): the scope tested here is "div", whereas expandAbbrev() tests
        // MaryXML.SENTENCE for its sentence-start check - confirm which one is intended.
        if (MaryDomUtils.isFirstOfItsKindIn(match.get(0), "div")
                && REPattern.initialLowercaseLetter.matcher(graph).find()) {
            StringBuilder sb = new StringBuilder(graph);
            sb.setCharAt(0, Character.toUpperCase(sb.charAt(0)));
            graph = sb.toString();
            // And while we're at it, correct abbrev because we need it
            // for the mtu tag later.
            sb.setLength(0);
            sb.append(abbrev);
            sb.setCharAt(0, Character.toUpperCase(sb.charAt(0)));
            abbrev = sb.toString();
        }

        exp.addAll(makeNewTokens(doc, graph, true, abbrev));
        if (exp.isEmpty()) {
            return exp;
        }

        // Now some post-hoc modification of the first expanded token:
        if (flex != null && flex.length() > 0) {
            // Descend to the first token element below the first expanded element.
            Element t = exp.get(0);
            while (t != null && !t.getTagName().equals(MaryXML.TOKEN)) {
                t = MaryDomUtils.getFirstChildElement(t);
            }
            if (t != null) {
                String firstWord = MaryDomUtils.tokenText(t);
                // First expanded word gets the inflection info
                // (usually only used in one-word expansions).
                t.setAttribute("ending", flex);
                // If we have an inflexion ending, keep the abbreviated form
                // as the graphemic form (for pos-tagger and chunker), and
                // save the expanded form in the `sounds_like' attribute.
                t.setAttribute("sounds_like", firstWord);
                MaryDomUtils.setTokenText(t, abbrev);
            }
        }
        return exp;
    }

    /**
     * Expand a single token by rule, without pronouncing special characters.
     *
     * @param token
     *            the token to expand
     * @return the newly created tokens
     */
    protected List<Element> ruleExpandAbbrev(Element token) {
        Document doc = token.getOwnerDocument();
        String orig = MaryDomUtils.tokenText(token);
        String expandedString = ruleExpandAbbrev(orig, false); // do not say specialChar
        // Force an accent on every item in this mtu,
        // just to make sure the token in the mtu whose accent
        // is to be retained actually *has* an accent.
        return makeNewTokens(doc, expandedString, true, orig, true);
    }

    /**
     * Expand a string by rule: short strings are spelled out, longer strings containing special characters are split at
     * these characters and each part is expanded recursively, anything else is returned unchanged.
     *
     * @param orig
     *            the string to expand
     * @param saySpecialChar
     *            whether special characters are to be pronounced
     * @return the expanded form, with individual parts separated by a space character
     */
    protected String ruleExpandAbbrev(String orig, boolean saySpecialChar) {
        int firstDot = orig.indexOf('.');
        // Spell out if <= 5 letters:
        boolean shortWithoutDot = firstDot == -1 && orig.length() <= 5; // no dot, maximally five letters
        boolean shortWithFinalDot = firstDot == orig.length() - 1 && orig.length() <= 6; // five letters + final dot
        if (shortWithoutDot || shortWithFinalDot) {
            return spellOutAbbrev(orig, saySpecialChar);
        }
        // Contains dot or other specialChar:
        if (specialChar.reMatchingChars().matcher(orig).find()) {
            StringBuilder sb = new StringBuilder();
            // If saySpecialChar is true, st will return specialChar signs
            // as individual tokens of length one.
            StringTokenizer st = new StringTokenizer(orig, specialChar.sMatchingCharsSimpleString, saySpecialChar);
            while (st.hasMoreTokens()) {
                // recursive call:
                sb.append(ruleExpandAbbrev(st.nextToken(), saySpecialChar));
                sb.append(" ");
            }
            return sb.toString().trim();
        }
        // Else, no dot and too long for spelling - print as is
        return orig;
    }

    /**
     * Spell out a token. If saySpecialChar is true, specialChar is pronounced as well; otherwise, it is silently ignored.
     * The spelled out version is returned as a String in which individual tokens are separated by a space character.
     *
     * @param s
     *            the string to spell out; null or empty input is returned unchanged
     * @param saySpecialChar
     *            whether special characters are to be pronounced
     * @return the spelled-out form, trimmed
     */
    private String spellOutAbbrev(String s, boolean saySpecialChar) {
        if (s == null || s.length() == 0) { // nothing to do
            return s;
        }
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < s.length(); i++) {
            String current = s.substring(i, i + 1);
            if (specialChar.matchSpecialChar(current)) { // A specialChar character
                if (saySpecialChar) {
                    sb.append(specialChar.expandSpecialChar(current));
                    sb.append(" ");
                }
                continue;
            }
            // a normal letter
            // Either branch of this test guarantees that index i + 1 exists.
            boolean nextIsLastLetter = i + 2 == s.length()
                    || i + 2 < s.length() && specialChar.matchSpecialChar(s.substring(i + 2, i + 3));
            if (nextIsLastLetter && Character.isUpperCase(s.charAt(i)) && s.charAt(i + 1) == 's') {
                // This is the second-to-last character, it is an uppercase
                // letter, and the last one is a lowercase 's' => treat the
                // s like a plural and attach it to this letter's
                // pronunciation.
                sb.append(current);
                sb.append("[*s]");
                i++; // the 's' has been consumed
            } else {
                sb.append(current);
                sb.append(" ");
            }
        }
        return sb.toString().trim();
    }

    /**
     * Fill the abbreviation dictionary from the class path resource <code>abbrev.dat</code>, read as UTF-8.
     * <p>
     * Lines starting with <code>#</code> and empty lines are ignored. Every other line is expected to hold three fields
     * separated by a slash: the key (the abbreviation), the inflection information, and the expanded form. Because
     * StringTokenizer does not return empty tokens, an absent inflection field must still contain whitespace. Lines with
     * fewer than three fields, or with whitespace inside the key, are logged and skipped.
     *
     * @throws FileNotFoundException
     *             if the resource cannot be found
     * @throws IOException
     *             if the resource cannot be read
     */
    private static void loadAbbrevDict() throws FileNotFoundException, IOException {
        InputStream abbrevStream = AbbrevEP.class.getResourceAsStream("abbrev.dat");
        if (abbrevStream == null) {
            // getResourceAsStream() signals a missing resource by returning null.
            throw new FileNotFoundException("Resource abbrev.dat not found for class " + AbbrevEP.class.getName());
        }
        BufferedReader br = new BufferedReader(new InputStreamReader(abbrevStream, "UTF-8"));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (COMMENT_LINE.matcher(line).find() || REPattern.emptyLine.matcher(line).find()) {
                    // comment or empty line, ignore
                    continue;
                }
                // Fields separated by a slash (/):
                StringTokenizer st = new StringTokenizer(line, "/");
                if (st.countTokens() < 3) {
                    // Without this check, nextToken() below would throw an unchecked
                    // NoSuchElementException and abort class initialization.
                    logger.warn("In abbrev.dat: Line \"" + line + "\" does not have three fields. Ignoring.");
                    continue;
                }
                // Each line contains three fields,
                // key (the abbreviation),
                // flex (optional inflection information),
                // and graph (the graphemic (and possibly phonemic) expanded form.
                // Remove leading/trailing whitespace from each field.
                String key = st.nextToken().trim();
                String flex = st.nextToken().trim();
                String graph = st.nextToken().trim();
                // In addition, replace all whitespace in graph by a single blank
                graph = graph.replaceAll("\\s+", " ");
                // Now key should not contain any whitespace:
                if (WHITESPACE.matcher(key).find()) {
                    logger.info("In abbrev.dat: Abbreviation \"" + key + "\" contains whitespace. Ignoring.");
                    continue;
                }
                // In the hashmap, save a reference to an array containing
                // two elements:
                // 1. The inflection information, if any, and
                // 2. The replacement token(s) as one string.
                abbrevDict.put(key, new String[] { flex, graph });
            }
        } finally {
            br.close();
        }
    }
}