AbbrevEP.java example

Explorer
marytts-master
/**
 * Copyright 2002 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package marytts.language.de.preprocess;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import marytts.datatypes.MaryXML;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

/**
 * An expansion pattern implementation for abbreviation patterns.
 *
 * @author Marc Schröder
 */

public class AbbrevEP extends ExpansionPattern {
	private final String[] _knownTypes = { "acronym" };
	private final List<String> knownTypes = Arrays.asList(_knownTypes);

	public List<String> knownTypes() {
		return knownTypes;
	}

	private static final Map<String, String[]> abbrevDict = new HashMap<String, String[]>();

	// We don't use sMatchingChars here, but override isCandidate().
	private final Pattern reMatchingChars = null;

	public Pattern reMatchingChars() {
		return reMatchingChars;
	}

	private static final Logger logger = MaryUtils.getLogger("AbbrevEP");

	static {
		try {
			loadAbbrevDict();
		} catch (FileNotFoundException e) {
			logger.warn("Could not load abbreviation file", e);
		} catch (IOException e) {
			logger.warn("Could not load abbreviation file", e);
		}

	}

	public AbbrevEP() {
		super();
	}

	@Override
	protected boolean isCandidate(Element t) {
		String str = MaryDomUtils.tokenText(t);
		return isAbbrev(str) || REPattern.onlyDigits.matcher(str).find() || ".".equals(str);
	}

	protected int canDealWith(String s, int type) {
		return match(s, type);
	}

	protected int match(String s, int type) {
		if (s.length() > 1 && isAbbrev(s))
			return type;
		return -1;
	}

	protected boolean isAbbrev(String s) {
		boolean isLetterDot = REPattern.letterDot.matcher(s).find();
		boolean isNonInitialCapital = REPattern.nonInitialCapital.matcher(s).find();
		boolean isOnlyConsonants = REPattern.onlyConsonants.matcher(s).find();
		boolean isInDict = abbrevDict.containsKey(s) || abbrevDict.containsKey(s + ".");
		return isLetterDot || isNonInitialCapital || isOnlyConsonants || isInDict;
	}

	/**
	 * Expand abbreviations and eventually replace them by <code>mtu</code> structures (for multi-token abbreviations).
	 * 
	 * @param tokens
	 *            tokens
	 * @param s
	 *            s
	 * @param type
	 *            type
	 * @return expanded
	 */
	protected List<Element> expand(List<Element> tokens, String s, int type) {
		if (tokens == null)
			throw new NullPointerException("Received null argument");
		if (tokens.isEmpty())
			throw new IllegalArgumentException("Received empty list");
		// we expect type to be one of the return values of match():
		List<Element> expanded = null;
		expanded = expandAbbrev(tokens);
		replaceTokens(tokens, expanded);
		return expanded;
	}

	/**
	 * Expand the abbreviation list <code>abbr</code>. First, try to find longest entries in database, then shorter. If no entry
	 * was found, expand by rule. Return the list of newly created, but not yet attached tokens.
	 * 
	 * @param abbrTokens
	 *            abbrTokens
	 * @return exp
	 */
	private List<Element> expandAbbrev(List<Element> abbrTokens) {
		ArrayList<Element> exp = new ArrayList<Element>();
		ArrayList<Element> abbr = new ArrayList<Element>(abbrTokens);
		ArrayList<Element> match = new ArrayList<Element>(abbr);
		boolean tryLowerCase = false;
		if (MaryDomUtils.isFirstOfItsKindIn((Element) abbr.get(0), MaryXML.SENTENCE)
				&& REPattern.initialCapitalLetter.matcher(MaryDomUtils.tokenText((Element) abbr.get(0))).find()) {
			// At sentence start, maybe need to lowercase first char
			// before matching
			tryLowerCase = true;
		}
		StringBuilder sb = new StringBuilder();
		while (!match.isEmpty()) {
			sb.setLength(0);
			Iterator<Element> it = match.iterator();
			while (it.hasNext()) {
				sb.append(MaryDomUtils.tokenText((Element) it.next()));
			}
			logger.debug("Looking up abbreviation in dictionary: `" + sb.toString() + "'");
			if (abbrevDict.containsKey(sb.toString())) {
				break; // OK, found a match
			}
			if (tryLowerCase) {
				sb.setCharAt(0, Character.toLowerCase(sb.charAt(0)));
				logger.debug("Looking up abbreviation in dictionary: `" + sb.toString() + "'");
				if (abbrevDict.containsKey(sb.toString()))
					break; // OK, found a match
			}
			// Try to append a dot:
			sb.append(".");
			logger.debug("Looking up abbreviation in dictionary: `" + sb.toString() + "'");
			if (abbrevDict.containsKey(sb.toString())) {
				break; // OK, found a match
			}
			match.remove(match.size() - 1); // remove last in list
		}
		if (!match.isEmpty()) { // found an abbrevDict entry
			exp.addAll(dictionaryExpandAbbrev(match, sb.toString()));
			abbr.removeAll(match);
			logger.debug("Have found abbreviation in dictionary: `" + sb.toString() + "'");
		} else { // no abbrevDict entry - expand one token by rule
			Element token = (Element) abbr.get(0);
			// Verify that token consists of more than a single character:
			String text = MaryDomUtils.tokenText(token);
			// Only digits? Pronounce as an integer.
			if (REPattern.onlyDigits.matcher(text).find()) {
				if (Pattern.matches(NumberEP.sInteger, text)) {
					logger.debug("Expanding as integer: `" + text + "'");
					exp.addAll(makeNewTokens(token.getOwnerDocument(), number.expandInteger(text), true, text));
				} else {
					logger.debug("Expanding as digits: `" + text + "'");
					exp.addAll(makeNewTokens(token.getOwnerDocument(), number.expandDigits(text), true, text));
				}

			} else if (text.length() > 1) {
				logger.debug("Expanding one token by rule: `" + text + "'");
				// Slow down the mtu containing this token (or the token
				// itself), so the spelling is understandable.
				slowDown(token);
				// And now expand:
				exp.addAll(ruleExpandAbbrev(token));
			} else { // only single character
				// Need to copy the character into a new XML document,
				// otherwise replaceTokens will kill it.
				exp.addAll(makeNewTokens(token.getOwnerDocument(), text));
			}
			abbr.remove(0);
		}
		if (!abbr.isEmpty())
			exp.addAll(expandAbbrev(abbr));
		if (logger.getEffectiveLevel().equals(Level.DEBUG)) {
			StringBuilder logBuf = new StringBuilder();
			for (Iterator<Element> it = exp.iterator(); it.hasNext();) {
				Element elt = (Element) it.next();
				if (elt.getTagName().equals(MaryXML.TOKEN)) {
					logBuf.append(MaryDomUtils.tokenText(elt));
				} else {
					logBuf.append(elt.getTagName());
				}
				logBuf.append(" ");
			}
			logger.debug("Expanded abbreviation: " + logBuf.toString());
		}
		return exp;
	}

	/**
	 * Expand a recognised abbreviation from the dictionary. <code>match</code> is the list of token elements forming the
	 * abbreviation; <code>abbrev</code> is a string representation of that abbreviation. Tokens for the expanded form are
	 * created, but not yet attached to the dom tree.
	 * 
	 * @param match
	 *            match
	 * @param abbrev
	 *            abbrev
	 * @return exp
	 */
	private List<Element> dictionaryExpandAbbrev(List<Element> match, String abbrev) {
		Document doc = ((Element) match.get(0)).getOwnerDocument();
		ArrayList<Element> exp = new ArrayList<Element>();
		String[] value = (String[]) abbrevDict.get(abbrev);
		String flex = value[0]; // inflection info
		String graph = value[1]; // expanded form, possibly with pronunciation
		// For Sentence-initial abbreviation, make sure the expanded
		// form starts with a capital letter.
		if (MaryDomUtils.isFirstOfItsKindIn((Element) match.get(0), "div")
				&& REPattern.initialLowercaseLetter.matcher(graph).find()) {
			StringBuilder sb = new StringBuilder(graph);
			sb.setCharAt(0, Character.toUpperCase(sb.charAt(0)));
			graph = sb.toString();
			// And while we're at it, correct abbrev because we need it
			// for the mtu tag later.
			sb.setLength(0);
			sb.append(abbrev);
			sb.setCharAt(0, Character.toUpperCase(sb.charAt(0)));
			abbrev = sb.toString();
		}
		exp.addAll(makeNewTokens(doc, graph, true, abbrev));
		if (exp.isEmpty())
			return exp;
		// Now some post-hoc modification of the first expanded token:
		if (flex != null && flex.length() > 0) {
			Element t = (Element) exp.get(0);
			while (t != null && !t.getTagName().equals(MaryXML.TOKEN))
				t = MaryDomUtils.getFirstChildElement(t);
			if (t != null) {
				String firstWord = MaryDomUtils.tokenText(t);
				// First expanded word gets the inflection info
				// (usually only used in one-word expansions).
				t.setAttribute("ending", flex);
				// If we have an inflexion ending, keep the abbreviated form
				// as the graphemic form (for pos-tagger and chunker), and
				// save the expanded form in the `sounds_like' attribute.
				t.setAttribute("sounds_like", firstWord);
				MaryDomUtils.setTokenText(t, abbrev);
			}
		}
		return exp;
	}

	protected List<Element> ruleExpandAbbrev(Element token) {
		Document doc = token.getOwnerDocument();
		String orig = MaryDomUtils.tokenText(token);
		String expandedString = ruleExpandAbbrev(orig, false); // do not say specialChar
		// Force an accent on every item in this mtu,
		// just to make sure the token in the mtu whose accent
		// is to be retained actually *has* an accent.
		return makeNewTokens(doc, expandedString, true, orig, true);
	}

	protected String ruleExpandAbbrev(String orig, boolean saySpecialChar) {
		// Spell out if <= 5 letters:
		if (orig.indexOf('.') == -1 && // not dot and
				orig.length() <= 5 || // maximally five letters or
				orig.indexOf('.') == orig.length() - 1 && // final dot and
				orig.length() <= 6) { // maximally five letters + dot
			return spellOutAbbrev(orig, saySpecialChar);
		}
		// Contains dot or other specialChar:
		else if (specialChar.reMatchingChars().matcher(orig).find()) {
			StringBuilder sb = new StringBuilder();
			StringTokenizer st = new StringTokenizer(orig, specialChar.sMatchingCharsSimpleString, saySpecialChar);
			// If saySpecialChar is true, st will return specialChar signs
			// as individual tokens of length one.
			while (st.hasMoreTokens()) {
				// recursive call:
				sb.append(ruleExpandAbbrev(st.nextToken(), saySpecialChar));
				sb.append(" ");
			}
			return sb.toString().trim();
		} else {
			// Else, no dot and too long for spelling - print as is
			return orig;
		}
	}

	/**
	 * Spell out a token. If saySpecialChar is true, specialChar is pronounced as well; otherwise, it is silently ignored. The
	 * spelled out version is returned as a String in which individual tokens are separated by a space character.
	 * 
	 * @param s
	 *            s
	 * @param saySpecialChar
	 *            saySpecialChar
	 * @return sb.toString().trim()
	 */
	private String spellOutAbbrev(String s, boolean saySpecialChar) {
		if (s == null || s.length() == 0) // nothing to do
			return s;
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < s.length(); i++) {
			if (specialChar.matchSpecialChar(s.substring(i, i + 1))) {
				// A specialChar character
				if (saySpecialChar) {
					sb.append(specialChar.expandSpecialChar(s.substring(i, i + 1)));
					sb.append(" ");
				}
			} else { // a normal letter
				if ((i + 2 == s.length() || i + 2 < s.length() && specialChar.matchSpecialChar(s.substring(i + 2, i + 3)))
						&& Character.isUpperCase(s.charAt(i)) && s.charAt(i + 1) == 's') {
					// This is the second-to-last character, it is an uppercase
					// letter, and the last one is a lowercase 's' => treat the
					// s like a plural and attach it to this letter's
					// pronunciation.
					sb.append(s.substring(i, i + 1));
					sb.append("[*s]");
					i++;
				} else {
					sb.append(s.substring(i, i + 1));
					sb.append(" ");
				}
			}
		}
		return sb.toString().trim();
	}

	private static void loadAbbrevDict() throws FileNotFoundException, IOException {
		InputStream abbrevStream = AbbrevEP.class.getResourceAsStream("abbrev.dat");
		BufferedReader br = new BufferedReader(new InputStreamReader(abbrevStream, "UTF-8"));
		String line;
		while ((line = br.readLine()) != null) {
			if (Pattern.compile("^\\#").matcher(line).find() || REPattern.emptyLine.matcher(line).find()) {
				// comment or empty line, ignore
				continue;
			}
			// Fields separated by a slash (/):
			StringTokenizer st = new StringTokenizer(line, "/");
			// Each line contains three fields,
			// key (the abbreviation),
			// flex (optional inflection information),
			// and graph (the graphemic (and possibly phonemic) expanded form.
			// Remove leading/trailing whitespace from each field.
			String key = st.nextToken().trim();
			String flex = st.nextToken().trim();
			String graph = st.nextToken().trim();
			// In addition, replace all whitespace in graph by a single blank
			graph = graph.replaceAll("\\s+", " ");
			// Now key should not contain any whitespace:
			if (Pattern.compile("\\s").matcher(key).find()) {
				logger.info("In abbrev.dat: Abbreviation \"" + key + "\" contains whitespace. Ignoring.");
				continue;
			}
			// In the hashmap, save a reference to an array containing
			// two elements:
			// 1. The inflection information, if any, and
			// 2. The replacement token(s) as one string.
			String[] value = new String[2];
			value[0] = flex;
			value[1] = graph;
			abbrevDict.put(key, value);
		}
	}

}