/**
 * Copyright 2002 DFKI GmbH.
 * All Rights Reserved. Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.language.de.preprocess;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import marytts.datatypes.MaryXML;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

/**
 * An expansion pattern implementation for multi-word expressions: sequences of tokens that, joined by single blanks, form a
 * key of the dictionary loaded from the <code>multiword.dat</code> class resource.
 *
 * @author Marc Schröder
 */
public class MultiWordEP extends ExpansionPattern {
	/** The only type name this expansion pattern declares. */
	private final List<String> knownTypes = Arrays.asList("multiword");

	public List<String> knownTypes() {
		return knownTypes;
	}

	/** Maps a multiword key (words separated by single blanks) to its expanded form. */
	private static final Map<String, String> multiWordDict = new HashMap<String, String>();

	/** Every individual word occurring in any key of {@link #multiWordDict}; consulted by {@link #isCandidate(Element)}. */
	private static final Set<String> constituentWordSet = new HashSet<String>();

	// We don't use sMatchingChars here, but override isCandidate().
	private final Pattern reMatchingChars = null;

	public Pattern reMatchingChars() {
		return reMatchingChars;
	}

	private static final Logger logger = MaryUtils.getLogger("MultiWordEP");

	// Must stay below the static fields above: loadMultiWordDict() fills the two collections and uses the logger.
	static {
		try {
			loadMultiWordDict();
		} catch (IOException e) {
			// FileNotFoundException is an IOException, so a single handler covers both cases.
			logger.warn("Could not load abbreviation file", e);
		}
	}

	public MultiWordEP() {
		super();
	}

	/**
	 * A token is a candidate if its text is one of the words occurring in any dictionary key.
	 *
	 * @param t
	 *            the token element to test
	 * @return true if the token text is a constituent word of some multiword key
	 */
	protected boolean isCandidate(Element t) {
		String str = MaryDomUtils.tokenText(t);
		return constituentWordSet.contains(str);
	}

	/**
	 * Delegates to {@link #match(String, int)}.
	 *
	 * @param s
	 *            the string to test
	 * @param type
	 *            the type code to report on success
	 * @return <code>type</code> if <code>s</code> is non-empty, -1 otherwise
	 */
	protected int canDealWith(String s, int type) {
		return match(s, type);
	}

	/**
	 * Accept any non-empty string; the actual dictionary lookup happens in {@link #expand(List, String, int)}.
	 *
	 * @param s
	 *            the string to test
	 * @param type
	 *            the type code to report on success
	 * @return <code>type</code> if <code>s</code> is non-empty, -1 otherwise
	 */
	protected int match(String s, int type) {
		if (s.length() > 0)
			return type;
		return -1;
	}

	/**
	 * Expand multiwords and eventually replace them with <code>mtu</code> structures.
	 * <p>
	 * The token texts are joined with single blanks and looked up in the dictionary. If there is no entry, the last token is
	 * dropped and the lookup is repeated, so the longest prefix of <code>tokens</code> that is a dictionary key wins. On a
	 * match, the matched tokens and their expansion are handed to <code>replaceTokens</code>.
	 *
	 * @param tokens
	 *            the candidate token elements, in document order; must be neither null nor empty
	 * @param s
	 *            not used by this implementation
	 * @param type
	 *            not used by this implementation
	 * @return the list of elements created for the expansion; empty if no prefix of <code>tokens</code> is a dictionary key
	 * @throws NullPointerException
	 *             if <code>tokens</code> is null
	 * @throws IllegalArgumentException
	 *             if <code>tokens</code> is empty
	 */
	protected List<Element> expand(List<Element> tokens, String s, int type) {
		if (tokens == null)
			throw new NullPointerException("Received null argument");
		if (tokens.isEmpty())
			throw new IllegalArgumentException("Received empty list");
		// Expand the list of potential multi-token words.
		// First, try to find longest entries in database, then shorter.
		List<Element> expanded = new ArrayList<Element>();
		// Work on a copy so that the caller's list is not shortened.
		List<Element> match = new ArrayList<Element>(tokens);
		StringBuilder sb = new StringBuilder();
		String multiword = null;
		while (!match.isEmpty()) {
			sb.setLength(0);
			for (Element token : match) {
				sb.append(MaryDomUtils.tokenText(token));
				sb.append(" ");
			}
			String lookup = sb.toString().trim();
			logger.debug("Looking up multiword in dictionary: `" + lookup + "'");
			if (multiWordDict.containsKey(lookup)) {
				multiword = lookup;
				break; // OK, found a match
			}
			match.remove(match.size() - 1); // remove last in list
		}
		if (multiword != null) { // found a multiWordDict entry
			expanded.addAll(dictionaryExpandMultiWord(match, multiword));
			logger.debug("Have found multiword in dictionary: `" + multiword + "'");
		}
		// isDebugEnabled() also holds for more verbose levels (TRACE, ALL), unlike an equality test against Level.DEBUG.
		if (logger.isDebugEnabled()) {
			StringBuilder logBuf = new StringBuilder();
			for (Element elt : expanded) {
				if (elt.getTagName().equals(MaryXML.TOKEN)) {
					logBuf.append(MaryDomUtils.tokenText(elt));
				} else {
					logBuf.append(elt.getTagName());
				}
				logBuf.append(" ");
			}
			logger.debug("Expanded multiword: " + logBuf.toString());
		}
		if (!expanded.isEmpty())
			replaceTokens(match, expanded);
		return expanded;
	}

	/**
	 * Expand a recognised multiword from the dictionary. <code>match</code> is the list of token elements forming the multiword;
	 * <code>multiword</code> is a string representation of that multiword. Tokens for the expanded form are created, but not yet
	 * attached to the dom tree.
	 *
	 * @param match
	 *            the token elements forming the multiword; must not be empty, since the owner document is taken from the
	 *            first element
	 * @param multiword
	 *            the dictionary key that <code>match</code> was found under
	 * @return a new list holding the elements created for the expanded form
	 */
	private List<Element> dictionaryExpandMultiWord(List<Element> match, String multiword) {
		Document doc = match.get(0).getOwnerDocument();
		// graph = expanded form, possibly with pronunciation
		String graph = multiWordDict.get(multiword);
		List<Element> exp = new ArrayList<Element>();
		exp.addAll(makeNewTokens(doc, graph, true, multiword));
		return exp;
	}

	/**
	 * Fill the multiword dictionary and the constituent word set from the <code>multiword.dat</code> class resource, read as
	 * UTF-8.
	 * <p>
	 * Lines starting with <code>#</code> and empty lines are ignored. Every other line holds two fields separated by a slash:
	 * the key and its expanded form. Lines with fewer than two fields, or with an empty key, are skipped with a warning.
	 *
	 * @throws FileNotFoundException
	 *             if the resource cannot be found
	 * @throws IOException
	 *             if reading the resource fails
	 */
	private static void loadMultiWordDict() throws FileNotFoundException, IOException {
		InputStream mwStream = MultiWordEP.class.getResourceAsStream("multiword.dat");
		if (mwStream == null) {
			// getResourceAsStream signals a missing resource by returning null; report it as the declared exception
			// so that the static initializer logs a warning instead of failing class initialization.
			throw new FileNotFoundException("Resource multiword.dat not found");
		}
		try {
			BufferedReader br = new BufferedReader(new InputStreamReader(mwStream, "UTF-8"));
			String line;
			while ((line = br.readLine()) != null) {
				if (line.startsWith("#") || REPattern.emptyLine.matcher(line).find()) {
					// comment or empty line, ignore
					continue;
				}
				// Fields separated by a slash (/):
				StringTokenizer st = new StringTokenizer(line, "/");
				if (st.countTokens() < 2) {
					logger.warn("Ignoring malformed line in multiword.dat: `" + line + "'");
					continue;
				}
				// Each line contains two fields,
				// key (the multiword),
				// and graph (the graphemic (and possibly phonemic) expanded form.
				// Remove leading/trailing whitespace from each field;
				// in addition, replace all whitespace in key and graph by a single blank.
				String key = st.nextToken().trim().replaceAll("\\s+", " ");
				String graph = st.nextToken().trim().replaceAll("\\s+", " ");
				if (key.length() == 0) {
					// An empty key would register "" as a constituent word.
					logger.warn("Ignoring line with empty key in multiword.dat: `" + line + "'");
					continue;
				}
				multiWordDict.put(key, graph);
				// In addition, make a note of all constituent words of key:
				constituentWordSet.addAll(Arrays.asList(key.split(" ")));
			}
		} finally {
			// Closing the underlying stream releases the resource even if reading failed half-way.
			mwStream.close();
		}
	}
}