/**
* Copyright 2002 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.language.de.preprocess;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import marytts.datatypes.MaryXML;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
* An expansion pattern implementation for multiword patterns.
*
* @author Marc Schröder
*/
public class MultiWordEP extends ExpansionPattern {
    /** The only expansion type name this pattern declares. */
    private final String[] _knownTypes = { "multiword" };
    private final List<String> knownTypes = Arrays.asList(_knownTypes);

    /**
     * @return the list of type names declared by this pattern (a single entry, "multiword")
     */
    public List<String> knownTypes() {
        return knownTypes;
    }

    /** Maps a multiword key (its words separated by single blanks) to its expanded form, as read from multiword.dat. */
    private static final Map<String, String> multiWordDict = new HashMap<String, String>();
    /** Every individual word that occurs in some key of {@link #multiWordDict}. */
    private static final Set<String> constituentWordSet = new HashSet<String>();

    // We don't use sMatchingChars here, but override isCandidate().
    private final Pattern reMatchingChars = null;

    /**
     * @return always <code>null</code>; candidates are selected by {@link #isCandidate(Element)} instead
     */
    public Pattern reMatchingChars() {
        return reMatchingChars;
    }

    private static final Logger logger = MaryUtils.getLogger("MultiWordEP");

    // Fill the dictionary once at class-load time. A missing or unreadable
    // resource is logged and leaves the dictionary empty rather than making
    // the class unusable.
    static {
        try {
            loadMultiWordDict();
        } catch (IOException e) { // also covers FileNotFoundException
            logger.warn("Could not load multiword file", e);
        }
    }

    public MultiWordEP() {
        super();
    }

    /**
     * A token is a candidate if its text is one of the words occurring in any dictionary key.
     *
     * @param t
     *            the token element to test
     * @return true if the token text is a constituent word of some multiword key
     */
    protected boolean isCandidate(Element t) {
        String str = MaryDomUtils.tokenText(t);
        return constituentWordSet.contains(str);
    }

    /**
     * Delegates to {@link #match(String, int)}.
     *
     * @param s
     *            the string to test
     * @param type
     *            the type to test for
     * @return the result of <code>match(s, type)</code>
     */
    protected int canDealWith(String s, int type) {
        return match(s, type);
    }

    /**
     * Accept any non-empty string; the actual dictionary lookup happens in
     * {@link #expand(List, String, int)}.
     *
     * @param s
     *            the string to test
     * @param type
     *            the type to return on success
     * @return <code>type</code> if <code>s</code> is non-empty, -1 otherwise
     */
    protected int match(String s, int type) {
        if (s.length() > 0) {
            return type;
        }
        return -1;
    }

    /**
     * Expand multiwords and eventually replace them with <code>mtu</code> structures. The longest prefix of
     * <code>tokens</code> whose blank-joined text is a dictionary key is looked up; if one is found, the matching tokens
     * are replaced by the expanded form.
     *
     * @param tokens
     *            the candidate tokens, in document order; must be non-null and non-empty
     * @param s
     *            not used by this implementation
     * @param type
     *            not used by this implementation
     * @return the list of newly created elements, or an empty list if no prefix of <code>tokens</code> is in the
     *         dictionary
     * @throws NullPointerException
     *             if <code>tokens</code> is null
     * @throws IllegalArgumentException
     *             if <code>tokens</code> is empty
     */
    protected List<Element> expand(List<Element> tokens, String s, int type) {
        if (tokens == null) {
            throw new NullPointerException("Received null argument");
        }
        if (tokens.isEmpty()) {
            throw new IllegalArgumentException("Received empty list");
        }
        // Expand the list of potential multi-token words.
        // First, try to find longest entries in database, then shorter.
        List<Element> expanded = new ArrayList<Element>();
        List<Element> match = new ArrayList<Element>(tokens);
        String multiword = null;
        while (!match.isEmpty()) {
            String lookup = joinTokenTexts(match);
            logger.debug("Looking up multiword in dictionary: `" + lookup + "'");
            if (multiWordDict.containsKey(lookup)) {
                multiword = lookup;
                break; // OK, found a match
            }
            match.remove(match.size() - 1); // remove last in list
        }
        if (multiword != null) { // found a multiWordDict entry
            expanded.addAll(dictionaryExpandMultiWord(match, multiword));
            logger.debug("Have found multiword in dictionary: `" + multiword + "'");
        }
        // isDebugEnabled() also holds for more verbose levels, unlike an
        // equality test against Level.DEBUG.
        if (logger.isDebugEnabled()) {
            StringBuilder logBuf = new StringBuilder();
            for (Element elt : expanded) {
                if (elt.getTagName().equals(MaryXML.TOKEN)) {
                    logBuf.append(MaryDomUtils.tokenText(elt));
                } else {
                    logBuf.append(elt.getTagName());
                }
                logBuf.append(" ");
            }
            logger.debug("Expanded multiword: " + logBuf.toString());
        }
        if (!expanded.isEmpty()) {
            replaceTokens(match, expanded);
        }
        return expanded;
    }

    /**
     * Join the texts of the given tokens with single blanks and trim the result, yielding the form used for dictionary
     * keys.
     */
    private static String joinTokenTexts(List<Element> elements) {
        StringBuilder sb = new StringBuilder();
        for (Element element : elements) {
            sb.append(MaryDomUtils.tokenText(element));
            sb.append(" ");
        }
        return sb.toString().trim();
    }

    /**
     * Expand a recognised multiword from the dictionary. <code>match</code> is the list of token elements forming the
     * multiword; <code>multiword</code> is a string representation of that multiword. Tokens for the expanded form are
     * created, but not yet attached to the dom tree.
     *
     * @param match
     *            the token elements forming the multiword; must contain at least one element
     * @param multiword
     *            the dictionary key matched by those tokens
     * @return the newly created elements for the expanded form
     */
    private List<Element> dictionaryExpandMultiWord(List<Element> match, String multiword) {
        Document doc = match.get(0).getOwnerDocument();
        List<Element> exp = new ArrayList<Element>();
        // graph = expanded form, possibly with pronunciation
        String graph = multiWordDict.get(multiword);
        exp.addAll(makeNewTokens(doc, graph, true, multiword));
        return exp;
    }

    /**
     * Read the classpath resource <code>multiword.dat</code> (UTF-8) into {@link #multiWordDict} and
     * {@link #constituentWordSet}. Lines starting with <code>#</code> and empty lines are ignored. Every other line must
     * contain at least two fields separated by a slash: the key and its expanded form. Lines that do not are skipped with a
     * warning instead of aborting the load.
     *
     * @throws FileNotFoundException
     *             if the resource cannot be found
     * @throws IOException
     *             if reading the resource fails
     */
    private static void loadMultiWordDict() throws FileNotFoundException, IOException {
        InputStream mwStream = MultiWordEP.class.getResourceAsStream("multiword.dat");
        if (mwStream == null) {
            // getResourceAsStream signals a missing resource with null, not an exception.
            throw new FileNotFoundException("Resource multiword.dat not found for class " + MultiWordEP.class.getName());
        }
        BufferedReader br = new BufferedReader(new InputStreamReader(mwStream, "UTF-8"));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.startsWith("#") || REPattern.emptyLine.matcher(line).find()) {
                    // comment or empty line, ignore
                    continue;
                }
                // Fields separated by a slash (/):
                StringTokenizer st = new StringTokenizer(line, "/");
                if (st.countTokens() < 2) {
                    logger.warn("Ignoring malformed line in multiword file: `" + line + "'");
                    continue;
                }
                // Each line contains two fields,
                // key (the multiword),
                // and graph (the graphemic (and possibly phonemic) expanded form).
                // Remove leading/trailing whitespace from each field and
                // replace all inner whitespace by a single blank.
                String key = st.nextToken().trim().replaceAll("\\s+", " ");
                String graph = st.nextToken().trim().replaceAll("\\s+", " ");
                if (key.length() == 0) {
                    // An empty key would register "" as a constituent word.
                    logger.warn("Ignoring line with empty key in multiword file: `" + line + "'");
                    continue;
                }
                multiWordDict.put(key, graph);
                // In addition, make a note of all constituent words of key:
                constituentWordSet.addAll(Arrays.asList(key.split(" ")));
            }
        } finally {
            br.close(); // also closes the underlying resource stream
        }
    }
}