/* Copyright (C) 2003 Pierrick Brihaye pierrick.brihaye@wanadoo.fr Original Perl code : Portions (c) 2002 QAMUS LLC (www.qamus.org), (c) 2002 Trustees of the University of Pennsylvania This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc. 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA or connect to: http://www.fsf.org/copyleft/gpl.html */ package marmot.thirdparty.aramorph; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** An in-memory dictionary of prefixes, stems, suffixes and combinations fed with * resources avalaible in the classpath. * TODO : use a Lucene index ;-) or any other fast-access resources. *@author Pierrick Brihaye, 2003 */ class DictionaryHandler { /** Dictionary of prefixes */ private Map<String, List<DictionaryEntry>> prefixes; /** Dictionary of stems */ private Map<String, List<DictionaryEntry>> stems; /** Dictionary of suffixes */ private Map<String, List<DictionaryEntry>> suffixes; /** Compatibility table for prefixes-stems combinations. * TODO : definitely not the best container */ private Set<String> hash_AB; /** Compatibility table for prefixes-suffixes combinations. * TODO : definitely not the best container */ private Set<String> hash_AC; /** Compatibility table for stems-suffixes combinations. * TODO : definitely not the best container */ private Set<String> hash_BC; /** Private constructor to avoid multiple instanciations. */ public DictionaryHandler() { //System.out.println("Initializing in-memory dictionary handler..."); // load 3 lexicons prefixes = loadDictionary("dictPrefixes", this.getClass().getResourceAsStream("dictionaries/dictPrefixes")); stems = loadDictionary("dictStems", this.getClass().getResourceAsStream("dictionaries/dictStems")); suffixes = loadDictionary("dictSuffixes", this.getClass().getResourceAsStream("dictionaries/dictSuffixes")); //load 3 compatibility tables hash_AB = loadCompatibilityTable("tableAB", this.getClass().getResourceAsStream("dictionaries/tableAB")); hash_AC = loadCompatibilityTable("tableAC", this.getClass().getResourceAsStream("dictionaries/tableAC")); hash_BC = loadCompatibilityTable("tableBC", this.getClass().getResourceAsStream("dictionaries/tableBC")); //System.out.println("... done."); }; /** Returns an iterator on the solutions for the given prefix. * @param translitered The prefix * @return The iterator */ protected Collection<DictionaryEntry> getPrefixIterator(String translitered) { return prefixes.get(translitered); } /** Returns an iterator on the solutions for the given stem. * @param translitered The stem * @return The iterator */ protected Collection<DictionaryEntry> getStemIterator(String translitered) { return stems.get(translitered); } /** Returns an iterator on the solutions for the given suffix. * @param translitered The suffix * @return The iterator */ protected Collection<DictionaryEntry> getSuffixIterator(String translitered) { return suffixes.get(translitered); } /** Whether or not the prefix/stem combination is possible. * @param AB The prefix and stem combination. * @return The result */ protected boolean hasAB(String A, String B) { return hash_AB.contains(A + " " + B); } /** Whether or not the prefix/suffix combination is possible. * @param AC The prefix and suffix combination. * @return The result */ protected boolean hasAC(String A, String C) { return hash_AC.contains(A + " " + C); } /** Whether or not the stem/suffix combination is possible. * @param BC The stem and suffix combination. * @return The result */ protected boolean hasBC(String B, String C) { return hash_BC.contains(B + " " + C); } /** Loads a dictionary into a <CODE>Set</CODE> where the <PRE>key</PRE> is entry and its <PRE>value</PRE> is a * <CODE>List</CODE> (each entry can have multiple values) * @param set The set * @param name A human-readable name * @param is The stream * @throws RuntimeException If a problem occurs when reading the dictionary */ private Map<String, List<DictionaryEntry>> loadDictionary(String name, InputStream is) throws RuntimeException { //TODO : should be static Map<String, List<DictionaryEntry>> set = new HashMap<String, List<DictionaryEntry>>(); Set<String> lemmas = new HashSet<String>(); String lemmaID = ""; try { LineNumberReader IN = new LineNumberReader(new InputStreamReader(is,"ISO8859_1")); String line = null; while ((line = IN.readLine()) != null) { // new lemma if (line.startsWith(";; ")) { lemmaID = line.substring(3); // lemmaID's must be unique if (lemmas.contains(lemmaID)) throw new RuntimeException("Lemma " + lemmaID + "in " + name + " (line " + IN.getLineNumber() + ") isn't unique"); lemmas.add(lemmaID); } // comment else if (line.startsWith(";")) {} else { String split[] = line.split("\t",-1); //-1 to avoid triming of trail values //a little error-checking won't hurt : if (split.length != 4) { throw new RuntimeException("Entry in " + name + " (line " + IN.getLineNumber() + ") doesn't have 4 fields (3 tabs)"); } String entry = split[0]; // get the entry for use as key String vocalization = split[1]; String morphology = split[2]; String glossPOS = split[3]; String gloss; String POS; Pattern p; Matcher m; // two ways to get the POS info: // (1) explicitly, by extracting it from the gloss field: p = Pattern.compile(".*" + "<pos>(.+?)</pos>" + ".*"); m = p.matcher(glossPOS); if (m.matches()) { POS = m.group(1); //extract POS from glossPOS gloss = glossPOS; //we clean up the gloss later (see below) } // (2) by deduction: use the morphology (and sometimes the voc and gloss) to deduce the appropriate POS else { // we need the gloss to guess proper names gloss = glossPOS; // null prefix or suffix if (morphology.matches("^(Pref-0|Suff-0)$")) { POS = ""; } else if (morphology.matches("^F" + ".*")) { POS = vocalization + "/FUNC_WORD"; } else if (morphology.matches("^IV" + ".*")) { POS = vocalization + "/VERB_IMPERFECT"; } else if (morphology.matches("^PV" + ".*")) { POS = vocalization + "/VERB_PERFECT"; } else if (morphology.matches("^CV" + ".*")) { POS = vocalization + "/VERB_IMPERATIVE"; } else if (morphology.matches("^N" + ".*")) { // educated guess (99% correct) if (gloss.matches("^[A-Z]" + ".*")) { POS = vocalization + "/NOUN_PROP"; } // (was NOUN_ADJ: some of these are really ADJ's and need to be tagged manually) else if (vocalization.matches(".*" + "iy~$")) { POS = vocalization + "/NOUN"; } else POS = vocalization + "/NOUN"; } else { throw new RuntimeException("No POS can be deduced in " + name + " (line " + IN.getLineNumber() + ")"); } } // clean up the gloss: remove POS info and extra space, and convert upper-ASCII to lower (it doesn't convert well to UTF-8) gloss = gloss.replaceFirst("<pos>.+?</pos>",""); gloss = gloss.trim(); //TODO : we definitely need a translate() method in the java packages ! gloss = gloss.replaceAll(";","/"); //TODO : is it necessary ? gloss = gloss.replaceAll("�","A"); gloss = gloss.replaceAll("�","A"); gloss = gloss.replaceAll("�","A"); gloss = gloss.replaceAll("�","A"); gloss = gloss.replaceAll("�","A"); gloss = gloss.replaceAll("�","A"); gloss = gloss.replaceAll("�","C"); gloss = gloss.replaceAll("�","E"); gloss = gloss.replaceAll("�","E"); gloss = gloss.replaceAll("�","E"); gloss = gloss.replaceAll("�","E"); gloss = gloss.replaceAll("�","I"); gloss = gloss.replaceAll("�","I"); gloss = gloss.replaceAll("�","I"); gloss = gloss.replaceAll("�","I"); gloss = gloss.replaceAll("�","N"); gloss = gloss.replaceAll("�","O"); gloss = gloss.replaceAll("�","O"); gloss = gloss.replaceAll("�","O"); gloss = gloss.replaceAll("�","O"); gloss = gloss.replaceAll("�","O"); gloss = gloss.replaceAll("�","U"); gloss = gloss.replaceAll("�","U"); gloss = gloss.replaceAll("�","U"); gloss = gloss.replaceAll("�","U"); gloss = gloss.replaceAll("�","a"); gloss = gloss.replaceAll("�","a"); gloss = gloss.replaceAll("�","a"); gloss = gloss.replaceAll("�","a"); gloss = gloss.replaceAll("�","a"); gloss = gloss.replaceAll("�","a"); gloss = gloss.replaceAll("�","c"); gloss = gloss.replaceAll("�","e"); gloss = gloss.replaceAll("�","e"); gloss = gloss.replaceAll("�","e"); gloss = gloss.replaceAll("�","e"); gloss = gloss.replaceAll("�","i"); gloss = gloss.replaceAll("�","i"); gloss = gloss.replaceAll("�","i"); gloss = gloss.replaceAll("�","i"); gloss = gloss.replaceAll("�","n"); gloss = gloss.replaceAll("�","o"); gloss = gloss.replaceAll("�","o"); gloss = gloss.replaceAll("�","o"); gloss = gloss.replaceAll("�","o"); gloss = gloss.replaceAll("�","o"); gloss = gloss.replaceAll("�","u"); gloss = gloss.replaceAll("�","u"); gloss = gloss.replaceAll("�","u"); gloss = gloss.replaceAll("�","u"); gloss = gloss.replaceAll("�","AE"); gloss = gloss.replaceAll("�","Sh"); gloss = gloss.replaceAll("�","Zh"); gloss = gloss.replaceAll("�","ss"); gloss = gloss.replaceAll("�","ae"); gloss = gloss.replaceAll("�","sh"); gloss = gloss.replaceAll("�","zh"); // note that although we read 4 fields from the dict we now save 5 fields in the hash table // because the info in last field, glossPOS, was split into two: gloss and POS DictionaryEntry de = new DictionaryEntry(entry, lemmaID, vocalization, morphology, gloss, POS); List<DictionaryEntry> list = set.get(entry); if (list == null) { list = new LinkedList<DictionaryEntry>(); set.put(entry, list); } list.add(de); } } IN.close(); } catch (IOException e) { throw new RuntimeException("Can not open : " + name); } return set; } /** Loads a compatibility table into a <CODE>Set</CODE>. * @param set The set * @param name A human-readable name * @param is The stream * @throws RuntimeException If a problem occurs when reading the compatibility table */ private Set<String> loadCompatibilityTable(String name, InputStream is) throws RuntimeException { Set<String> set = new HashSet<String>(); try { LineNumberReader IN = new LineNumberReader(new InputStreamReader(is,"ISO8859_1")); String line = null; while ((line = IN.readLine()) != null) { if (!line.startsWith(";")) { //Ignore comments line = line.trim(); line = line.replaceAll("\\s+", " "); set.add(line); } } IN.close(); } catch (IOException e) { throw new RuntimeException(e); } return set; } }