/** * Copyright 2002-2008 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.language.te.phonemiser; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.CharBuffer; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; /** * Telugu letter to sound(LTS) module - It converts UTF8 graphemes to IT3 pronunciation * * @author sathish, adopted from student work by Jyotsna and Chandu */ public class TeluguLTS { private HashMap<String, String> UTF8toPhoneSymbols; private HashMap<String, String> UTF8toPhoneTypes; private ArrayList<String> listPhoneSym; private ArrayList<String> listPhoneTypes; /** * TeluguLTS constructor * * @param utf8toit3mapStream * utf8toit3mapStream * @throws IOException * IOException */ public TeluguLTS(InputStream utf8toit3mapStream) throws IOException { this.loadPhoneSymbolsAndTypes(utf8toit3mapStream); } /** * Get it3 pronunciation for a word * * @param word * word * @return getStringfromArrayList(listPhoneSym) * @throws IOException * IOException */ public String phonemise(String word) throws IOException { ArrayList<String> utf8CharList = readUTF8String(word); listPhoneSym = new ArrayList<String>(); listPhoneTypes = new ArrayList<String>(); Iterator<String> listrun = utf8CharList.iterator(); while (listrun.hasNext()) { String utf8Char = listrun.next(); String phoneSymbol = UTF8toPhoneSymbols.get(utf8Char); String phoneType = UTF8toPhoneTypes.get(utf8Char); if (phoneSymbol == null) phoneSymbol = getAsciiChar(utf8Char); if (phoneType == null) phoneType = "#"; listPhoneSym.add(phoneSymbol); listPhoneTypes.add(phoneType); // System.out.println(utf8Char+" "+phoneSymbol+" "+phoneType); } removeUnknownSymbols(); schwaHandler(); removeHal(); syllabify(); putStressMark(); return getStringfromArrayList(listPhoneSym); } /** * Add stress mark on first syllable * * @return listPhoneSym */ private ArrayList<String> putStressMark() { listPhoneSym.add(0, "'"); return listPhoneSym; } /** * Add syllable symbols at proper places */ private void syllabify() { for (int i = 0; i < listPhoneTypes.size(); i++) { if (isVowel(i)) { boolean isVowelLater = isVowelLater(i); boolean isNextSemiCon = isNextSemiConsonant(i); if (isVowelLater) { if (isNextSemiCon) { listPhoneSym.add(i + 2, "-"); listPhoneTypes.add(i + 2, "SYM"); } else { listPhoneSym.add(i + 1, "-"); listPhoneTypes.add(i + 1, "SYM"); } } } } } /** * Check whether the character is Vowel or not * * @param pos * pos * @return true if listPhoneTypes.get(pos).equals("VOW"), false otherwise */ private boolean isVowel(int pos) { if (listPhoneTypes.get(pos).equals("VOW")) { return true; } return false; } /** * Check whether the word has vowels after given position * * @param pos * pos * @return true if listPhoneTypes.get(i).equals("VOW"), false otherwise */ private boolean isVowelLater(int pos) { for (int i = (pos + 1); i < listPhoneTypes.size(); i++) { if (listPhoneTypes.get(i).equals("VOW")) { return true; } } return false; } /** * check next position is semiconsonant * * @param pos * pos * @return true listPhoneSym.get(pos + 1).equals("n:") || listPhoneSym.get(pos + 1).equals("a:"), false otherwise */ private boolean isNextSemiConsonant(int pos) { if ((pos + 1) >= listPhoneSym.size()) return false; if (listPhoneSym.get(pos + 1).equals("n:") || listPhoneSym.get(pos + 1).equals("a:")) { return true; } return false; } /** * Get a string from arraylist * * @param aList * aList * @return result in string format */ private String getStringfromArrayList(ArrayList<String> aList) { Iterator<String> listrun = aList.iterator(); StringBuilder result = new StringBuilder(); while (listrun.hasNext()) { result.append(listrun.next()); } return result.toString(); } /** * Hex-decimal representation for a given string * * @param ch * ch * @return hex */ private String toHex4(int ch) { String hex = Integer.toHexString(ch).toUpperCase(); switch (hex.length()) { case 3: return "0" + hex; case 2: return "00" + hex; case 1: return "000" + hex; default: return hex; } } private void loadPhoneSymbolsAndTypes(InputStream inStream) throws IOException { String line; BufferedReader bfr = new BufferedReader(new InputStreamReader(inStream, "UTF-8")); UTF8toPhoneSymbols = new HashMap<String, String>(); UTF8toPhoneTypes = new HashMap<String, String>(); while ((line = bfr.readLine()) != null) { String[] words = line.split("\\|"); UTF8toPhoneSymbols.put(words[0], words[1]); UTF8toPhoneTypes.put(words[0], words[2]); } bfr.close(); } public ArrayList<String> readUTF8String(String word) throws IOException { CharBuffer cbuf = CharBuffer.wrap(word); ArrayList<String> utf8CharList = new ArrayList<String>(); for (int i = 0; i < cbuf.length(); i++) { char ch = cbuf.get(i); utf8CharList.add(toHex4((int) ch)); } return utf8CharList; } public ArrayList<String> readUTF8File(String filename) throws IOException { int ch; ArrayList<String> utf8CharList = new ArrayList<String>(); InputStreamReader ins = new InputStreamReader(new FileInputStream(filename), "UTF8"); while ((ch = ins.read()) >= 0) { utf8CharList.add(toHex4(ch)); } return utf8CharList; } private void printData(String filename) throws IOException { ArrayList<String> utf8CharList = readUTF8File(filename); Iterator<String> listrun = utf8CharList.iterator(); while (listrun.hasNext()) { String utf8Char = listrun.next(); String phoneSymbol = UTF8toPhoneSymbols.get(utf8Char); String phoneType = UTF8toPhoneTypes.get(utf8Char); if (phoneSymbol == null) phoneSymbol = "SPACE"; if (phoneType == null) phoneType = "#"; System.out.println(utf8Char + " " + phoneSymbol + " " + phoneType); } } public void makeProperIt3(String filename) throws IOException { ArrayList<String> utf8CharList = readUTF8File(filename); ArrayList<String> lPhoneSym = new ArrayList<String>(); ArrayList<String> lPhoneTypes = new ArrayList<String>(); Iterator<String> listrun = utf8CharList.iterator(); while (listrun.hasNext()) { String utf8Char = listrun.next(); String phoneSymbol = UTF8toPhoneSymbols.get(utf8Char); String phoneType = UTF8toPhoneTypes.get(utf8Char); if (phoneSymbol == null) phoneSymbol = getAsciiChar(utf8Char); if (phoneType == null) phoneType = "#"; lPhoneSym.add(phoneSymbol); lPhoneTypes.add(phoneType); System.out.println(utf8Char + " " + phoneSymbol + " " + phoneType); } printArrayList(lPhoneSym); printArrayList(lPhoneTypes); lPhoneSym = schwaHandler(lPhoneSym, lPhoneTypes); lPhoneSym = removeHal(lPhoneSym, lPhoneTypes); printArrayList(lPhoneSym); } /** * Remove Halanth from telugu characters * * @param lPhoneSym * lPhoneSym * @param lPhoneTypes * lPhoneTypes * @return lPhoneSym */ private ArrayList<String> removeHal(ArrayList<String> lPhoneSym, ArrayList<String> lPhoneTypes) { for (int i = 0; i < lPhoneTypes.size(); i++) { if (lPhoneTypes.get(i).equals("HLT")) { lPhoneTypes.remove(i); lPhoneSym.remove(i); i--; } } return lPhoneSym; } /** * Remove Halanth from telugu characters */ private void removeHal() { for (int i = 0; i < listPhoneTypes.size(); i++) { if (listPhoneTypes.get(i).equals("HLT")) { listPhoneTypes.remove(i); listPhoneSym.remove(i); i--; } } } /** * Remove unknown symbols */ private void removeUnknownSymbols() { for (int i = 0; i < listPhoneTypes.size(); i++) { if (listPhoneTypes.get(i).equals("#")) { listPhoneTypes.remove(i); listPhoneSym.remove(i); i--; } } } /** * get ascii values for utf8 characters * * @param utf8Char * utf8Char * @return Character.toString(dec) */ private String getAsciiChar(String utf8Char) { int intValue = Integer.parseInt(utf8Char, 16); char dec = (char) intValue; return Character.toString(dec); } /** * Schwa handler for telugu * * @param lPhoneSym * lPhoneSym * @param lPhoneTypes * lPhoneTypes * @return lPhoneSym */ private ArrayList<String> schwaHandler(ArrayList<String> lPhoneSym, ArrayList<String> lPhoneTypes) { String prev, next; for (int i = 0; i < lPhoneTypes.size(); i++) { prev = lPhoneTypes.get(i); if ((i + 1) < lPhoneTypes.size()) { next = lPhoneTypes.get(i + 1); } else { next = lPhoneTypes.get(i); } if ((prev.equals("CON") && next.equals("CON")) || (prev.equals("CON") && next.equals("SYM")) || (prev.equals("CON") && next.equals("#"))) { lPhoneTypes.add(i + 1, "VOW"); lPhoneSym.add(i + 1, "a"); } } return lPhoneSym; } /** * Schwa handler for telugu */ private void schwaHandler() { String prev, next; for (int i = 0; i < listPhoneTypes.size(); i++) { // if(listPhoneTypes.get(i) == null) continue; // if(listPhoneTypes.get(i+1) == null) ; prev = listPhoneTypes.get(i); if ((i + 1) < listPhoneTypes.size()) { next = listPhoneTypes.get(i + 1); } else { next = listPhoneTypes.get(i); } if ((prev.equals("CON") && next.equals("CON")) || (prev.equals("CON") && next.equals("SYM")) || (prev.equals("CON") && next.equals("#"))) { listPhoneTypes.add(i + 1, "VOW"); listPhoneSym.add(i + 1, "a"); } } } /** * print array list * * @param aList * aList */ private void printArrayList(ArrayList<String> aList) { Iterator<String> listrun = aList.iterator(); System.out.println(); while (listrun.hasNext()) { // System.out.print(listrun.next()+" "); System.out.print(listrun.next()); } System.out.println(); } /** * @param args * args * @throws IOException * IOException */ public static void main(String[] args) throws IOException { TeluguLTS utf8r = new TeluguLTS(new FileInputStream("~/openmary/lib/modules/te/lexicon/UTF8phone.te.list")); // utf8r.makeProperIt3("/home/sathish/Desktop/telugu-utf8-txt.done.data"); // String nameString = "\u0C05\u0C38\u0C1F\u0C08\u0C05\u0C37"; // PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("telugu-utf.txt"), "UTF8")); // pw.print(nameString); // pw.flush(); // pw.close(); System.out.println("Result : " + utf8r.phonemise("ప్రకారం")); } }