package com.felix.util;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Vector;
import java.util.regex.Pattern;

/**
 * Pattern-matched and vocabulary-based search and replacement in text. Might
 * be used as a preprocessor for text-to-speech applications.
 * <p>
 * Different kinds of preprocessors for specific use cases are obtained by
 * requesting instances for different integer types. Preprocessors are
 * configured by text files or input streams: the configuration file (or
 * stream) maps the type integers to the vocabulary and rule files in which
 * the replacements are specified.
 * <p>
 * Instances are cached per type in a static map; configuration, context and
 * language id are static and therefore shared by all instances.
 *
 * @author felix
 */
public class Preprocessor {
    private static Object context;
    private static KeyValues config;
    private static HashMap<Integer, Preprocessor> preprocessors;
    private final static String RULES_KEY_EXT = "_rules";
    private final static String VOCAB_KEY_EXT = "_vocab";
    private KeyValue[] _rules;
    private KeyValue[] _vocab;
    private static String _langId = "";
    private Vector<Pattern> _rulePatterns;
    private boolean _useVocab = true;

    /**
     * Get an instance of a preprocessor for a certain type. The types are
     * declared in the configuration file, which is only loaded if no
     * configuration has been loaded before.
     *
     * <pre>
     * Here's an example of a config file:
     * TYPE_EMAIL=1
     * TYPE_NEWS=2
     * 1_rules.de=res/email_rules.txt
     * 2_rules.de=res/news_rules.txt
     * 1_vocab.de=res/email_vocab.txt
     * 2_vocab.de=res/news_vocab.txt
     * </pre>
     *
     * @param type
     *            The type of the preprocessor.
     * @param configPath
     *            The path to the configuration file.
     * @return The preprocessor.
     */
    public static synchronized Preprocessor getInstance(int type,
            String configPath) {
        if (config == null) {
            loadConfig(configPath);
        }
        return setUpInstances(type);
    }

    /**
     * Get the preprocessor for a type, creating it from the already loaded
     * configuration if it is not cached yet.
     *
     * @param type
     *            The type of the preprocessor.
     * @return The preprocessor.
     */
    public static synchronized Preprocessor getInstance(int type) {
        return setUpInstances(type);
    }

    /**
     * Get an instance with vocabulary and rule file from the specified paths,
     * omitting the configuration file. The instance is registered as type 0.
     *
     * @param vocabFile
     *            Path of the vocabulary file, may be null to disable the
     *            vocabulary replacement.
     * @param rulesFile
     *            Path of the rules file.
     * @return The preprocessor.
     */
    public static synchronized Preprocessor getInstance(String vocabFile,
            String rulesFile) {
        return setUpInstances(0, vocabFile, rulesFile);
    }

    /**
     * Get an instance with vocabulary and rule file from the specified paths,
     * omitting the configuration file. The instance is registered under the
     * given type.
     *
     * @param type
     *            The type under which the instance is cached.
     * @param vocabFile
     *            Path of the vocabulary file, may be null to disable the
     *            vocabulary replacement.
     * @param rulesFile
     *            Path of the rules file.
     * @return The preprocessor.
     */
    public static synchronized Preprocessor getInstance(int type,
            String vocabFile, String rulesFile) {
        return setUpInstances(type, vocabFile, rulesFile);
    }

    /**
     * Constructor with two file names for simple use cases not needing
     * preprocessors of multiple kinds and ignoring the Locale.
     *
     * @param rulesFile
     *            Path of the rules file.
     * @param vocabFile
     *            Path of the vocabulary file, may be null to disable the
     *            vocabulary replacement.
     */
    public Preprocessor(String rulesFile, String vocabFile) {
        try {
            readRulesAndVocab(rulesFile, vocabFile);
        } catch (Exception e) {
            System.err.println("problem reading " + vocabFile + " and/or "
                    + rulesFile);
        }
        // Build the patterns outside the try block: rules that were loaded
        // must stay usable even if reading the vocabulary failed afterwards.
        buildRulePatterns();
    }

    /**
     * Select the language used to look up language-specific rule and
     * vocabulary files.
     * <p>
     * Example of the naming scheme:
     *
     * <pre>
     * 8_mail_vocab 8_mail_vocab.de 8_mail_vocab.en
     * </pre>
     *
     * Either an existing language-specific file is used or the default file
     * (without language id). The language id is read when an instance is
     * constructed, so instances that are already cached are not reloaded.
     *
     * @param locale
     *            The (global) Locale of all preprocessors.
     */
    public static synchronized void setLocale(Locale locale) {
        _langId = "." + locale.getLanguage();
    }

    /**
     * Get an instance of a preprocessor for a certain type. The types are
     * declared in the configuration stream, which is only read if no
     * configuration has been loaded before.
     *
     * @param type
     *            The type.
     * @param configStream
     *            The configuration.
     * @return The Preprocessor.
     */
    public static synchronized Preprocessor getInstance(int type,
            InputStream configStream) {
        if (config == null) {
            try {
                loadConfig(configStream);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return setUpInstances(type);
    }

    /**
     * Get an instance of a preprocessor for a certain type. The types are
     * declared in the configuration stream. The context object is needed to
     * denote the application path; it is only stored if no configuration has
     * been loaded before.
     *
     * @param type
     *            The type.
     * @param configStream
     *            The configuration.
     * @param context
     *            Unspecified context (e.g. Android or Axis). Used in
     *            AndroidHelper.
     * @return The Preprocessor.
     */
    public static synchronized Preprocessor getInstance(int type,
            InputStream configStream, Object context) {
        if (config == null) {
            Preprocessor.context = context;
            try {
                loadConfig(configStream);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return setUpInstances(type);
    }

    /**
     * Search and replace patterns or vocabulary tokens as specified in the
     * configuration. Rules are applied first, then the vocabulary (if
     * enabled).
     *
     * @param input
     *            The input String.
     * @return The processed String; the empty String for null input; null if
     *         an unexpected exception occurred during processing.
     */
    public String process(String input) {
        if (input == null) {
            System.err.println("WARNING: Preprocessor: null input");
            return "";
        }
        try {
            String ret = processRules(input);
            if (_useVocab) {
                return replaceVocab(ret);
            }
            return ret;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /** Look up a string value in the loaded configuration. */
    private static String getConfigString(String key) {
        return config.getString(key);
    }

    /**
     * Look up a path value in the loaded configuration.
     *
     * @param key
     *            The configuration key.
     * @return The result of {@code KeyValues.getPathValue} for the key.
     */
    public static String getConfigPath(String key) {
        return config.getPathValue(key);
    }

    /** Create the instance cache on first use. */
    private static void ensureInstanceMap() {
        if (preprocessors == null) {
            preprocessors = new HashMap<Integer, Preprocessor>(0);
        }
    }

    /**
     * Return the cached preprocessor for the type, or create one from the
     * configuration and store it in the cache.
     */
    private static synchronized Preprocessor setUpInstances(int type) {
        ensureInstanceMap();
        Integer key = Integer.valueOf(type);
        // Look up whether the type already exists.
        if (preprocessors.containsKey(key)) {
            return preprocessors.get(key);
        }
        // Otherwise initialise a new preprocessor for this type AND store it
        // in the map.
        Preprocessor preprocessor = new Preprocessor(type);
        preprocessors.put(key, preprocessor);
        return preprocessor;
    }

    /**
     * Return the cached preprocessor for the type, or create one from the
     * given files and store it in the cache. If an instance for the type is
     * already cached, the file arguments are ignored.
     */
    private static synchronized Preprocessor setUpInstances(int type,
            String vocabFile, String rulesFile) {
        ensureInstanceMap();
        Integer key = Integer.valueOf(type);
        // Look up whether the type already exists.
        if (preprocessors.containsKey(key)) {
            return preprocessors.get(key);
        }
        // Otherwise initialise a new preprocessor for this type AND store it
        // in the map.
        Preprocessor preprocessor = new Preprocessor(type, vocabFile,
                rulesFile);
        preprocessors.put(key, preprocessor);
        return preprocessor;
    }

    /**
     * Create a preprocessor whose rule and vocabulary files are looked up in
     * the configuration under {@code <type>_rules} / {@code <type>_vocab},
     * first with the current language id appended and, if that fails, without
     * it. Without a context the files are read from the file system,
     * otherwise via AndroidHelper.
     */
    private Preprocessor(int type) {
        String typeKey = String.valueOf(type);
        String vocabFile = Preprocessor.getConfigString(typeKey
                + VOCAB_KEY_EXT + _langId);
        String rulesFile = Preprocessor.getConfigString(typeKey
                + RULES_KEY_EXT + _langId);
        if (context == null) {
            try {
                _rules = readVocabData(rulesFile);
                _vocab = readVocabData(vocabFile);
            } catch (FileNotFoundException fnfe) {
                // try to get files without language extension
                vocabFile = Preprocessor.getConfigString(typeKey
                        + VOCAB_KEY_EXT);
                rulesFile = Preprocessor.getConfigString(typeKey
                        + RULES_KEY_EXT);
                try {
                    _rules = readVocabData(rulesFile);
                    _vocab = readVocabData(vocabFile);
                } catch (Exception e) {
                    System.err.println(" problem openening file " + vocabFile
                            + " and/or " + rulesFile);
                }
            } catch (Exception e) {
                System.err.println(" problem openening file " + vocabFile
                        + " and/or " + rulesFile);
            }
        } else {
            _rules = readContextData(rulesFile, typeKey + RULES_KEY_EXT,
                    " problem opening file ");
            _vocab = readContextData(vocabFile, typeKey + VOCAB_KEY_EXT,
                    " problem openening file ");
        }
        buildRulePatterns();
    }

    /**
     * Read key/value data through AndroidHelper, first from the
     * language-specific resource and, if that fails, from the resource
     * configured under the default key. Every opened stream is closed.
     *
     * @param localizedFile
     *            Resource name configured for the current language.
     * @param defaultKey
     *            Configuration key of the resource without language id.
     * @param errorPrefix
     *            Prefix of the message printed if the fallback fails too.
     * @return The data, or null if nothing could be read or AndroidHelper is
     *         not usable.
     */
    private KeyValue[] readContextData(String localizedFile,
            String defaultKey, String errorPrefix) {
        if (localizedFile == null || !AndroidHelper.isUsable) {
            return null;
        }
        InputStream localized = AndroidHelper.getRessourceInputStream(
                context, localizedFile);
        try {
            return readVocabData(localized);
        } catch (Exception ignored) {
            // Deliberately ignored: the default resource is tried below.
        } finally {
            closeStream(localized);
        }
        // try to get files without language extension
        String file = localizedFile;
        InputStream fallback = null;
        try {
            file = Preprocessor.getConfigString(defaultKey);
            fallback = AndroidHelper.getRessourceInputStream(context, file);
            return readVocabData(fallback);
        } catch (Exception e2) {
            e2.printStackTrace();
            System.err.println(errorPrefix + file);
            return null;
        } finally {
            closeStream(fallback);
        }
    }

    /** Close a stream if it is not null, reporting (not throwing) failures. */
    private static void closeStream(InputStream stream) {
        if (stream != null) {
            try {
                stream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Create a preprocessor from explicit file paths, omitting the
     * configuration. The files are first tried with the current language id
     * appended and, if a file is not found, without it.
     *
     * @param type
     *            The type the instance is cached under (not used for the file
     *            lookup).
     * @param vocFile
     *            Path of the vocabulary file, may be null.
     * @param rulFile
     *            Path of the rules file.
     */
    private Preprocessor(int type, String vocFile, String rulFile) {
        String vocabFile = null;
        if (vocFile != null) {
            vocabFile = vocFile + _langId;
        }
        String rulesFile = rulFile + _langId;
        try {
            readRulesAndVocab(rulesFile, vocabFile);
        } catch (FileNotFoundException fnfe) {
            // Try to get the files without language extension. The plain
            // paths are used rather than the configuration, which may not be
            // loaded at all on this code path.
            vocabFile = vocFile;
            rulesFile = rulFile;
            try {
                readRulesAndVocab(rulesFile, vocabFile);
            } catch (Exception e) {
                System.err.println(" problem openening file " + vocabFile
                        + " and/or " + rulesFile);
            }
        } catch (Exception e) {
            System.err.println(" problem openening file " + vocabFile
                    + " and/or " + rulesFile);
        }
        buildRulePatterns();
    }

    /**
     * Read the rules and, if a vocabulary file is given, the vocabulary. A
     * null vocabulary file switches the vocabulary replacement off.
     */
    private void readRulesAndVocab(String rulesFile, String vocabFile)
            throws Exception {
        _useVocab = vocabFile != null;
        _rules = readVocabData(rulesFile);
        if (vocabFile != null) {
            _vocab = readVocabData(vocabFile);
        }
    }

    /**
     * Compile the key of every rule into a Pattern. Compilation stops at the
     * first failing rule, so the pattern list is always an index-aligned
     * prefix of the rules array.
     */
    private void buildRulePatterns() {
        _rulePatterns = new Vector<Pattern>();
        if (_rules == null) {
            return;
        }
        try {
            for (KeyValue rule : _rules) {
                _rulePatterns.add(Pattern.compile(rule.getKey()));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Apply all compiled rule patterns in order, replacing every match with
     * the value of the corresponding rule.
     */
    private String processRules(String input) {
        if (_rules == null) {
            return input;
        }
        String temp = input;
        int i = 0;
        try {
            for (Pattern rulePattern : _rulePatterns) {
                KeyValue srcDest = _rules[i++];
                temp = rulePattern.matcher(temp).replaceAll(
                        srcDest.getValue());
            }
        } catch (IndexOutOfBoundsException ie) {
            // Also raised by replaceAll for a replacement referring to a
            // group the pattern does not have; keep what was done so far.
            return temp;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return temp;
    }

    /**
     * Apply all rules via {@link String#replaceAll(String, String)} instead
     * of the precompiled patterns.
     *
     * @param input
     *            The input String.
     * @return The processed String, or the input if no rules are loaded.
     */
    public String processRulesWithoutPatterns(String input) {
        if (_rules == null) {
            return input;
        }
        String temp = input;
        try {
            for (KeyValue srcDest : _rules) {
                temp = temp.replaceAll(srcDest.getKey(), srcDest.getValue());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return temp;
    }

    /**
     * Replace every vocabulary key by its value using
     * {@code StringUtil.replaceAll}.
     */
    private String replaceVocab(String input) {
        if (_vocab == null) {
            return input;
        }
        String temp = input;
        try {
            for (KeyValue srcDest : _vocab) {
                String orig = srcDest.getKey();
                String replace = srcDest.getValue();
                temp = StringUtil.replaceAll(temp, orig, replace);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return temp;
    }

    /** Load the shared configuration from a file ("=" separated, UTF-8). */
    private static synchronized void loadConfig(String configPath) {
        config = new KeyValues(new File(configPath), "=",
                FileUtil.ENCODING_UTF_8);
    }

    /** Load the shared configuration from a stream ("=" separated, UTF-8). */
    private static synchronized void loadConfig(InputStream configStream)
            throws Exception {
        config = new KeyValues(configStream, "=", FileUtil.ENCODING_UTF_8);
    }

    /** Read "=" separated key/value pairs from a UTF-8 file. */
    private KeyValue[] readVocabData(String filename) throws Exception {
        KeyValues keyValues = new KeyValues(new File(filename), "=",
                FileUtil.ENCODING_UTF_8);
        return keyValues.getKeyValues();
    }

    /** Read "=" separated key/value pairs from a UTF-8 stream. */
    private KeyValue[] readVocabData(InputStream stream) throws Exception {
        KeyValues keyValues = new KeyValues(stream, "=",
                FileUtil.ENCODING_UTF_8);
        return keyValues.getKeyValues();
    }

    /**
     * Command line entry point.
     * <p>
     * Without arguments a built-in sample is processed with the type 1
     * preprocessor from {@code res/raw/preprocessor.properties}. With
     * {@code <rules file> <vocab file> <input file>} every non-blank line of
     * the input file is processed and printed. For backward compatibility a
     * four-argument call is still accepted; its first argument is ignored.
     *
     * @param args
     *            The command line arguments.
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            // Alternative sample input:
            // " (sa)tz mit apfel ,WG: das ist mein gg afgel <beispiel\n> satz mit apfel ,na so.was [und] er wiederholt sich mit apfel ,na sowas."
            String testString = " [*] => blablubb<ref bla>huhu</ref>";
            Preprocessor test = Preprocessor.getInstance(1,
                    "res/raw/preprocessor.properties");
            System.out.println(testString);
            System.out.println(test.process(testString));
        } else if (args.length == 3 || args.length == 4) {
            // Java's args do not contain the program name, so the documented
            // three-argument form starts at index 0; the legacy four-argument
            // form skips its first argument.
            int offset = args.length - 3;
            Preprocessor p = new Preprocessor(args[offset], args[offset + 1]);
            try {
                Vector<String> fileLines = FileUtil
                        .getFileLines(args[offset + 2]);
                for (String line : fileLines) {
                    if (line.trim().length() > 0) {
                        System.out.println(p.process(line));
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else {
            String usage = "usage: Preprocessor <rules file> <vocab file> <input file>";
            System.out.println(usage);
        }
    }
}