package com.cognitionis.nlp_files;

import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A TransduceRulelistFile consists of instances of the form
 * {@code pattern (one or more words)|transduced value}. Example:
 * {@code c_Card c_Month c_Card|day=$1,month=$2,year=$3}
 *
 * As in regular expressions, {@code $n} is used for replacements; each
 * {@code $n} corresponds to a value in the pattern.
 *
 * There can also be conditions, like
 * {@code c_Month c_Card|IF($2<32){day=$2,month=$1}ELSE{month=$1,year=$2}}
 *
 * A pattern may contain non-nested groups: {@code (a|b)} for alternatives and
 * {@code [a|b]} for optional alternatives. Each such rule is expanded into one
 * plain rule per alternative when the file is loaded.
 *
 * IMPORTANT: Order matters! LONGER PHRASES MUST APPEAR FIRST.
 *
 * @author Héctor Llorens
 * @since 2011
 */
public class TransduceRulelistFile extends NLPFile {

    /** Shape required of the first non-empty line: something, a '|', then a value without '|'. */
    private static final Pattern FIRST_RULE_LINE = Pattern.compile("^.+\\|[^\\|]*$");
    /** A key containing any of ( ) [ ] needs validation and expansion. */
    private static final Pattern HAS_GROUP_SYMBOL = Pattern.compile(".*[\\(\\)\\[\\]].*");
    /** An opening symbol followed by another opening symbol before any closing one. */
    private static final Pattern NESTED_GROUP = Pattern.compile(".*[\\(\\[][^\\)\\]]*[\\(\\[].*");
    /** First {@code (...)} group of a key. */
    private static final Pattern ALTERNATIVE_GROUP = Pattern.compile("\\([^\\(]+\\)");
    /** First {@code [...]} group of a key. */
    private static final Pattern OPTIONAL_GROUP = Pattern.compile("\\[[^\\[]+\\]");

    private String name;
    // if some other type is needed you can transform it at run-time
    // (dynamic casting is complicated and makes things complicated)
    private HashMap<String, String> map;
    // added for efficiency ONLY. Equivalent to map.keySet(); stays null until a rule has been loaded
    private HashSet<String> keyset;
    // regular expression: "(key1|key2|...)" built from the raw (unexpanded) keys
    private String re;

    /**
     * Loads the rule list stored in the given file.
     *
     * The name is "c_" followed by the lower-cased file name without its
     * extension. Problems found while loading are reported by
     * {@link #isWellFormatted()}; they are not thrown from here.
     *
     * @param filename path of the rule list file, handed to the NLPFile constructor
     */
    public TransduceRulelistFile(String filename) {
        super(filename);
        String fileName = this.f.getName();
        int dot = fileName.lastIndexOf(".");
        // A file name without extension used to crash substring(0, -1); use the whole name instead.
        String baseName = (dot >= 0) ? fileName.substring(0, dot) : fileName;
        name = "c_" + baseName.toLowerCase();
        re = "_no_regex_to_match_";
        map = new HashMap<>();
        keyset = null;
        isWellFormatted(); // good format is mandatory, this loads map<String,String> and re by default
    }

    /**
     * Reads the file, validates every non-empty line and fills the rule map,
     * the key set and the regular expression.
     *
     * Every non-empty line must contain a '|'. Keys containing ( ) [ ] must be
     * compilable as a regular expression, must not contain * + or \ and must
     * not nest groups; such rules are expanded with
     * {@link #expand_rules(String)} before being stored.
     *
     * On any error a message is printed to System.err and false is returned.
     * If the system property DEBUG is "true", the stack trace is printed and
     * the JVM exits with status 1 instead.
     *
     * @return true if the whole file was loaded, false otherwise
     */
    @Override
    public Boolean isWellFormatted() {
        try {
            if (super.getFile() == null) {
                throw new Exception("No file loaded in NLPFile object");
            }
            if (!encoding.equalsIgnoreCase("UTF-8") && !encoding.equalsIgnoreCase("ASCII")) {
                throw new Exception("\n\tError: Only ASCII/UTF-8 text is allowed. " + this.f.getName() + " is " + encoding + "\n");
            }
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(this.f), "UTF-8"))) {
                boolean checked = false; // becomes true once the first non-empty line has been accepted
                String line;
                int linen = 0;
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    linen++;
                    if (line.length() == 0) {
                        continue;
                    }

                    // The first rule is checked against the stricter shape; later ones only need a '|'.
                    boolean hasSeparator = checked
                            ? line.contains("|")
                            : FIRST_RULE_LINE.matcher(line).matches();
                    if (!hasSeparator) {
                        throw new Exception(this.f.getName() + ". Line " + linen + " (" + line + "): Expected | since TranduceRulelist requires it");
                    }

                    int separator = line.lastIndexOf("|");
                    String key = line.substring(0, separator);
                    String value = line.substring(separator + 1);

                    re = checked ? re + "|" + key : "(" + key;
                    checked = true;

                    if (HAS_GROUP_SYMBOL.matcher(key).matches()) {
                        /*
                         * Strategy: given nesting is not allowed, just expand the groups one by one.
                         */
                        // Compiled only for validation: throws if ( and [ cannot be parsed (not closed, ...)
                        Pattern.compile(key);
                        if (key.contains("*") || key.contains("+") || key.contains("\\")) {
                            throw new Exception("Symbols * + \\ are not supported.");
                        }
                        if (NESTED_GROUP.matcher(key).matches()) {
                            // to support it we would need to split only the | outside other groups
                            throw new Exception("Nesting in rule patterns is not supported.");
                        }
                        // line and not key because the value also needs to be copied
                        for (String rule : expand_rules(line)) {
                            int ruleSeparator = rule.lastIndexOf("|");
                            addToMap(rule.substring(0, ruleSeparator), rule.substring(ruleSeparator + 1), rule, linen);
                        }
                    } else {
                        addToMap(key, value, line, linen);
                    }
                }
                if (checked) {
                    re += ")";
                    //re=re.replaceAll("\\.", "\\\\."); // this would be a solution to allow dots
                    keyset = new HashSet<>(map.keySet());
                    // Check for multi-word ambiguity (partial match): can be done lively since longest first can be allowed
                }
            }
        } catch (Exception e) {
            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                e.printStackTrace(System.err);
                System.exit(1);
            } else {
                System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
            }
            this.isWellFormatted = false;
            return false;
        }
        this.isWellFormatted = true;
        return true;
    }

    /**
     * Expands a rule whose key contains groups into plain rules.
     *
     * The first {@code (a|b)} group is replaced by each of its alternatives;
     * when the key has no parenthesis, the first {@code [a|b]} group is
     * replaced by each alternative and additionally by nothing. The result of
     * each replacement is expanded recursively, so remaining groups are
     * handled one at a time. A rule without groups is returned unchanged as
     * the only element.
     *
     * When an alternative has fewer space-separated tokens than the longest
     * alternative of its group, the {@code $n} references in the value are
     * renumbered downwards by the difference.
     *
     * If the key contains an opening symbol but no non-empty, closed group is
     * found for it, the returned list is empty.
     *
     * @param rule a rule of the form {@code key|value}; the last '|' separates key and value
     * @return the expanded rules, each of the form {@code key|value}
     */
    public ArrayList<String> expand_rules(String rule) {
        ArrayList<String> expanded_rules = new ArrayList<>();
        int separator = rule.lastIndexOf("|");
        String key = rule.substring(0, separator);
        String value = rule.substring(separator + 1);

        if (key.contains("(")) {
            expandFirstGroup(key, value, ALTERNATIVE_GROUP, false, expanded_rules);
        } else if (key.contains("[")) {
            expandFirstGroup(key, value, OPTIONAL_GROUP, true, expanded_rules);
        } else {
            expanded_rules.add(rule);
        }
        return expanded_rules;
    }

    /**
     * Replaces the first group matched by groupPattern with each of its
     * alternatives (and with nothing when the group is optional) and appends
     * the recursive expansion of every resulting rule to out.
     */
    private void expandFirstGroup(String key, String value, Pattern groupPattern, boolean optional, ArrayList<String> out) {
        Matcher m = groupPattern.matcher(key);
        if (!m.find()) {
            return; // nothing to expand: the rule contributes no entries
        }
        String elem = m.group();
        String beforeelem = key.substring(0, m.start());
        String afterelem = key.substring(m.end());

        // Splitting an empty string still yields one element, so this is never 0
        // ("even if empty, 0 makes no sense since $ always starts with 1").
        int tokens_before_elem = beforeelem.trim().split(" ").length;
        int tokens = key.trim().split(" ").length;
        // NOTE(review): references are renumbered for i in [tokens_before_elem, tokens), which
        // includes the last token before the group and excludes index `tokens` itself -- confirm
        // this window is the intended one.

        String[] options = elem.substring(1, elem.length() - 1).split("\\|");
        int longest_option_tokens = 1;
        for (String option : options) {
            longest_option_tokens = Math.max(longest_option_tokens, option.trim().split(" ").length);
        }

        for (String option : options) {
            int diff = longest_option_tokens - option.trim().split(" ").length;
            String newvalue = shiftReferences(value, tokens_before_elem, tokens, diff);
            out.addAll(expand_rules(beforeelem + option + afterelem + "|" + newvalue));
        }

        if (optional) {
            // add the empty option: the whole group disappears from the key
            String newvalue = shiftReferences(value, tokens_before_elem, tokens, longest_option_tokens);
            out.addAll(expand_rules(beforeelem.trim() + afterelem + "|" + newvalue));
        }
    }

    /**
     * Rewrites every {@code $i} with from <= i < to as {@code $(i - diff)},
     * in ascending order of i. The value is returned unchanged when diff is 0
     * or when it contains no '$'.
     *
     * The search is textual, so {@code $1} also matches the beginning of
     * {@code $10}.
     */
    private static String shiftReferences(String value, int from, int to, int diff) {
        if (diff == 0 || !value.contains("$")) {
            return value;
        }
        String shifted = value;
        for (int i = from; i < to; i++) {
            shifted = shifted.replaceAll("\\$" + i, "\\$" + (i - diff));
        }
        return shifted;
    }

    /**
     * Stores one plain rule. Key and value are stored trimmed; an empty value
     * makes the key map to itself.
     *
     * @param key   phrase of the rule
     * @param value transduced value, possibly empty
     * @param line  text the rule comes from, used in the error message
     * @param linen line number in the file, used in the error message
     * @throws Exception if the trimmed key is already present in the map
     */
    public void addToMap(String key, String value, String line, int linen) throws Exception {
        // Keys are stored trimmed, so the repetition check must use the trimmed key as well;
        // otherwise " b" and "b" would silently overwrite each other.
        String trimmedKey = key.trim();
        if (map.containsKey(trimmedKey)) {
            throw new Exception(this.f.getName() + ". Line " + linen + " (" + line + "): Repeated phrase. Phraselists must not contain repetitions.");
        }
        if (value.length() != 0) {
            map.put(trimmedKey, value.trim());
        } else {
            map.put(trimmedKey, trimmedKey);
        }
    }

    /**
     * Not supported for this type of file.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public String toPlain(String filename) {
        throw new UnsupportedOperationException("toPlain not applicable to this type of file");
    }

    /** @return the internal map from phrase to transduced value (not a copy) */
    public HashMap<String, String> getMap() {
        return map;
    }

    /** @return the set of phrases, or null if no rule has been loaded */
    public HashSet<String> keySet() {
        return keyset;
    }

    /** @return the transduced value stored for key, or null if there is none */
    public String getMapValue(String key) {
        return map.get(key);
    }

    /** @return the regular expression built from the keys of the file */
    public String getRE() {
        return re;
    }

    /** @return "c_" followed by the lower-cased file name without extension */
    public String getName() {
        return name;
    }

    /**
     * Returns the phrases of this rule list that are also contained in s.
     *
     * @param s set to intersect with
     * @return a new set with the common elements; empty if no rule has been loaded
     */
    public HashSet<String> intersectPhraselist(HashSet<?> s) {
        if (keyset == null) {
            // nothing was loaded (empty or malformed file): the intersection is empty
            return new HashSet<>();
        }
        HashSet<String> intersection = new HashSet<>(keyset); // copy, so keyset itself is not modified
        intersection.retainAll(s); // java standard for set intersection
        return intersection;
    }
}