package com.cognitionis.nlp_knowledge.time;
import com.cognitionis.nlp_files.TransduceRulelistFile;
import com.cognitionis.utils_basickit.FileUtils;
import java.io.File;
import java.util.*;
import java.util.regex.*;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
/**
* Timex Normalization: Given an input string containing a temporal expression
* normalize() method obtains its normalized version (canonical).
*
* There are two main options to do this using external
* knowledge files (entities) plus rules (grammars): 1) do it as grammar
* parsing in one step, or 2) do it as pre-processing + rule-matching in two
* steps. The second option is the strategy chosen in this class.
*
* @author Hector_Llorens
*/
public class TimexNormalizer {

    /** Token reference inside a rule condition: "$N " (the trailing space is part of the syntax). */
    private static final Pattern CONDITION_VAR = Pattern.compile("\\$([0-9]+) ");

    /** Sub-token reference "$(N,SEP,M)": the M-th piece of token N after splitting it on SEP. */
    private static final Pattern SUB_VAR = Pattern.compile("\\$\\(([0-9]+),([_:./-]+),([0-9]+)\\)");

    private final Locale locale;
    private final Timek timek;
    private final TransduceRulelistFile rules;

    /**
     * Creates a normalizer for the JVM default locale.
     */
    public TimexNormalizer() {
        this(Locale.getDefault());
    }

    /**
     * Creates a normalizer for the given locale and loads its rule list from
     * {@code <application path>/resources/time/<lang>/rules.phraselist}, where
     * {@code <lang>} is the locale string with '_' replaced by '-'.
     *
     * @param l locale whose knowledge and rules are loaded; must not be null
     */
    public TimexNormalizer(Locale l) {
        if (l == null) {
            throw new NullPointerException("locale must not be null");
        }
        locale = l;
        timek = new Timek(l);
        String lang = locale.toString().replace('_', '-');
        // Literal replacement: no regex involved, so the Windows '\' separator needs no escaping.
        String app_path = FileUtils.getApplicationPath(TimexNormalizer.class).replace(File.separator + "classes", "");
        String res_path = app_path + File.separator + "resources" + File.separator + "time" + File.separator + lang + File.separator;
        rules = new TransduceRulelistFile(res_path + "rules.phraselist");
    }

    /**
     * Normalizes a temporal expression.
     *
     * The expression is first converted by {@code Timek} into a "text|pattern"
     * pair. If the whole pattern is a known rule pattern, that rule is applied.
     * Otherwise progressively shorter sub-sequences of the pattern are tried
     * (dropping tokens from the left, from the right, and from both ends).
     *
     * Open design questions kept from the original implementation: filtering
     * when several rules match, handling of modifiers (almost, approximately...),
     * a default normalization when nothing matches, and multi-level rule lookup.
     *
     * NOTE(review): on any error (null/empty input, malformed intermediate
     * data, rule evaluation failure) this method prints a message to stderr and
     * terminates the JVM with {@code System.exit(1)}. That behavior is kept
     * because callers may rely on it.
     *
     * @param expr the temporal expression to normalize
     * @return the normalized value, or "NO RULE FOUND FOR: text|pattern" when
     *         no rule matches
     */
    public String normalize(String expr) {
        String norm_value = "default_norm";
        try {
            if (expr == null) {
                throw new IllegalArgumentException("Input expression is null.");
            }
            expr = expr.trim();
            if (expr.length() == 0) {
                throw new IllegalArgumentException("Input expression is empty.");
            }
            String normTextandPattern = timek.getNormTextandPattern(expr);
            if (normTextandPattern == null) {
                throw new IllegalStateException("Problem obtaining NormText and Pattern from: " + expr);
            }
            String[] normTextandPattern_arr = normTextandPattern.split("\\|");
            if (normTextandPattern_arr.length < 2) {
                // Previously surfaced as a bare ArrayIndexOutOfBoundsException message.
                throw new IllegalStateException("Malformed NormText and Pattern (expected 'text|pattern'): " + normTextandPattern);
            }
            String timex_text = normTextandPattern_arr[0];
            String timex_pattern = normTextandPattern_arr[1];
            if (isDebugEnabled()) {
                System.err.println("\n\ntimex:" + expr + " normtext:" + timex_text + " pattern:" + timex_pattern + "\nfound rules:");
            }

            // Compiled once per call instead of once per candidate; assumes
            // rules.getRE() does not change while a single expression is processed.
            Pattern knownPatterns = Pattern.compile("(?i)" + rules.getRE());

            String normalized = applyIfKnown(knownPatterns, timex_text, timex_pattern);
            if (normalized == null) {
                normalized = applyToSubsequence(knownPatterns, timex_text, timex_pattern);
            }
            if (normalized != null) {
                return normalized;
            }
            norm_value = "NO RULE FOUND FOR: " + normTextandPattern;
        } catch (Exception e) {
            System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.getMessage() + "\n");
            if (isDebugEnabled()) {
                e.printStackTrace(System.err);
            }
            System.exit(1);
        }
        return norm_value;
    }

    /**
     * Given a rule and a matched timex text/pattern, outputs the normalized value.
     *
     * Supported rule syntax (tokens are the space-separated pieces of
     * {@code timex_text}, numbered from 1):
     * <ul>
     * <li>{@code if(COND){A}else{B}}: COND is evaluated by the "JavaScript"
     * script engine after replacing every "$N " (note the trailing space) with
     * token N; A is used when the result prints as "true", B otherwise.</li>
     * <li>{@code $N} and {@code $(N)}: replaced by token N. {@code $(N)} exists
     * so a digit can follow the reference.</li>
     * <li>{@code $(N,SEP,M)}: token N is split on the literal separator SEP
     * (one or more of {@code _ : . / -}) and the M-th piece is used. References
     * whose N or M is out of range are left untouched.</li>
     * </ul>
     *
     * @param timex_text normalized timex text (space-separated tokens)
     * @param timex_pattern pattern that matched; only used in the error message
     * @param rule rule body to instantiate, or null if the lookup failed
     * @return the instantiated rule, or an explanatory error string when
     *         {@code rule} is null
     * @throws Exception if a conditional rule is malformed, refers to a missing
     *         token, no JavaScript engine is available, or the condition fails
     *         to evaluate
     */
    public String applyRule(String timex_text, String timex_pattern, String rule) throws Exception {
        if (rule == null) {
            return "Strange error... check rules... RULE matched but not foudn... FOR " + timex_text + "|" + timex_pattern;
        }
        String[] tokens = timex_text.split(" ");
        String result = rule;
        if (result.contains("if(")) {
            result = selectConditionalBranch(result, tokens);
        }
        result = substituteTokens(result, tokens);
        return substituteSubTokens(result, tokens);
    }

    /** Returns true when the system property DEBUG is set to "true" (case-insensitive). */
    private static boolean isDebugEnabled() {
        return "true".equalsIgnoreCase(System.getProperty("DEBUG"));
    }

    /** Applies the rule registered for {@code pattern} if it is a known pattern; returns null otherwise. */
    private String applyIfKnown(Pattern knownPatterns, String text, String pattern) throws Exception {
        if (!knownPatterns.matcher(pattern).matches()) {
            return null;
        }
        return applyRule(text, pattern, rules.getMapValue(pattern));
    }

    /**
     * Tries shorter and shorter token windows of the pattern until one is a
     * known pattern; returns null if none is.
     *
     * Each round drops one more leading token from the "left" candidate and one
     * more trailing token from the "right" candidate; the "mid" candidate is the
     * left one without its last token. Candidates are tried in the order left,
     * right, mid. Text and pattern are trimmed in lockstep, which presumes both
     * carry the same number of space-separated tokens.
     */
    private String applyToSubsequence(Pattern knownPatterns, String text, String pattern) throws Exception {
        String leftPat = pattern;
        String rightPat = pattern;
        String leftText = text;
        String rightText = text;
        while (rightPat.split(" ").length > 2) {
            leftPat = leftPat.substring(leftPat.indexOf(' ') + 1);
            rightPat = rightPat.substring(0, rightPat.lastIndexOf(' '));
            String midPat = leftPat.substring(0, leftPat.lastIndexOf(' '));
            leftText = leftText.substring(leftText.indexOf(' ') + 1);
            rightText = rightText.substring(0, rightText.lastIndexOf(' '));
            String midText = leftText.substring(0, leftText.lastIndexOf(' '));

            String normalized = applyIfKnown(knownPatterns, leftText, leftPat);
            if (normalized == null) {
                normalized = applyIfKnown(knownPatterns, rightText, rightPat);
            }
            if (normalized == null) {
                normalized = applyIfKnown(knownPatterns, midText, midPat);
            }
            if (normalized != null) {
                return normalized;
            }
        }
        return null;
    }

    /** Evaluates the condition of an {@code if(COND){A}else{B}} rule and returns the selected branch (A or B). */
    private static String selectConditionalBranch(String rule, String[] tokens) throws Exception {
        int ifStart = rule.indexOf("if(");
        // The condition ends at "){" rather than at the first ')', so it may contain parentheses.
        int condEnd = rule.indexOf("){", ifStart);
        int elseStart = condEnd < 0 ? -1 : rule.indexOf("}else{", condEnd);
        int ruleEnd = rule.lastIndexOf('}');
        if (condEnd < 0 || elseStart < 0 || ruleEnd < elseStart + 6) {
            throw new IllegalArgumentException("Malformed conditional rule (expected if(COND){A}else{B}): " + rule);
        }

        // Single-pass substitution; quoteReplacement keeps '$' and '\' in a token literal.
        Matcher refs = CONDITION_VAR.matcher(rule.substring(ifStart + 3, condEnd));
        StringBuffer condition = new StringBuffer();
        while (refs.find()) {
            String value = tokenAt(tokens, Integer.parseInt(refs.group(1)), rule);
            refs.appendReplacement(condition, Matcher.quoteReplacement(value + " "));
        }
        refs.appendTail(condition);

        // getEngineByName returns null when no matching engine is installed.
        ScriptEngine engine = new ScriptEngineManager().getEngineByName("JavaScript");
        if (engine == null) {
            throw new IllegalStateException("No JavaScript script engine available to evaluate rule condition: " + condition);
        }
        // NOTE(review): token text comes from the input expression and is spliced
        // into script source that is then evaluated. If the input can be
        // untrusted this is a script-injection vector and needs validation.
        Object verdict = engine.eval(condition.toString());
        if ("true".equals(String.valueOf(verdict))) {
            return rule.substring(condEnd + 2, elseStart);
        }
        return rule.substring(elseStart + 6, ruleEnd);
    }

    /** Returns the 1-based token at {@code position}, failing with a descriptive message if it does not exist. */
    private static String tokenAt(String[] tokens, int position, String rule) {
        if (position < 1 || position > tokens.length) {
            throw new IllegalArgumentException("Rule refers to token $" + position + " but the timex has " + tokens.length + " token(s): " + rule);
        }
        return tokens[position - 1];
    }

    /** Replaces every {@code $N} and {@code $(N)} reference with token N. */
    private static String substituteTokens(String rule, String[] tokens) {
        String result = rule;
        // Highest index first so that "$1" does not clobber the prefix of "$12".
        for (int i = tokens.length; i > 0; i--) {
            String value = tokens[i - 1];
            result = result.replace("$" + i, value).replace("$(" + i + ")", value);
        }
        return result;
    }

    /** Resolves every {@code $(N,SEP,M)} reference whose indices are in range; others are left as they are. */
    private static String substituteSubTokens(String rule, String[] tokens) {
        String result = rule;
        Matcher refs = SUB_VAR.matcher(rule);
        while (refs.find()) {
            int tokenIndex = Integer.parseInt(refs.group(1));
            if (tokenIndex < 1 || tokenIndex > tokens.length) {
                continue;
            }
            // Pattern.quote: the separator is literal text. Unquoted, "." would act
            // as a regex wildcard and split the token into nothing.
            String[] pieces = tokens[tokenIndex - 1].split(Pattern.quote(refs.group(2)));
            int pieceIndex = Integer.parseInt(refs.group(3));
            if (pieceIndex < 1 || pieceIndex > pieces.length) {
                continue;
            }
            result = result.replace(refs.group(), pieces[pieceIndex - 1]);
        }
        return result;
    }
}