package com.cognitionis.nlp_knowledge.time; import com.cognitionis.nlp_files.PhraselistFile; import com.cognitionis.nlp_knowledge.numbers.Numek; import com.cognitionis.utils_basickit.FileUtils; import java.io.File; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.cognitionis.nlp_files.LengthAlphabeticalComparator; public class Timek { // this could probably be loaded from json confing public static final String[] phraselist_names = {"weekday", "month", "tunit", "decade", "deictic", "time_of_day", "season", "after_before", "modifier", "relative_ord"}; public HashMap<String, PhraselistFile> phraselists = new HashMap<>(); public TreeMap<String, String[]> multitokens = new TreeMap<>(new LengthAlphabeticalComparator()); // merger of all multitokens including their class public String multitokens_re = "_no_regex_to_match_"; // merger of all multitokens // this can probably be an array of dependent knowledges public Numek numek; // this could probably be an abstract class public Locale locale; public HashSet<String> all_keys; public HashSet<String> repeated_keys; public PhraselistFile ambiguous; public PhraselistFile useless_symbols; /* // set TODO develop the phraselist... or somethign else // todo TIMEgranul_re = "(?i)(?:seconds|minute(?:s)?|hour(?:s)?|" + TOD_re + ")"; * */ // DEPRECATED could be guessed from phraselists // DEPRECATED ALL START WITH c_ public static final String pattern_symbols = "(Card|Ord|c_Tunit|c_Month|c_Weekday|c_Time_of_day|c_Decade|c_Deictic|c_Season|ISOTime|ISODate|.*-Ambiguous)"; // use TIgnore to force decoding in desambiguation public Timek() { this(new Locale("en", "US")); } public Timek(Locale l) { this(l, "resources"); } public Timek(Locale l, String resources_dir) { try { locale = l; numek = new Numek(l, resources_dir); String lang = l.toString().replace('_', '-'); String shortlang = lang.substring(0, 2); String resource_separator=File.separator; all_keys = new HashSet<>(); repeated_keys = new HashSet<>(); phraselists = new HashMap<>(); all_keys = numek.all_keys; // Load from resource knowledge files (all string, string) String res_path = FileUtils.getResourcesPath(Timek.class,resources_dir + File.separator + "time" + File.separator); if(res_path.contains("/")){ resource_separator="/"; } if (!FileUtils.URL_exists(res_path + lang + resource_separator)) { res_path = res_path + shortlang + resource_separator; } else { res_path = res_path + lang + resource_separator; } if (!FileUtils.URL_exists(res_path)) { throw new Exception("Not-supported locale: " + lang + " nor " + shortlang); } else { // these are separated because they allow regex or special things if (FileUtils.URL_exists(res_path + "ambiguous.phraselist")) { ambiguous = new PhraselistFile(res_path + "ambiguous.phraselist", false, locale, true, true,true); } if (FileUtils.URL_exists(res_path + "useless_symbol.phraselist")) { useless_symbols = new PhraselistFile(res_path + "useless_symbol.phraselist", false, locale, false, true,true); } // TODO: this should only work for some required phraselists for (String phra : phraselist_names) { if (FileUtils.URL_exists(res_path + phra + ".phraselist")) { phraselists.put(phra, new PhraselistFile(res_path + phra + ".phraselist", false, locale, false, false,false)); repeated_keys.addAll(phraselists.get(phra).intersectPhraselist(all_keys)); all_keys.addAll(phraselists.get(phra).keySet()); if (!phraselists.get(phra).getMultiRE().equals("_no_regex_to_match_")) { PhraselistFile.mergeMaps(multitokens, phraselists.get(phra).getMultiMap(), phra); } } // ELSE REQUIRED PHRASELIST DOES NOT EXIST } multitokens_re = PhraselistFile.get_re_from_keyset(multitokens.keySet()); // set TODO develop the phraselist... or somethign else // todo TIMEgranul_re = "(?i)(?:seconds|minute(?:s)?|hour(?:s)?|" + TOD_re + ")"; if (ambiguous != null) { for (String akey : ambiguous.keySet()) { HashSet<String> temp_keys = new HashSet<>(repeated_keys); for (String key : repeated_keys) { if (akey.contains(key)) { temp_keys.remove(key); } } repeated_keys.clear(); repeated_keys.addAll(temp_keys); } } if (!repeated_keys.isEmpty()) { throw new Exception("This knowledge element has unhandled ambiguity: " + repeated_keys); } } } catch (Exception e) { System.err.println("Errors found in " + this.getClass().getName() + ":\n\t" + e.toString()); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); } } } /** * Returns an unambiguous semi-text|semi-pattern NOTE: Ambiguous RE must not * contain multi-word replacements * * @param text * * @return String: unambiguous text */ public final String disambiguate(String text) { //System.out.println("disambig call: "+text); String pat = text; if (ambiguous != null) { Pattern pa = Pattern.compile(ambiguous.getRE(), Pattern.CASE_INSENSITIVE); Matcher ma = pa.matcher(text); if (ma.find()) { // open the ambiguities file, iterate, if matches replace... for (String key : ambiguous.getMap().keySet()) { // NOTE: negative patterns cannot be used unless se use another paramter (match-negative-replacement), otherwise everything matches /* Boolean positive=true; String negkey=""; if(key.contains("-") && key.split("-")[0].equalsIgnoreCase("!negative")){ positive=false; negkey=key.split("-")[1]; } if (positive && text.matches(key)) {*/ //System.out.println("debug111: "+key); Pattern ambig = Pattern.compile(key, Pattern.CASE_INSENSITIVE); Matcher m = ambig.matcher(text); if (m.find()) { pat = text.replaceAll(key, ambiguous.getMapValue(key)); // in the file use TIgnore to force decoding in a subsequent step String[] normarr = text.trim().split(" "); String[] patarr = pat.trim().split(" "); text = ""; pat=""; // THIS DOES NOT WORK FOR Multi-word phraselist... nor for regex based values... e.g., Mon(\\.)? for (int i = 0; i < patarr.length; i++) { if (patarr[i].matches("^c_keep$")){ patarr[i]=normarr[i].replaceAll("v__", ""); normarr[i]="v__" +normarr[i]; } if (patarr[i].matches("^c_.*") && !normarr[i].startsWith("v__")) {// pero açò es un poc merda... normarr[i] = "v__" + phraselists.get(patarr[i].substring(2).toLowerCase()).getMapValue(normarr[i]); } text += " " + normarr[i]; pat += " " + patarr[i]; } text += " "; pat += " "; } /*} if (!positive && !text.matches(key)) { return text.replaceAll(key,ambiguous.getMapValue(key)); }*/ } } } return text.trim() + "|" + pat.trim(); } /** * Removes all useless symbols from an input text * * @param text expects space-separated symbols including leading and * training spaces * @return clean text */ public final String removeUselessSymbols(String text) { if (useless_symbols.getRE() != null) { return text.replaceAll(" " + useless_symbols.getRE() + " ", " "); } else { return text; } } /** * ***************************************************************** * NORMALIZING TEXT INPUT * **************************************************************** */ /** * Obtains the normalized text (NormText) and Patter from a given timex * textual expression * * @param timex the timex textual expression (by default it is case * insensitive) * @return the feature-values for NormText and Pattern (i.e., * normtext|pattern) */ public String getNormTextandPattern(String timex_text) { return getNormTextandPattern(timex_text, Boolean.FALSE); } /** * Obtains the normalized text (NormText) and Patter from a given timex * textual expression * * @param timex the timex textual expression * @param case_sensitive boolean selector (by default it is false == case * insensitive) * * @return the feature-values for NormText and Pattern (i.e., * normtext|pattern) */ public String getNormTextandPattern(String timex_text, Boolean case_sensitive) { String timex_normtext = ""; String timex_pattern = ""; String modifiers = ""; // mid,late,early,almost,approx... try { // BASIC CLEAN-UP ----------------------------------------------------------------------------------- timex_text = " " + timex_text.replaceAll("\\s+", " ") + " "; // Ensure correct tokenization if (!case_sensitive) { timex_text = timex_text.toLowerCase(); //make it all-lowercase } timex_text = timex_text.replaceAll(" ,", "").replaceAll(", ", " "); // remove tokenized commas untokenized commas // REMOVE USELESS SYMBOLS: only if they are completely useless // YES: "of" is useless in English dates // NO: "los" in Spanish is useful to disambiguate between DATE and SET timex_text = this.removeUselessSymbols(timex_text); // Unify ISO dates timex_text = timex_text.replaceAll("([0-9]+) ([-/:]) ([0-9]+|" + this.phraselists.get("month").getRE() + ") ([-/:]) ([0-9]+)", "$1$2$3$4$5"); timex_text = timex_text.replaceAll("([0-9]+[-/:]) ((?:[0-9]+|" + this.phraselists.get("month").getRE() + ")[-/:]) ([0-9]+)", "$1$2$3"); timex_text = timex_text.replaceAll("([0-9]+) ([-/:](?:[0-9]+|" + this.phraselists.get("month").getRE() + ")) ([-/:][0-9]+)", "$1$2$3"); timex_text = timex_text.replaceAll("([0-9]+|" + this.phraselists.get("month").getRE() + ") ([-/:]) ([0-9]+)", "$1$2$3"); timex_text = timex_text.replaceAll("((?:[0-9]+|" + this.phraselists.get("month").getRE() + ")[-/:]) ([0-9]+)", "$1$2"); timex_text = timex_text.replaceAll("([0-9]+|" + this.phraselists.get("month").getRE() + ") ([-/:][0-9]+)", "$1$2"); timex_text = timex_text.replaceAll("([0-9]0)s", "$1 s"); // Special for modifiers (SHOULD be loaded from a phraselist) timex_text = timex_text.replaceAll("mid(?:-)?([0-9]+)", "mid $1").replaceAll("mid-(.+)", "mid $1"); // Separate adjective periods num-TUnit timex_text = timex_text.replaceAll("([^ ]+)-" + this.phraselists.get("tunit").getRE(), "$1 $2"); // Special for fractions (only one is normalized because there should be no more than one per timex) if (timex_text.matches("(?:.* )?(?:[0-9]* )?[1-9][0-9]*/[1-9][0-9]* " + this.phraselists.get("tunit").getRE() + ".*")) { String nums2norm = timex_text.replaceFirst("(.* )?((?:[0-9]* )?[1-9][0-9]*/[1-9][0-9]*)( " + this.phraselists.get("tunit").getRE() + ".*)", "$2"); String normalizedfrac = "" + Numek.calc_and_sum_frac(nums2norm); timex_text = timex_text.replaceFirst("(.* )?((?:[0-9]* )?[1-9][0-9]*/[1-9][0-9]*)( " + this.phraselists.get("tunit").getRE() + ".*)", "$1" + normalizedfrac + "$3"); } //if(this.numek.ordinal_suffixes!=null){ // needed because ordinal card th disambiguation is not included in time.ambiguous // timex_text = timex_text.replaceAll(" ([0-9]+)\\s+("+this.numek.ordinal_suffixes.getRE()+") ", " $1$2 "); //} // DISAMBIGUATE TO PATTERN IF NEEDED ---------------------------------------------------------------------------- // ambiguity (replace also text v_ except c_Card, c_Ord) String norm_and_pat = this.disambiguate(timex_text); // if there is nothing to disambiguate it will return same text and pattern timex_text = norm_and_pat.split("\\|")[0].trim(); timex_pattern = norm_and_pat.split("\\|")[1].trim(); String[] textarr = timex_text.split(" "); String[] patternarr = timex_pattern.split(" "); timex_pattern = ""; // reset to build //numbers and ISO (Replace both in text v__ and in Pattern c_) // check Nums and ISO (e.g., one million or 25 hundred), if after [0-9] there is no spell leave as it is. String currentPat; for (int i = 0; i < textarr.length; i++) { if (patternarr[i].startsWith("c_") || textarr[i].startsWith("v__")) { currentPat = patternarr[i]; } else { if (textarr[i].matches("(?:[0-2])?[0-9][.:][0-5][0-9](?:[.:][0-5][0-9])?(?:(?:p|a)(?:\\.)?m(?:\\.)?|h)?") || textarr[i].matches("(?:[0-2])?[0-9](?:(?:p|a)(?:\\.)?m(?:\\.)?)")) { currentPat = "c_isotime"; } else { if (textarr[i].matches("(?:[0-3])?[0-9][./-](?:(?:[0-3])?[0-9]|" + this.phraselists.get("month").getRE() + ")[./-][0-9]+") // dd-mm-yyyy || textarr[i].matches(this.phraselists.get("month").getRE() + "[/-][0-9]+") // MM-yyyy || textarr[i].matches("(?:1[0-2]|(?:0)?[1-9])[/-][1-2][0-9]{3}") // mm-yyyy || textarr[i].matches("[0-9]{4}[./-](?:1[0-2]|(?:0)?[1-9])[./-](?:[0-3])?[0-9](?:(T| )[0-2][0-9][.:][0-5][0-9](?:[.:][0-5][0-9])?)?(?:Z)?") // ISO ) { currentPat = "c_isodate"; } else { currentPat = textarr[i]; } } } timex_normtext += " " + textarr[i]; timex_pattern += " " + currentPat; } // Normalize numbers String normTextandPattern = this.numek.getNormTextandPattern(timex_normtext + " ", timex_pattern + " "); String[] normTextandPattern_arr = normTextandPattern.split("\\|"); timex_normtext = normTextandPattern_arr[0]; timex_pattern = normTextandPattern_arr[1]; // 3 replace all (for each RE) in both text and pattern (don't worry... there wont be strange matches because v__) // if (text matches ^( v__[^\s])+ $ )) break (completely paternized) timex_normtext = " " + timex_normtext + " "; timex_pattern = " " + timex_pattern + " "; Pattern p = Pattern.compile(" " + (multitokens_re).replaceAll("\\\\\\\\", "\\\\") + " "); Matcher m = p.matcher(timex_normtext); while (m.find()) { timex_normtext = timex_normtext.replaceAll("(?i)" + m.group(), " v__" + multitokens.get(m.group().trim())[0] + " "); timex_pattern = timex_pattern.replaceAll("(?i)" + m.group(), " " + multitokens.get(m.group().trim())[1] + " "); } for (String phraselist : this.phraselists.keySet()) { p = Pattern.compile(" " + (this.phraselists.get(phraselist).getRE()).replaceAll("\\\\\\\\", "\\\\") + " "); //, Pattern.CASE_INSENSITIVE this must be handled with a parameter (default insensitive, all lowercap) m = p.matcher(timex_normtext); while (m.find()) { timex_normtext = timex_normtext.replaceAll("(?i)" + m.group(), " v__" + this.phraselists.get(phraselist).getMapValue(m.group().trim()) + " "); timex_pattern = timex_pattern.replaceAll("(?i)" + m.group(), " " + this.phraselists.get(phraselist).getName() + " "); } } } catch (Exception e) { System.err.println("Errors found:\n\t" + e.toString() + "\n"); if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { e.printStackTrace(System.err); System.exit(1); } return null; } return (timex_normtext.trim().replaceAll("v__", "") + "|" + timex_pattern.trim()); } }