Timek.java example

Explorer

cognitionis-nlp-libraries-master
- external-tools
  - src
    - main
      - java
        com
        cognitionis
        external_tools
        CRF.java
        CoNLL_scorer.java
        FreeLing.java
        Main.java
        MaltParser.java
        SRL_Roth.java
        SVM.java
        TempEval_scorer.java
        Tokenizer_TreeTagger.java
        TreeTagger.java
        WNInterface.java
- feature-builder
  - src
    - main
      - java
        com
        cognitionis
        feature_builder
        BaseTokenFeatures.java
        CategorizationTE2.java
        Classification.java
        Main.java
        Timen.java
        TimexNormalization.java
- jtimegraph
  - src
    - main
      - java
        com
        cognitionis
        jtimegraph
        Main.java
        gregoriangraph
        GregorianGraph.java
        GregorianPoint.java
        timegraph
        Chain.java
        TimeGraph.java
        TimePoint.java
- knowledgek
  - src
    - main
      - java
        com
        cognitionis
        knowledgek
        Main.java
        NUMEK
        NUMEK.java
        TIMEK
        TIMEK.java
        VerbAttributesK.java
- nlp-files
  - src
    - main
      - java
        com
        cognitionis
        nlp_files
        LengthAlphabeticalComparator.java
        Main.java
        NLPFile.java
        NgramHandler.java
        PhraselistFile.java
        PipesFile.java
        PlainFile.java
        RegexPhraselistFile.java
        Stat.java
        TabFile.java
        TempEvalFiles.java
        TokenizedFile.java
        TokenizedPerSentenceFile.java
        TransduceRulelistFile.java
        TreebankFile.java
        XMLFile.java
        annotation_scorers
        Judgement.java
        Scomp.java
        Score.java
        Scorer.java
        parentical_parsers
        SRLColParser.java
        SyntColParser.java
        SyntColSBarTMPRoleParser.java
- nlp-knowledge
  - src
    - main
      - java
        com
        cognitionis
        nlp_knowledge
        Main.java
        numbers
        Numek.java
        time
        Timek.java
        TimexNormalizer.java
        TimexResolver.java
    - test
      - java
        com
        cognitionis
        nlp_knowledge
        numbers
        NumekTest.java
        time
        TimekTest.java
        TimexNormalizerTest.java
- nlp-lang-models
  - src
    - main
      - java
        com
        cognitionis
        nlp_lang_models
        Main.java
        TextCategorizer.java
        TextCategorizerFingerprint.java
    - test
      - java
        com
        cognitionis
        nlp_lang_models
        TextCategorizerTest.java
- nlp-segmentation
  - src
    - main
      - java
        com
        cognitionis
        nlp_segmentation
        Aligner.java
        Main.java
        SentSplit.java
        Tokenizer_PTB_Rulebased.java
    - test
      - java
        com
        cognitionis
        nlp_segmentation
        TokenizerTest.java
- nlp-taggers
  - src
    - main
      - java
        com
        cognitionis
        nlp_taggers
        Baseline_MostFrequentTag.java
        HMM.java
        Main.java
        Tagger.java
- nlpbt
  - src
    - main
      - java
        com
        cognitionis
        nlpbt
        Main.java
- timeml-basickit
  - src
    - main
      - java
        com
        cognitionis
        timeml_basickit
        Element.java
        Event.java
        Link.java
        Main.java
        TML_file_utils.java
        TimeML.java
        TimeReference.java
        Timex.java
        comparators
        AscINT_eiid_Comparator.java
        AscINT_lid_Comparator.java
        AscStringTimeRefMapComparator.java
        AscStringTimexMapComparator.java
- utils-basickit
  - src
    - main
      - java
        com
        cognitionis
        utils_basickit
        AscStringIntMapComparator.java
        DateUtils.java
        DescStringIntMapComparator.java
        DescStringIntMapEntryListComparator.java
        FileUtils.java
        Main.java
        MapUtils.java
        SAXReader.java
        StringUtils.java
        Xml2PlainHandler.java
        XmlAttribs.java
        statistics
        T_test.java
- wiki-basickit
  - src
    - main
      - java
        com
        cognitionis
        wiki_basickit
        DBpedia_bk.java
        Main.java
        WikiHtml2PlainESHandler.java
        WikiHtml2PlainHandler.java
        Wiki_bk.java

package com.cognitionis.nlp_knowledge.time;

import com.cognitionis.nlp_files.PhraselistFile;
import com.cognitionis.nlp_knowledge.numbers.Numek;
import com.cognitionis.utils_basickit.FileUtils;
import java.io.File;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.cognitionis.nlp_files.LengthAlphabeticalComparator;

public class Timek {
    // this could probably be loaded from json config

    // Base names of the "<name>.phraselist" files the constructor tries to load for the locale.
    public static final String[] phraselist_names = {"weekday", "month", "tunit", "decade", "deictic", "time_of_day", "season", "after_before", "modifier", "relative_ord"};
    // Loaded phraselists keyed by their name in phraselist_names (only those whose file exists).
    public HashMap<String, PhraselistFile> phraselists = new HashMap<>();
    // Merger of the multi-token entries of every loaded phraselist (filled through PhraselistFile.mergeMaps).
    // getNormTextandPattern uses value[0] as the normalized value and value[1] as the pattern symbol.
    public TreeMap<String, String[]> multitokens = new TreeMap<>(new LengthAlphabeticalComparator()); // merger of all multitokens including their class
    // Regex built from the multitokens key set; keeps this sentinel value until the constructor overwrites it.
    public String multitokens_re = "_no_regex_to_match_"; // merger of all multitokens
    // this can probably be an array of dependent knowledges
    // Number knowledge for the same locale; used to normalize numbers in getNormTextandPattern.
    public Numek numek;
    // this could probably be an abstract class
    // Locale this knowledge was loaded for.
    public Locale locale;
    // Keys known so far: the constructor assigns numek.all_keys here (same set instance) and adds every phraselist key.
    public HashSet<String> all_keys;
    // Keys found in more than one knowledge source and not contained in any key of the ambiguous phraselist.
    public HashSet<String> repeated_keys;
    // Optional disambiguation rules; stays null when "ambiguous.phraselist" does not exist for the locale.
    public PhraselistFile ambiguous;
    // Optional list of symbols to drop; stays null when "useless_symbol.phraselist" does not exist for the locale.
    public PhraselistFile useless_symbols;

    /*
     // set TODO develop the phraselist... or something else
     // todo TIMEgranul_re = "(?i)(?:seconds|minute(?:s)?|hour(?:s)?|" + TOD_re + ")";
     * */
    // DEPRECATED could be guessed from phraselists
    // DEPRECATED ALL START WITH c_ public static final String pattern_symbols = "(Card|Ord|c_Tunit|c_Month|c_Weekday|c_Time_of_day|c_Decade|c_Deictic|c_Season|ISOTime|ISODate|.*-Ambiguous)"; // use TIgnore to force decoding in desambiguation
    /**
     * Builds the time knowledge for the default locale (US English), reading
     * the knowledge files from the default resources directory.
     */
    public Timek() {
        this(Locale.US);
    }

    /**
     * Builds the time knowledge for the given locale, reading the knowledge
     * files from the default {@code "resources"} directory.
     *
     * @param l locale whose knowledge files are loaded
     */
    public Timek(Locale l) {
        this(l, "resources");
    }

    /**
     * Builds the time knowledge for a locale from the given resources
     * directory.
     *
     * The knowledge files are looked up under
     * {@code <resources_dir>/time/<lang>/}, where {@code <lang>} is the full
     * locale tag (e.g. {@code en-US}); when that directory does not exist the
     * two-letter language directory is used instead. The optional
     * {@code ambiguous} and {@code useless_symbol} phraselists are loaded when
     * present, followed by every phraselist named in
     * {@link #phraselist_names} whose file exists.
     *
     * Any failure (unsupported locale, unhandled ambiguity between
     * phraselists, loading errors) is caught and reported on
     * {@code System.err}; the constructor itself never throws, so after a
     * failure the instance may be only partially initialized.
     *
     * @param l locale whose knowledge files are loaded
     * @param resources_dir base resources directory containing the
     * {@code time} folder
     */
    public Timek(Locale l, String resources_dir) {
        try {
            locale = l;
            numek = new Numek(l, resources_dir);
            final String lang = l.toString().replace('_', '-');
            final String shortlang = lang.substring(0, 2);
            repeated_keys = new HashSet<>();
            phraselists = new HashMap<>();
            // NOTE(review): all_keys is the very same set instance as numek.all_keys,
            // so the keys added below are also visible through numek.
            all_keys = numek.all_keys;

            // Load from resource knowledge files (all string, string)
            String res_path = FileUtils.getResourcesPath(Timek.class, resources_dir + File.separator + "time" + File.separator);
            final String resource_separator = res_path.contains("/") ? "/" : File.separator;

            // Prefer the full locale directory, fall back to the language-only one.
            final String full_lang_path = res_path + lang + resource_separator;
            res_path = FileUtils.URL_exists(full_lang_path) ? full_lang_path : res_path + shortlang + resource_separator;
            if (!FileUtils.URL_exists(res_path)) {
                throw new Exception("Not-supported locale: " + lang + " nor " + shortlang);
            }

            // these are separated because they allow regex or special things
            final String ambiguous_file = res_path + "ambiguous.phraselist";
            if (FileUtils.URL_exists(ambiguous_file)) {
                ambiguous = new PhraselistFile(ambiguous_file, false, locale, true, true, true);
            }
            final String useless_file = res_path + "useless_symbol.phraselist";
            if (FileUtils.URL_exists(useless_file)) {
                useless_symbols = new PhraselistFile(useless_file, false, locale, false, true, true);
            }

            // TODO: this should only work for some required phraselists
            for (String phra : phraselist_names) {
                final String phra_file = res_path + phra + ".phraselist";
                if (!FileUtils.URL_exists(phra_file)) {
                    // a (possibly required) phraselist does not exist: it is silently skipped
                    continue;
                }
                final PhraselistFile loaded = new PhraselistFile(phra_file, false, locale, false, false, false);
                phraselists.put(phra, loaded);
                // keys already known before this phraselist are potential ambiguities
                repeated_keys.addAll(loaded.intersectPhraselist(all_keys));
                all_keys.addAll(loaded.keySet());
                if (!loaded.getMultiRE().equals("_no_regex_to_match_")) {
                    PhraselistFile.mergeMaps(multitokens, loaded.getMultiMap(), phra);
                }
            }
            multitokens_re = PhraselistFile.get_re_from_keyset(multitokens.keySet());
            // set TODO develop the phraselist... or something else
            // todo TIMEgranul_re = "(?i)(?:seconds|minute(?:s)?|hour(?:s)?|" + TOD_re + ")";

            // A repeated key is considered handled when some ambiguous rule contains it.
            if (ambiguous != null) {
                for (String akey : ambiguous.keySet()) {
                    Iterator<String> pending = repeated_keys.iterator();
                    while (pending.hasNext()) {
                        if (akey.contains(pending.next())) {
                            pending.remove();
                        }
                    }
                }
            }
            if (!repeated_keys.isEmpty()) {
                throw new Exception("This knowledge element has unhandled ambiguity: " + repeated_keys);
            }
        } catch (Exception e) {
            System.err.println("Errors found in " + this.getClass().getName() + ":\n\t" + e.toString());
            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                e.printStackTrace(System.err);
            }
        }
    }

    /**
     * Rewrites a text into an unambiguous "semi-text|semi-pattern" pair using
     * the rules of the {@code ambiguous} phraselist.
     *
     * For every rule key found in the text, the key is replaced by the rule
     * value to obtain the pattern. Text and pattern are then walked token by
     * token (split on single spaces): a {@code c_keep} pattern token keeps the
     * original token, and a {@code c_<phraselist>} pattern token replaces the
     * text token by {@code v__} plus its value in that phraselist.
     *
     * NOTE: ambiguous rules must not contain multi-word replacements, since
     * text and pattern tokens are paired by position. This does not work for
     * multi-word phraselists nor for regex based values (e.g., Mon(\\.)?).
     *
     * @param text the text to disambiguate
     *
     * @return String: {@code text|pattern}, both trimmed; when there is
     * nothing to disambiguate both sides are the trimmed input
     */
    public final String disambiguate(String text) {
        if (ambiguous == null) {
            return text.trim() + "|" + text.trim();
        }
        String pattern = text;
        Matcher anyAmbiguity = Pattern.compile(ambiguous.getRE(), Pattern.CASE_INSENSITIVE).matcher(text);
        if (anyAmbiguity.find()) {
            // iterate over the ambiguity rules and apply each one that matches
            for (String key : ambiguous.getMap().keySet()) {
                // NOTE: negative patterns cannot be used unless we use another parameter
                // (match-negative-replacement), otherwise everything matches
                if (!Pattern.compile(key, Pattern.CASE_INSENSITIVE).matcher(text).find()) {
                    continue;
                }
                // NOTE(review): the check above ignores case but this replacement does not.
                // in the file use TIgnore to force decoding in a subsequent step
                String replaced = text.replaceAll(key, ambiguous.getMapValue(key));
                String[] valueTokens = text.trim().split(" ");
                String[] patternTokens = replaced.trim().split(" ");
                StringBuilder rebuiltText = new StringBuilder();
                StringBuilder rebuiltPattern = new StringBuilder();
                for (int i = 0; i < patternTokens.length; i++) {
                    if (patternTokens[i].equals("c_keep")) {
                        // keep the original token in the pattern and mark the text token as resolved
                        patternTokens[i] = valueTokens[i].replaceAll("v__", "");
                        valueTokens[i] = "v__" + valueTokens[i];
                    }
                    // (original author's remark: this is a bit hacky...)
                    if (patternTokens[i].matches("^c_.*") && !valueTokens[i].startsWith("v__")) {
                        String listName = patternTokens[i].substring(2).toLowerCase();
                        valueTokens[i] = "v__" + phraselists.get(listName).getMapValue(valueTokens[i]);
                    }
                    rebuiltText.append(' ').append(valueTokens[i]);
                    rebuiltPattern.append(' ').append(patternTokens[i]);
                }
                // later rules are matched against the rebuilt, space-delimited text
                text = rebuiltText.append(' ').toString();
                pattern = rebuiltPattern.append(' ').toString();
            }
        }
        return text.trim() + "|" + pattern.trim();
    }

    /**
     * Removes the useless symbols from an input text.
     *
     * The {@code useless_symbol} phraselist is optional: when it was not
     * loaded for the locale (the field is {@code null}), or it provides no
     * regular expression, the text is returned unchanged.
     *
     * NOTE(review): each match consumes its surrounding spaces, so of two
     * consecutive useless symbols only the first one is removed.
     *
     * @param text expects space-separated symbols including leading and
     * trailing spaces
     * @return clean text
     */
    public final String removeUselessSymbols(String text) {
        if (useless_symbols == null) {
            // no useless_symbol.phraselist for this locale: nothing to remove
            return text;
        }
        final String uselessRe = useless_symbols.getRE();
        if (uselessRe == null) {
            return text;
        }
        return text.replaceAll(" " + uselessRe + " ", " ");
    }

    /**
     * *****************************************************************
     * NORMALIZING TEXT INPUT
     * ****************************************************************
     */
    /**
     * Obtains the normalized text (NormText) and Pattern of a timex textual
     * expression, treating the expression as case insensitive.
     *
     * @param timex_text the timex textual expression
     * @return the feature-values for NormText and Pattern (i.e.,
     * normtext|pattern), as returned by the two-argument overload
     */
    public String getNormTextandPattern(String timex_text) {
        return getNormTextandPattern(timex_text, false);
    }

    /**
     * Obtains the normalized text (NormText) and Pattern from a given timex
     * textual expression.
     *
     * Steps: whitespace/comma clean-up and optional lower-casing, removal of
     * useless symbols, joining of tokenized ISO-like dates, fraction
     * normalization, disambiguation, detection of ISO time/date tokens, number
     * normalization through {@code numek}, and finally replacement of
     * multi-token and single-token phraselist entries by their value (text)
     * and their class (pattern).
     *
     * @param timex_text the timex textual expression
     * @param case_sensitive boolean selector; {@code false} or {@code null}
     * (the default) means case insensitive, i.e. the text is lower-cased
     *
     * @return the feature-values for NormText and Pattern (i.e.,
     * normtext|pattern), or {@code null} when any error occurs (the error is
     * reported on {@code System.err}; with the system property
     * {@code DEBUG=true} the stack trace is printed and the JVM exits)
     */
    public String getNormTextandPattern(String timex_text, Boolean case_sensitive) {
        try {
            // BASIC CLEAN-UP  -----------------------------------------------------------------------------------
            timex_text = " " + timex_text.replaceAll("\\s+", " ") + " "; // Ensure correct tokenization
            if (case_sensitive == null || !case_sensitive) {
                timex_text = timex_text.toLowerCase(); //make it all-lowercase
            }
            timex_text = timex_text.replaceAll(" ,", "").replaceAll(", ", " "); // remove tokenized and untokenized commas
            // REMOVE USELESS SYMBOLS: only if they are completely useless
            // YES: "of" is useless in English dates
            // NO: "los" in Spanish is useful to disambiguate between DATE and SET
            timex_text = this.removeUselessSymbols(timex_text);

            final String monthRe = this.phraselists.get("month").getRE();
            final String tunitRe = this.phraselists.get("tunit").getRE();

            // Unify ISO dates
            // NOTE(review): the $n references below (and "$1 $2" further down) depend on how many
            // capturing groups getRE() contributes - confirm against PhraselistFile.
            timex_text = timex_text.replaceAll("([0-9]+) ([-/:]) ([0-9]+|" + monthRe + ") ([-/:]) ([0-9]+)", "$1$2$3$4$5");
            timex_text = timex_text.replaceAll("([0-9]+[-/:]) ((?:[0-9]+|" + monthRe + ")[-/:]) ([0-9]+)", "$1$2$3");
            timex_text = timex_text.replaceAll("([0-9]+) ([-/:](?:[0-9]+|" + monthRe + ")) ([-/:][0-9]+)", "$1$2$3");
            timex_text = timex_text.replaceAll("([0-9]+|" + monthRe + ") ([-/:]) ([0-9]+)", "$1$2$3");
            timex_text = timex_text.replaceAll("((?:[0-9]+|" + monthRe + ")[-/:]) ([0-9]+)", "$1$2");
            timex_text = timex_text.replaceAll("([0-9]+|" + monthRe + ") ([-/:][0-9]+)", "$1$2");
            timex_text = timex_text.replaceAll("([0-9]0)s", "$1 s");
            // Special for modifiers (SHOULD be loaded from a phraselist)
            timex_text = timex_text.replaceAll("mid(?:-)?([0-9]+)", "mid $1").replaceAll("mid-(.+)", "mid $1");
            // Separate adjective periods num-TUnit
            timex_text = timex_text.replaceAll("([^ ]+)-" + tunitRe, "$1 $2");
            // Special for fractions (only one is normalized because there should be no more than one per timex)
            if (timex_text.matches("(?:.* )?(?:[0-9]* )?[1-9][0-9]*/[1-9][0-9]* " + tunitRe + ".*")) {
                final String fractionRe = "(.* )?((?:[0-9]* )?[1-9][0-9]*/[1-9][0-9]*)( " + tunitRe + ".*)";
                String nums2norm = timex_text.replaceFirst(fractionRe, "$2");
                String normalizedfrac = "" + Numek.calc_and_sum_frac(nums2norm);
                timex_text = timex_text.replaceFirst(fractionRe, "$1" + normalizedfrac + "$3");
            }

            //if(this.numek.ordinal_suffixes!=null){ // needed because ordinal card th disambiguation is not included in time.ambiguous
            //    timex_text = timex_text.replaceAll(" ([0-9]+)\\s+("+this.numek.ordinal_suffixes.getRE()+") ", " $1$2 ");
            //}

            // DISAMBIGUATE TO PATTERN IF NEEDED  ----------------------------------------------------------------------------
            // ambiguity (replace also text v_ except c_Card, c_Ord)
            // if there is nothing to disambiguate it will return same text and pattern
            String[] norm_and_pat = this.disambiguate(timex_text).split("\\|");
            String[] textarr = norm_and_pat[0].trim().split(" ");
            String[] patternarr = norm_and_pat[1].trim().split(" ");

            // numbers and ISO (Replace both in text v__ and in Pattern c_)
            // check Nums and ISO (e.g., one million or 25 hundred), if after [0-9] there is no spell leave as it is.
            StringBuilder normtextBuilder = new StringBuilder();
            StringBuilder patternBuilder = new StringBuilder();
            for (int i = 0; i < textarr.length; i++) {
                String currentPat;
                if (patternarr[i].startsWith("c_") || textarr[i].startsWith("v__")) {
                    // already classified by the disambiguation step
                    currentPat = patternarr[i];
                } else if (textarr[i].matches("(?:[0-2])?[0-9][.:][0-5][0-9](?:[.:][0-5][0-9])?(?:(?:p|a)(?:\\.)?m(?:\\.)?|h)?")
                        || textarr[i].matches("(?:[0-2])?[0-9](?:(?:p|a)(?:\\.)?m(?:\\.)?)")) {
                    currentPat = "c_isotime";
                } else if (textarr[i].matches("(?:[0-3])?[0-9][./-](?:(?:[0-3])?[0-9]|" + monthRe + ")[./-][0-9]+") // dd-mm-yyyy
                        || textarr[i].matches(monthRe + "[/-][0-9]+") // MM-yyyy
                        || textarr[i].matches("(?:1[0-2]|(?:0)?[1-9])[/-][1-2][0-9]{3}") // mm-yyyy
                        || textarr[i].matches("[0-9]{4}[./-](?:1[0-2]|(?:0)?[1-9])[./-](?:[0-3])?[0-9](?:(T| )[0-2][0-9][.:][0-5][0-9](?:[.:][0-5][0-9])?)?(?:Z)?") // ISO
                        ) {
                    currentPat = "c_isodate";
                } else {
                    currentPat = textarr[i];
                }
                normtextBuilder.append(' ').append(textarr[i]);
                patternBuilder.append(' ').append(currentPat);
            }

            // Normalize numbers
            String[] normTextandPattern_arr = this.numek.getNormTextandPattern(normtextBuilder + " ", patternBuilder + " ").split("\\|");
            //  3 replace all (for each RE) in both text and pattern (don't worry... there won't be strange matches because v__)
            String timex_normtext = " " + normTextandPattern_arr[0] + " ";
            String timex_pattern = " " + normTextandPattern_arr[1] + " ";

            // Multi-token entries first: value goes to the text, class to the pattern.
            Pattern p = Pattern.compile(" " + (multitokens_re).replaceAll("\\\\\\\\", "\\\\") + " ");
            Matcher m = p.matcher(timex_normtext);
            while (m.find()) {
                String matched = m.group();
                String[] valueAndClass = multitokens.get(matched.trim());
                timex_normtext = replaceAllLiteralIgnoreCase(timex_normtext, matched, " v__" + valueAndClass[0] + " ");
                timex_pattern = replaceAllLiteralIgnoreCase(timex_pattern, matched, " " + valueAndClass[1] + " ");
            }

            // Then single-token entries of every loaded phraselist.
            for (PhraselistFile phraselist : this.phraselists.values()) {
                p = Pattern.compile(" " + phraselist.getRE().replaceAll("\\\\\\\\", "\\\\") + " "); //, Pattern.CASE_INSENSITIVE this must be handled with a parameter (default insensitive, all lowercase)
                m = p.matcher(timex_normtext);
                while (m.find()) {
                    String matched = m.group();
                    timex_normtext = replaceAllLiteralIgnoreCase(timex_normtext, matched, " v__" + phraselist.getMapValue(matched.trim()) + " ");
                    timex_pattern = replaceAllLiteralIgnoreCase(timex_pattern, matched, " " + phraselist.getName() + " ");
                }
            }

            return (timex_normtext.trim().replaceAll("v__", "") + "|" + timex_pattern.trim());
        } catch (Exception e) {
            System.err.println("Errors found:\n\t" + e.toString() + "\n");
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
                System.exit(1);
            }
            return null;
        }
    }

    /**
     * Replaces every case-insensitive occurrence of a literal text.
     *
     * The searched text is quoted and the replacement escaped, so regex
     * metacharacters in a matched token (e.g. the dots of "a.m.") and
     * {@code $} or backslashes in a phraselist value are taken literally.
     */
    private static String replaceAllLiteralIgnoreCase(String haystack, String needle, String replacement) {
        return haystack.replaceAll("(?i)" + Pattern.quote(needle), Matcher.quoteReplacement(replacement));
    }
}