Numek.java example

Explorer

cognitionis-nlp-libraries-master
- external-tools
  - src
    - main
      - java
        com
        cognitionis
        external_tools
        CRF.java
        CoNLL_scorer.java
        FreeLing.java
        Main.java
        MaltParser.java
        SRL_Roth.java
        SVM.java
        TempEval_scorer.java
        Tokenizer_TreeTagger.java
        TreeTagger.java
        WNInterface.java
- feature-builder
  - src
    - main
      - java
        com
        cognitionis
        feature_builder
        BaseTokenFeatures.java
        CategorizationTE2.java
        Classification.java
        Main.java
        Timen.java
        TimexNormalization.java
- jtimegraph
  - src
    - main
      - java
        com
        cognitionis
        jtimegraph
        Main.java
        gregoriangraph
        GregorianGraph.java
        GregorianPoint.java
        timegraph
        Chain.java
        TimeGraph.java
        TimePoint.java
- knowledgek
  - src
    - main
      - java
        com
        cognitionis
        knowledgek
        Main.java
        NUMEK
        NUMEK.java
        TIMEK
        TIMEK.java
        VerbAttributesK.java
- nlp-files
  - src
    - main
      - java
        com
        cognitionis
        nlp_files
        LengthAlphabeticalComparator.java
        Main.java
        NLPFile.java
        NgramHandler.java
        PhraselistFile.java
        PipesFile.java
        PlainFile.java
        RegexPhraselistFile.java
        Stat.java
        TabFile.java
        TempEvalFiles.java
        TokenizedFile.java
        TokenizedPerSentenceFile.java
        TransduceRulelistFile.java
        TreebankFile.java
        XMLFile.java
        annotation_scorers
        Judgement.java
        Scomp.java
        Score.java
        Scorer.java
        parentical_parsers
        SRLColParser.java
        SyntColParser.java
        SyntColSBarTMPRoleParser.java
- nlp-knowledge
  - src
    - main
      - java
        com
        cognitionis
        nlp_knowledge
        Main.java
        numbers
        Numek.java
        time
        Timek.java
        TimexNormalizer.java
        TimexResolver.java
    - test
      - java
        com
        cognitionis
        nlp_knowledge
        numbers
        NumekTest.java
        time
        TimekTest.java
        TimexNormalizerTest.java
- nlp-lang-models
  - src
    - main
      - java
        com
        cognitionis
        nlp_lang_models
        Main.java
        TextCategorizer.java
        TextCategorizerFingerprint.java
    - test
      - java
        com
        cognitionis
        nlp_lang_models
        TextCategorizerTest.java
- nlp-segmentation
  - src
    - main
      - java
        com
        cognitionis
        nlp_segmentation
        Aligner.java
        Main.java
        SentSplit.java
        Tokenizer_PTB_Rulebased.java
    - test
      - java
        com
        cognitionis
        nlp_segmentation
        TokenizerTest.java
- nlp-taggers
  - src
    - main
      - java
        com
        cognitionis
        nlp_taggers
        Baseline_MostFrequentTag.java
        HMM.java
        Main.java
        Tagger.java
- nlpbt
  - src
    - main
      - java
        com
        cognitionis
        nlpbt
        Main.java
- timeml-basickit
  - src
    - main
      - java
        com
        cognitionis
        timeml_basickit
        Element.java
        Event.java
        Link.java
        Main.java
        TML_file_utils.java
        TimeML.java
        TimeReference.java
        Timex.java
        comparators
        AscINT_eiid_Comparator.java
        AscINT_lid_Comparator.java
        AscStringTimeRefMapComparator.java
        AscStringTimexMapComparator.java
- utils-basickit
  - src
    - main
      - java
        com
        cognitionis
        utils_basickit
        AscStringIntMapComparator.java
        DateUtils.java
        DescStringIntMapComparator.java
        DescStringIntMapEntryListComparator.java
        FileUtils.java
        Main.java
        MapUtils.java
        SAXReader.java
        StringUtils.java
        Xml2PlainHandler.java
        XmlAttribs.java
        statistics
        T_test.java
- wiki-basickit
  - src
    - main
      - java
        com
        cognitionis
        wiki_basickit
        DBpedia_bk.java
        Main.java
        WikiHtml2PlainESHandler.java
        WikiHtml2PlainHandler.java
        Wiki_bk.java

package com.cognitionis.nlp_knowledge.numbers;

import com.cognitionis.nlp_files.PhraselistFile;
import com.cognitionis.utils_basickit.FileUtils;
import java.io.File;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Numek {

    // this could probably be an abstract class
    // Locale received by the constructor; used to lowercase input and passed to every phraselist.
    public Locale locale;
    // Keys of every loaded phraselist except "ambiguous" and "restrictions".
    public HashSet<String> all_keys;
    // Keys that intersect with the keys of a previously loaded phraselist. The constructor
    // reports an error if any remain after discarding those contained in an ambiguous key.
    public HashSet<String> repeated_keys;
    // Optional (null when ambiguous.phraselist does not exist).
    public PhraselistFile ambiguous;
    // this could probably be a Map of phraselists (dynamic)
    // Always loaded: delimiters, units, tens, magnitudes, ordinal_units, decimal_point_separator.
    // Optional (null when the file does not exist): irregular_tens, special_groups,
    // restrictions, ordinal_irregular_tens, ordinal_tens, ordinal_suffixes, group_separators.
    public PhraselistFile delimiters;
    public PhraselistFile units;
    public PhraselistFile tens;
    public PhraselistFile irregular_tens;
    public PhraselistFile magnitudes;
    public PhraselistFile special_groups;
    // Read by operateMagnitude to override the maximum magnitude allowed on the left of a magnitude.
    public PhraselistFile restrictions;
    public PhraselistFile ordinal_units;
    public PhraselistFile ordinal_irregular_tens;
    public PhraselistFile ordinal_tens;
    public PhraselistFile ordinal_suffixes;
    // Matches of this list's RE are replaced by "." before numeric parsing in text2number.
    public PhraselistFile decimal_point_separator;
    // Matches of this list's RE are removed before numeric parsing in text2number.
    public PhraselistFile group_separators;

    //Number class is needed to store values objects by reference
    private class Number {

        public Double value;

        public Number() {
            value = 0.0;
        }

        public Number(Double v) {
            value = v;
        }
    }
    // A magnitude is any token that may have numbers or lower-order magnitudes on its left.
    // A number is any non-magnitude token; it cannot be operated with magnitudes on its left.
    /** Upper bound for explicit (single-token) numbers. */
    public final static Integer MAX_NUMBERS_ORDER = 99;
    // Roman numerals are static because they never change.
    public static String romans = "IVXLCDM";
    public static String romans5 = "VLD";
    /** Value of each Roman numeral symbol. */
    public final static HashMap<String, Integer> romansMap = new HashMap<>();

    static {
        // Parallel tables: symbol at position k has the value at position k.
        final String[] symbols = {"I", "V", "X", "L", "C", "D", "M"};
        final int[] values = {1, 5, 10, 50, 100, 500, 1000};
        for (int k = 0; k < symbols.length; k++) {
            romansMap.put(symbols[k], values[k]);
        }
    }

    /**
     * Creates a Numek for the default language (en-US).
     */
    public Numek() {
        this(Locale.US); // same locale as language "en", country "US"
    }

    /**
     * Creates a Numek for the given locale, reading the phraselists from the
     * default resources location ("resources").
     *
     * @param l locale whose number phraselists are loaded
     */
    public Numek(Locale l) {
        this(l, "resources");
    }

    /**
     * Creates a Numek for the given locale, reading the phraselists from
     * {@code <resources_dir>/numbers/<locale>/}.
     * <p>
     * The full locale directory (e.g. "en-US") is preferred; when it does not
     * exist the two-letter language directory (e.g. "en") is used. Required
     * phraselists are loaded unconditionally, optional ones only when their
     * file exists. Any failure (unsupported locale, unhandled key ambiguity,
     * loading errors) is reported on standard error and not rethrown, so the
     * instance may be left partially initialized.
     *
     * @param l locale whose number phraselists are loaded
     * @param resources_dir resources location passed to FileUtils.getResourcesPath
     */
    public Numek(Locale l, String resources_dir) {
        locale = l;
        final String lang = l.toString().replace('_', '-');
        final String shortlang = lang.substring(0, 2);
        all_keys = new HashSet<>();
        repeated_keys = new HashSet<>();
        try {
            String res_path = FileUtils.getResourcesPath(Numek.class, resources_dir + File.separator + "numbers" + File.separator);
            // Paths that contain "/" are always joined with "/" regardless of the platform separator.
            final String resource_separator = res_path.contains("/") ? "/" : File.separator;
            final String fullLocaleDir = res_path + lang + resource_separator;
            res_path = FileUtils.URL_exists(fullLocaleDir) ? fullLocaleDir : res_path + shortlang + resource_separator;

            if (!FileUtils.URL_exists(res_path)) {
                throw new Exception("Not-supported locale: " + lang + " nor " + shortlang);
            }

            // this can be done dynamically given .conf json file (requiring specific files...)
            // The load order matters: each list is checked against the keys gathered so far.
            if (FileUtils.URL_exists(res_path + "ambiguous.phraselist")) {
                ambiguous = new PhraselistFile(res_path + "ambiguous.phraselist", false, locale, true, true, true);
            }

            // Required phraselists.
            delimiters = new PhraselistFile(res_path + "delimiters.phraselist", false, locale, false, false, false);
            all_keys.addAll(delimiters.keySet());
            units = new PhraselistFile(res_path + "units.phraselist", false, locale, true, false, false);
            registerPhraselistKeys(units);
            tens = new PhraselistFile(res_path + "tens.phraselist", false, locale, true, false, false);
            registerPhraselistKeys(tens);
            magnitudes = new PhraselistFile(res_path + "magnitudes.phraselist", false, locale, true, false, false);
            registerPhraselistKeys(magnitudes);
            decimal_point_separator = new PhraselistFile(res_path + "decimal_point_separator.phraselist", false, locale, false, false, false);
            registerPhraselistKeys(decimal_point_separator);
            ordinal_units = new PhraselistFile(res_path + "ordinal_units.phraselist", false, locale, true, false, false);
            registerPhraselistKeys(ordinal_units);

            // Optional phraselists: the field stays null when the file is missing.
            if (FileUtils.URL_exists(res_path + "irregular_tens.phraselist")) {
                irregular_tens = new PhraselistFile(res_path + "irregular_tens.phraselist", false, locale, true, false, false);
                registerPhraselistKeys(irregular_tens);
            }
            if (FileUtils.URL_exists(res_path + "special_groups.phraselist")) {
                special_groups = new PhraselistFile(res_path + "special_groups.phraselist", false, locale, true, false, false);
                registerPhraselistKeys(special_groups);
            }
            if (FileUtils.URL_exists(res_path + "group_separators.phraselist")) {
                group_separators = new PhraselistFile(res_path + "group_separators.phraselist", false, locale, false, false, false);
                registerPhraselistKeys(group_separators);
            }
            if (FileUtils.URL_exists(res_path + "ordinal_irregular_tens.phraselist")) {
                // NOTE(review): the only list built with the 3-argument constructor - confirm this is intended.
                ordinal_irregular_tens = new PhraselistFile(res_path + "ordinal_irregular_tens.phraselist", false, locale);
                registerPhraselistKeys(ordinal_irregular_tens);
            }
            if (FileUtils.URL_exists(res_path + "ordinal_tens.phraselist")) {
                ordinal_tens = new PhraselistFile(res_path + "ordinal_tens.phraselist", false, locale, true, false, false);
                registerPhraselistKeys(ordinal_tens);
            }
            if (FileUtils.URL_exists(res_path + "ordinal_suffixes.phraselist")) {
                ordinal_suffixes = new PhraselistFile(res_path + "ordinal_suffixes.phraselist", false, locale, false, false, false);
                registerPhraselistKeys(ordinal_suffixes);
            }
            if (FileUtils.URL_exists(res_path + "restrictions.phraselist")) {
                // do not count for ambiguity
                restrictions = new PhraselistFile(res_path + "restrictions.phraselist", false, locale, true, false, false);
            }

            // A repeated key is considered handled when some ambiguous key contains it.
            if (ambiguous != null) {
                for (String akey : ambiguous.keySet()) {
                    for (Iterator<String> pending = repeated_keys.iterator(); pending.hasNext();) {
                        if (akey.contains(pending.next())) {
                            pending.remove();
                        }
                    }
                }
            }
            if (!repeated_keys.isEmpty()) {
                // there should be a way to check if there is an ambiguity variable (all must have a config.json file)
                throw new Exception("This knowledge element has unhandled ambiguity: " + repeated_keys);
            }
            // part modifiers are not used yet because more research is needed... can be language dependent
        } catch (Exception e) {
            System.err.println("Errors found in " + this.getClass().getName() + ":\n\t" + e.toString());
            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                e.printStackTrace(System.err);
            }
        }
    }

    /**
     * Records the keys of a freshly loaded phraselist: first the ones that
     * intersect with the keys gathered so far go to repeated_keys, then all of
     * them are added to all_keys.
     */
    private void registerPhraselistKeys(PhraselistFile phraselist) throws Exception {
        repeated_keys.addAll(phraselist.intersectPhraselist(all_keys));
        all_keys.addAll(phraselist.keySet());
    }

    /** Optional integer part, a single space, then numerator/denominator (ASCII digits only). */
    private static final Pattern MIXED_FRACTION = Pattern.compile("(?:([0-9]+) )?([0-9]+)/([0-9]+)");

    /**
     * Returns value of a fraction (e.g., 1/2 --> 0.5) or the sum of a number
     * and a fraction (e.g., 2 1/2 --> 2.5).
     * <p>
     * Input that is null or does not have the expected shape yields 0.0; a
     * warning is printed to standard error when the DEBUG system property is
     * "true". A zero denominator is not treated specially, so the result of
     * the floating-point division (Infinity or NaN) is returned.
     *
     * @param snumber string number (separator is whitespace " ")
     *
     * @return result, or 0.0 when snumber is not a fraction
     */
    public static Double calc_and_sum_frac(String snumber) {
        if (snumber != null) {
            Matcher parts = MIXED_FRACTION.matcher(snumber);
            if (parts.matches()) {
                double fraction = Double.parseDouble(parts.group(2)) / Double.parseDouble(parts.group(3));
                // group(1) is null when there is no leading integer part
                return parts.group(1) == null ? fraction : Double.parseDouble(parts.group(1)) + fraction;
            }
        }
        if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
            System.err.println("Warining: error normalizing fraction (" + snumber + ") it has been set to 0.0 by default.");
        }
        return 0.0;
    }

    /**
     * Converts a spelled number to numeric (twenty -> 20).
     * <p>
     * Handled inputs, in this order:
     * <ul>
     * <li>plain numerals are normalized (010.0540 -> 10.054), after removing
     * group separators and replacing the locale decimal separator by ".";</li>
     * <li>integer percentages are divided by 100 (50% -> 0.5);</li>
     * <li>fractions are evaluated with {@link #calc_and_sum_frac(String)};</li>
     * <li>ordinals: single tokens return the value stored in the ordinal
     * phraselist, tens+units and digits followed by an ordinal suffix return
     * "ord__" plus the number;</li>
     * <li>special group expressions (when the locale defines special_groups);</li>
     * <li>digits followed by a magnitude (20 million);</li>
     * <li>regular spelled numbers, read from the rightmost token to the left.</li>
     * </ul>
     * If the input is not normalizable (e.g., eight eight) the cleaned-up input
     * text is returned instead of a number. When the DEBUG system property is
     * "true" such a failure is printed and the JVM exits with status 1.
     * <p>
     * The optional phraselists (ordinal_suffixes, special_groups,
     * irregular_tens) are skipped when the locale does not provide them.
     *
     * @param snumber text to normalize
     * @return the normalized number as text, or the cleaned-up input when it
     * cannot be normalized
     */
    public String text2number(String snumber) {
        // UK  "and" hundreds/thousands/tens/units separator "five hundred and six"
        // UK/US  "-" is tens and units separator
        // UK magnitudes are singular. Plural (millions) is informal/indefinite (e.g., there were millions of people)
        // eleven hundred == 1100 (correct in "informal" English)
        final boolean debug = "true".equalsIgnoreCase(System.getProperty("DEBUG"));
        Number number = new Number(); // We cannot use Double because objects are not saved by reference
        try {

            // BASIC CLEANUP
            snumber = snumber.trim().toLowerCase(locale);

            // The tokens are taken BEFORE the separators are normalized below.
            // "-" is a separator too because space separation cannot be ensured.
            String[] elements = snumber.split("(\\s*-\\s*|\\s+" + delimiters.getRE() + "\\s+|\\s+)");

            //remove group separator
            if (group_separators != null) {
                snumber = snumber.replaceAll(group_separators.getRE(), "");
            }
            //replace decimal separators by . (standardize to en-US)
            if (decimal_point_separator != null) {
                snumber = snumber.replaceAll(decimal_point_separator.getRE(), ".");
            }

            // NORMALIZE NUMERIC INPUTS
            if (snumber.matches("(-)?[0-9]+(\\.[0-9]+)?")) {
                return prettyFormat(Double.parseDouble(snumber));
            }
            if (snumber.matches("(-)?[0-9]+%")) {
                return "" + (Double.parseDouble(snumber.substring(0, snumber.length() - 1)) / 100);
            }
            if (snumber.matches("([0-9]+ )?[0-9]+/[0-9]+")) {
                return "" + calc_and_sum_frac(snumber);
            }

            // ordinal_suffixes is optional: without it only bare digits are accepted as first element
            final String ordinalSuffixRE = (ordinal_suffixes != null) ? ordinal_suffixes.getRE() : null;
            final String firstNumericRE = (ordinalSuffixRE != null) ? "[0-9]+(" + ordinalSuffixRE + ")?" : "[0-9]+";

            // CHECK IF WE KNOW ALL THE ELEMENTS OF THE INPUT STRING
            // Only the first element may be numeric; every other one must be a known key.
            for (int idx = 0; idx < elements.length; idx++) {
                boolean known = (idx == 0 && elements[idx].matches(firstNumericRE)) || all_keys.contains(elements[idx]);
                if (!known) {
                    throw new Exception("Unknown element (" + idx + "): " + elements[idx] + " in " + snumber);
                }
            }

            // ordinals
            if (elements.length == 1 && ordinal_units.getMap().containsKey(elements[0])) {
                return "" + ordinal_units.getMapValue(elements[0]);
            }
            if (ordinal_irregular_tens != null && elements.length == 1 && ordinal_irregular_tens.getMap().containsKey(elements[0])) {
                return "" + ordinal_irregular_tens.getMapValue(elements[0]);
            }
            if (ordinal_tens != null && elements.length == 2 && ordinal_tens.getMap().containsKey(elements[0]) && ordinal_units.getMap().containsKey(elements[1])) {
                return "ord__" + (Integer.parseInt(ordinal_tens.getMapValue(elements[0]).replace("ord__", "")) + Integer.parseInt(ordinal_units.getMapValue(elements[1]).replace("ord__", "")));
            }
            if (ordinalSuffixRE != null && snumber.matches("[0-9]+\\s*" + ordinalSuffixRE)) {
                return "ord__" + snumber.replaceAll(ordinalSuffixRE, "");
            }

            // num group expression (special_groups and irregular_tens are optional phraselists)
            if (special_groups != null && snumber.matches(".*" + special_groups.getRE() + ".*")) {
                if (elements.length == 1 && special_groups.getMap().containsKey(elements[0])) {
                    return "" + special_groups.getMapValue(elements[0]);
                }
                if (elements.length == 2 && units.getMap().containsKey(elements[0]) && special_groups.getMap().containsKey(elements[1])) {
                    return "" + (Integer.parseInt(units.getMapValue(elements[0])) * Integer.parseInt(special_groups.getMapValue(elements[1])));
                }
                if (elements.length == 2 && irregular_tens != null && irregular_tens.getMap().containsKey(elements[0]) && special_groups.getMap().containsKey(elements[1])) {
                    return "" + (Integer.parseInt(irregular_tens.getMapValue(elements[0])) * Integer.parseInt(special_groups.getMapValue(elements[1])));
                }
                if (elements.length == 2 && tens.getMap().containsKey(elements[0]) && special_groups.getMap().containsKey(elements[1])) {
                    return "" + (Integer.parseInt(tens.getMapValue(elements[0])) * Integer.parseInt(special_groups.getMapValue(elements[1])));
                }
                if (elements.length == 2 && special_groups.getMap().containsKey(elements[0]) && magnitudes.getMap().containsKey(elements[1])) {
                    return "" + (Integer.parseInt(special_groups.getMapValue(elements[0])) * Integer.parseInt(magnitudes.getMapValue(elements[1])));
                }
                if (elements.length == 3 && units.getMap().containsKey(elements[0]) && special_groups.getMap().containsKey(elements[1]) && magnitudes.getMap().containsKey(elements[2])) {
                    return "" + (Integer.parseInt(units.getMapValue(elements[0])) * Integer.parseInt(special_groups.getMapValue(elements[1])) * Integer.parseInt(magnitudes.getMapValue(elements[2])));
                }
            }

            // NOTE: date-like spelled numbers (nineteen eighty == 1980) are not handled.

            // number + magnitude: 20 million
            if (elements[0].matches("([0-9]+|([0-9]*\\.[0-9]+))")) {
                if (elements.length != 2 && debug) {
                    System.err.println("(UNDER CONSTRUCTION) Only the number and the first magnitude will be normalized: " + snumber);
                }
                // The second component must be a magnitude; otherwise the input is not
                // normalizable and is returned unchanged through the catch block below.
                String magnText = (elements.length > 1) ? magnitudes.getMapValue(elements[1]) : null;
                if (magnText == null) {
                    throw new Exception("In [0-9]+ magnitude numbers the second component must be a valid magnitude. Found: " + (elements.length > 1 ? elements[1] : "nothing") + "   from: " + snumber);
                }
                // TODO check whether inputs like .5 work (decimals do not pass the known-elements check above)
                return prettyFormat(Double.parseDouble(elements[0]) * Integer.parseInt(magnText));
            }

            // regular spelled number
            // the number (Number) object increases while i moves from units to the highest magnitude-unit pair
            Integer i = elements.length - 1;
            while (i >= 0) {
                String element = elements[i].trim();
                String magnText = magnitudes.getMapValue(element);
                if (magnText != null) {
                    //operate magnitude
                    int magnitude = Integer.parseInt(magnText);
                    // Number of characters before the decimal point of the value accumulated so far.
                    // NOTE(review): Double.toString switches to scientific notation for large
                    // values, which makes this count 1 - confirm whether that matters here.
                    String valueText = number.value.toString();
                    int integerDigits = valueText.substring(0, valueText.lastIndexOf('.')).length();
                    if (magnitude <= ((int) Math.pow(10, integerDigits) - 1)) {
                        throw new Exception("Greater magnitude expected in " + element + " (" + snumber + ")");
                    }
                    i = operateMagnitude(elements, i, number);
                } else {
                    // operate number (only at rightest position)
                    if (i != (elements.length - 1)) {
                        // TODO no exception but warning...
                        throw new Exception("Unexpected number when looking for a magnitude. Found: " + element + " (" + snumber + ")");
                    }
                    i = getNumber(elements, i, number, MAX_NUMBERS_ORDER);
                }
            }
        } catch (Exception e) {
            if (debug) {
                System.err.println("Errors found (NUMEK):\n\t" + e.toString());
                e.printStackTrace(System.err);
                System.exit(1);
            }
            // we should not return partial values: give the (cleaned-up) input back
            return snumber;
        }

        // remove useless decimals
        return prettyFormat(number.value);
    }

    /**
     * Renders a number in plain decimal notation without useless trailing
     * zeros.
     * <p>
     * The number is first printed with seven decimals in the English locale
     * and then cleaned with cleanNumberFormat. This can lose precision for
     * very big or very small numbers, but it is much shorter than expanding
     * scientific notation by hand.
     *
     * @param numeknum number to render
     * @return the rendered number
     */
    private String prettyFormat(Double numeknum) {
        String fixedPoint = String.format(Locale.ENGLISH, "%1$.7f", numeknum);
        return cleanNumberFormat(fixedPoint);
    }

    /**
     * Removes the trailing zeros of a decimal number and, when nothing is
     * left after the decimal point, the point itself (20.0000000 -> 20).
     * Text without a "." is returned untouched.
     *
     * @param numeknum number rendered with "." as decimal point
     * @return the number without useless decimals
     */
    private String cleanNumberFormat(String numeknum) {
        if (numeknum.indexOf(".") < 0) {
            return numeknum;
        }
        // Two passes are needed: first the zeros, then a point left dangling at the end.
        String withoutTrailingZeros = numeknum.replaceAll("0*$", "");
        return withoutTrailingZeros.replaceAll("\\.$", "");
    }

    /**
     * Reads the number found at position i (and, for a units token, the tens
     * token that may precede it) and adds its value to the given holder.
     * <p>
     * With a maximum order of 10 or less only units are looked up. Otherwise
     * irregular tens, tens and units are tried in that order; after a units
     * match the previous element is also tried as tens.
     * <p>
     * Positions are compared as primitive ints: comparing the boxed Integer
     * values with == only works inside the Integer cache range.
     *
     * @param elements tokens of the number being normalized
     * @param i position of the token to read
     * @param number holder that receives the value
     * @param maxorder maximum order allowed; values above MAX_NUMBERS_ORDER are
     * lowered to it
     * @return the position left of the consumed token(s), or -1 when nothing
     * could be read (the error is printed to standard error)
     */
    private Integer getNumber(String[] elements, Integer i, Number number, Integer maxorder) {
        int next;
        try {
            final int start = i;
            final int order = Math.min(maxorder, MAX_NUMBERS_ORDER);
            if (order <= 0) {
                throw new Exception("Order not in correct range : " + order);
            }
            next = start;
            if (order <= 10) {
                next -= look4units(elements[start], number);
            } else {
                next = start - look4irregular_tens(elements[start], number);
                if (next == start) {
                    next = start - look4tens(elements[start], number);
                    if (next == start) {
                        next -= look4units(elements[start], number);
                        if (next == start) {
                            throw new Exception("Malformed number: " + elements[start]);
                        }
                        // units may be preceded by tens (e.g. twenty one)
                        if (start > 0) {
                            next -= look4tens(elements[start - 1], number);
                        }
                    }
                }
            }
            if (next == start) {
                throw new Exception("Malformed number, unexected (max order " + order + "): " + elements[start]);
            }
        } catch (Exception e) {
            System.err.println("Errors found (NUMEK):\n\t" + e.toString());
            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                e.printStackTrace(System.err);
            }
            return -1;
        }
        return next;
    }

    /**
     * Applies the magnitude found at position i to whatever precedes it
     * (numbers and lower magnitudes) and adds the result to the given holder.
     * A magnitude with nothing on its left counts once.
     * <p>
     * The restrictions phraselist is optional; when the locale does not
     * provide it the maximum magnitude allowed on the left is the magnitude
     * value minus one.
     *
     * @param elements tokens of the number being normalized
     * @param i position of the magnitude token
     * @param number holder that receives the value
     * @return the position of the next token to process (negative when the
     * tokens are exhausted), or -1 on error (printed to standard error)
     */
    private Integer operateMagnitude(String[] elements, Integer i, Number number) {
        try {
            int pos = i;
            Number magnumber = new Number();
            String magnitude = elements[pos].trim();
            // Validate before parsing so that a non-magnitude is reported as such
            if (!magnitudes.getMap().containsKey(magnitude)) {
                throw new Exception("Expected magnitude, found " + magnitude);
            }
            Integer magnvalue = Integer.parseInt(magnitudes.getMapValue(magnitude));
            Integer maxmagn = magnvalue - 1;

            if (restrictions != null && restrictions.getMap().containsKey(magnitude)) {
                maxmagn = Integer.parseInt(restrictions.getMapValue(magnitude));
            }

            pos--;

            if (pos >= 0) {
                Integer currentmagn = 0;
                while (pos >= 0) {
                    // if there is a number and it can be operated then operate it
                    if (!magnitudes.getMap().containsKey(elements[pos])) {
                        if (currentmagn > MAX_NUMBERS_ORDER) {
                            throw new Exception("Expected magnitude, found " + elements[pos]); //instead of this, normalize in n-parts six six six --> 6 6 6
                        }
                        pos = getNumber(elements, pos, magnumber, maxmagn);
                        // NOTE(review): derived from the outer number, not from magnumber - confirm this is intended
                        currentmagn = ((int) Math.pow(10, number.value.toString().substring(0, number.value.toString().lastIndexOf('.')).length()));
                    } else {
                        // If magnitudes can be expected for the current magnitude
                        if (Integer.parseInt(magnitudes.getMapValue(elements[pos])) < maxmagn) {
                            pos = operateMagnitude(elements, pos, magnumber);
                        } else {  // resolve the current magnitude and go back up to keep operating
                            //if no value for magnitude go by
                            if (magnumber.value == 0.0) {
                                magnumber.value = 1.0;
                            }
                            // finally operate whatever and break
                            number.value += magnumber.value * magnvalue;
                            magnumber.value = 0.0;
                            break;
                        }
                    }
                }
                if (magnumber.value != 0.0) {
                    number.value += magnumber.value * magnvalue;
                }
            } else {
                // nothing on the left: the magnitude counts once
                magnumber.value = 1.0;
                number.value += magnumber.value * magnvalue;
                pos--;
            }
            return pos;
        } catch (Exception e) {
            System.err.println("Errors found (NUMEK):\n\t" + e.toString());
            if ("true".equalsIgnoreCase(System.getProperty("DEBUG"))) {
                e.printStackTrace(System.err);
            }
            return -1;
        }
    }

    /**
     * Looks a token up in the units map and, when present, adds its numeric
     * value to the running total.
     *
     * @param element token to look up
     * @param number accumulator whose {@code value} is incremented on a hit
     * @return 1 if the token was found in the units map, 0 otherwise
     */
    private Integer look4units(String element, Number number) {
        final String mapped = units.getMapValue(element);
        if (mapped == null) {
            return 0;
        }
        number.value += Integer.parseInt(mapped);
        return 1;
    }

    /**
     * Looks a token up in the irregular-tens map and, when present, adds its
     * numeric value to the running total.
     *
     * @param element token to look up
     * @param number accumulator whose {@code value} is incremented on a hit
     * @return 1 if the token was found in the irregular-tens map, 0 otherwise
     */
    private Integer look4irregular_tens(String element, Number number) {
        final String mapped = irregular_tens.getMapValue(element);
        if (mapped == null) {
            return 0;
        }
        number.value += Integer.parseInt(mapped);
        return 1;
    }

    /**
     * Looks a token up in the tens map and, when present, adds its numeric
     * value to the running total.
     *
     * @param element token to look up
     * @param number accumulator whose {@code value} is incremented on a hit
     * @return 1 if the token was found in the tens map, 0 otherwise
     */
    private Integer look4tens(String element, Number number) {
        final String mapped = tens.getMapValue(element);
        if (mapped == null) {
            return 0;
        }
        number.value += Integer.parseInt(mapped);
        return 1;
    }


  
    /**
     * Returns an unambiguous semi-text|semi-pattern by rewriting every
     * ambiguous expression with its configured replacement.
     *
     * <p>Each key of the {@code ambiguous} map is used as a regular expression
     * and matched case-insensitively; every match is substituted by the value
     * mapped to that key. NOTE: ambiguous REs must not contain multi-word
     * replacements.
     *
     * @param pattern the pattern to disambiguate; may be {@code null}
     * @return the pattern with all ambiguous expressions replaced; the input is
     *         returned unchanged when it is {@code null}, when no ambiguity
     *         resource is loaded, or when nothing matches
     */
    public final String disambiguate(String pattern) {
        if (pattern == null || ambiguous == null) {
            return pattern;
        }
        String pat = pattern;

        // Cheap pre-check with the combined expression: skip the per-key loop
        // entirely when the pattern contains no ambiguous expression at all.
        Matcher any = Pattern.compile(ambiguous.getRE(), Pattern.CASE_INSENSITIVE).matcher(pat);
        if (!any.find()) {
            return pat;
        }

        for (String key : ambiguous.getMap().keySet()) {
            Matcher m = Pattern.compile(key, Pattern.CASE_INSENSITIVE).matcher(pat);
            if (m.find()) {
                // Replace through the same case-insensitive matcher used for
                // detection. String.replaceAll(key, ...) would be case-sensitive
                // and silently skip matches that differ only in case.
                // Matcher.replaceAll resets the matcher, so all occurrences are
                // replaced. (Author's note: in the resource file use TIgnore to
                // force decoding in a subsequent step.)
                pat = m.replaceAll(ambiguous.getMapValue(key));
            }
        }
        return pat;

    }
    
    
    
    /**
     * Obtains the Numek normalized text (NormText) and pattern from a given
     * text. Runs of numeric tokens (typed {@code c_card} or {@code c_ord}) are
     * collapsed into a single {@code v__<number>} token computed with
     * {@code text2number}; every other token is copied through unchanged.
     * Ambiguities in the pattern are resolved first via {@link #disambiguate}.
     *
     * <p>Text and pattern are split on a single space and walked in parallel by
     * index, so the pattern is expected to have at least as many tokens as the
     * text.
     *
     * <p>On any exception the error is printed to {@code System.err} and
     * {@code null} is returned; if the system property {@code DEBUG} is
     * {@code true} the stack trace is printed and the JVM exits with status 1.
     *
     * @param text input text, space-separated tokens
     * @param pattern pattern tokens aligned position-by-position with {@code text}
     * @return the feature-values as {@code normtext|pattern}, or {@code null} on error
     */
    public String getNormTextandPattern(String text, String pattern) {
        String normtext = "";
        try {            
            pattern=this.disambiguate(pattern);
            String[] textarr = text.trim().split(" ");
            String[] patternarr = pattern.trim().split(" ");
            pattern = ""; // reset: the output pattern is rebuilt token by token below

            // check Nums and magnitudes (e.g., one million or 25 hundred), if after [0-9] there is no spell leave as it is.
            String multitokenNum = "";  // tokens of the numeric run being accumulated; each one is appended with a leading space
            String multitokenType = ""; // can be c_card or c_ord
            String currentPat="";       // pattern assigned to the token currently being processed

            // Lookup each token in the text in order
            for (int i = 0; i < textarr.length; i++) {

                // Establish the current pattern
                // NOTE(review): patternarr is indexed with the text index; a pattern with fewer
                // tokens than the text throws here and the method returns null via the catch below.
                if (patternarr[i].startsWith("c_") || textarr[i].startsWith("v__")) {
                    currentPat = patternarr[i]; // if there is already a pattern keep it
                } else {
                    // cardinals or cardinal delimiters for initialized spelled nums:
                    // a delimiter only counts as cardinal while a purely spelled (digit-free) cardinal run is open
                    if (textarr[i].matches("([0-9]+(?:\\.[0-9]+)?|" + units.getRE() + "|" + tens.getRE() + "|" + irregular_tens.getRE() + "|" + magnitudes.getRE() + "|" + special_groups.getRE() + "|" + units.getRE() + "|" + tens.getRE() + "-" + units.getRE() + ")")
                            || (textarr[i].matches(delimiters.getRE()) && multitokenType.equals("c_card") && !multitokenNum.equals("") && !multitokenNum.matches(".*([0-9]).*"))) {
                        currentPat = "c_card";
                    } else {
                        // ordinals: digits + ordinal suffix, ordinal units, irregular ordinal tens, or ordinal tens optionally followed by "-unit"
                        if (textarr[i].matches("[0-9]+"+ordinal_suffixes.getRE()) || textarr[i].matches(ordinal_units.getRE()) || (ordinal_irregular_tens!=null && textarr[i].matches(ordinal_irregular_tens.getRE())) || (ordinal_tens!=null&&textarr[i].matches(ordinal_tens.getRE()+"(-"+ordinal_units.getRE()+")?"))) {
                            currentPat = "c_ord";
                        } else {
                            // not numeric: the token itself is used as its pattern
                            currentPat = textarr[i];
                        }
                    }
                }



                // check if a multitokenNum ends, if the current token/pattern cannot be combined with the current type.
                // The open run is flushed when: the type changes, the current token is a plain digit number,
                // the run matches ordinal_units, or text2number cannot merge run + current token
                // (i.e. it returns the joined input unchanged).
                // NOTE(review): multitokenNum always starts with a space (see the append below), so
                // multitokenNum.matches(ordinal_units.getRE()) presumably never succeeds unless that RE
                // tolerates leading whitespace - confirm.
                if (!multitokenNum.equals("")
                        && ((!currentPat.equals(multitokenType)
                        || textarr[i].matches("[0-9]+(?:\\.[0-9]+)?")
                        || multitokenNum.matches(ordinal_units.getRE()))
                        || (currentPat.equals(multitokenType) && (text2number(multitokenNum + " " + textarr[i])).equals(multitokenNum.trim() + " " + textarr[i])))) {
                    normtext += " v__" + text2number(multitokenNum.trim());
                    pattern += " " + multitokenType;
                    multitokenNum = ""; // initialize
                }
                // add to normTE or to spelled num.
                // Precedence: && binds tighter than ||, so this reads
                // same-type || (no type seen yet && token is c_card/c_ord).
                // NOTE(review): multitokenType is not cleared when a run is flushed, so once one numeric
                // type has been seen, a token of the other numeric type falls into the else branch and is
                // emitted verbatim - confirm this is intended.
                if (currentPat.equals(multitokenType) || multitokenType.equals("") && (currentPat.equals("c_card") ||currentPat.equals("c_ord"))) {
                    multitokenNum += " " + textarr[i];
                       multitokenType=currentPat;
                } else { // Month/Week could be replaced by a number BUT SINCE THERE ARE DIFFERENT INTERPRETATIONS it is better to leave them as string
                    normtext += " " + textarr[i];
                    pattern += " " + currentPat;
                }
            }

            // add last spellednum if exists
            if (!multitokenNum.equals("")) {
                normtext += " v__" + text2number(multitokenNum.trim());
                pattern += " " + multitokenType;
            }


        } catch (Exception e) {
            // Best-effort: report and return null; in DEBUG mode also dump the trace and terminate the JVM.
            System.err.println("Errors found:\n\t" + e.toString() + "\n");
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
                System.exit(1);
            }
            return null;
        }

        // Both parts were built with a leading space per token; trim before joining.
        return (normtext.trim() + "|" + pattern.trim());

    }

    /**
     * Counts how many times a character occurs in a string.
     *
     * @param source string to scan; may be {@code null}
     * @param pattern character to count
     * @return number of positions in {@code source} holding {@code pattern},
     *         or 0 when {@code source} is {@code null}
     */
    public static int countOccurrencesOf(String source, char pattern) {
        if (source == null) {
            return 0;
        }
        int total = 0;
        final int length = source.length();
        for (int pos = 0; pos < length; pos++) {
            if (source.charAt(pos) == pattern) {
                total++;
            }
        }
        return total;
    }

    /**
     * Returns the decimal representation of a Roman number. This function only
     * works until 3999 since greater numbers use non-ASCII chars (see
     * Wikipedia). Rules enforced: at most 3 consecutive equal chars, 5-like
     * symbols cannot be repeated, only powers of 10 can be used to subtract
     * (and only from the next two greater symbols).
     *
     * <p>Whitespace is removed and the input is upper-cased before validation.
     * Upper-casing uses {@code Locale.ROOT} so the result does not depend on
     * the JVM default locale (e.g. under a Turkish locale {@code "i"} would
     * otherwise become a dotted capital I and be rejected).
     *
     * @param roman the Roman numeral; may be {@code null}
     * @return the decimal value as a string; {@code "0"} when the input is
     *         {@code null} or blank; {@code null} when the input is not a valid
     *         Roman numeral (the reason is printed to {@code System.err})
     */
    public static String Roman2Decimal(String roman) {
        try {
            if (roman == null || roman.trim().length() == 0) {
                return "0";
            }
            int dec = 0;
            int ant = 0;            // value of the previous symbol
            char ant_letter = '\0'; // previous symbol
            roman = roman.trim().replaceAll("\\s+", "").toUpperCase(java.util.Locale.ROOT);
            // Reject foreign symbols, 4+ consecutive equal symbols, and any repeated 5-like symbol.
            if (!roman.matches("[" + romans + "]+") || roman.matches(".*(.)\\1{3,}.*") || roman.matches(".*([" + romans5 + "]).*\\1.*")) {
                throw new Exception("Invalid roman number: " + roman + ". Must only contain " + romans + " and not more than 3 consecutive equal chars are allowed, non-10 power numbers (" + romans5 + ") can only appear once. " + roman);
            }
            for (int i = 0; i < roman.length(); i++) {
                char letter = roman.charAt(i);
                int value = romansMap.get("" + letter);
                // Add optimistically; a subtractive pair is corrected below.
                dec = dec + value;
                // Interior symbol that is >= its predecessor and < its successor.
                if (i > 0 && roman.length() > 2 && i < roman.length() - 1 && (ant <= value && value < romansMap.get("" + roman.charAt(i + 1)))) {
                    throw new Exception("Two consecutive subtractions or more than one equal symbols used to subtract " + roman);
                }
                // Predecessor was subtracted from this symbol but the successor is >= that predecessor.
                if (i > 0 && roman.length() > 2 && i < roman.length() - 1 && (ant < value && ant <= romansMap.get("" + roman.charAt(i + 1)))) {
                    throw new Exception("Substracting and adding the same symbol or greater " + roman);
                }

                if (i != 0 && ant < value) { // no need to check if ant is 0 because it means substract nothing
                    // Only powers of ten (integral log10) may be subtracted.
                    double check5 = Math.log10(ant);
                    if (check5 != (int) check5) {
                        throw new Exception("Symbols powers of 5 cannot be used to substract: " + ant);
                    }
                    // The subtracted symbol must be at most two positions below the current one in
                    // `romans` (presumably ordered by ascending value - defined elsewhere in the file).
                    if (romans.indexOf(letter) - 2 > romans.indexOf(ant_letter)) {
                        throw new Exception("With " + ant_letter + " you can only substract " + romans.substring(romans.indexOf(ant_letter) + 1, romans.indexOf(ant_letter) + 3) + ". Incorrect: " + roman);
                    }
                    // ant was already added once in the previous iteration: undo that and subtract it.
                    dec = dec - ant * 2;
                }
                ant = value;
                ant_letter = letter;
            }
            return "" + dec;
        } catch (Exception e) {
            // Invalid numerals are reported, not propagated: callers get null.
            System.err.println("Errors found (NUMEK):\n\t" + e.toString());
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
            }
            return null;
        }

    }
}