package com.cognitionis.nlp_knowledge.numbers;
import com.cognitionis.nlp_files.PhraselistFile;
import com.cognitionis.utils_basickit.FileUtils;
import java.io.File;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Numek {
// this could probably be an abstract class
public Locale locale;
public HashSet<String> all_keys;
public HashSet<String> repeated_keys;
public PhraselistFile ambiguous;
// this could probably be a Map of phraselists (dynamic)
public PhraselistFile delimiters;
public PhraselistFile units;
public PhraselistFile tens;
public PhraselistFile irregular_tens;
public PhraselistFile magnitudes;
public PhraselistFile special_groups;
public PhraselistFile restrictions;
public PhraselistFile ordinal_units;
public PhraselistFile ordinal_irregular_tens;
public PhraselistFile ordinal_tens;
public PhraselistFile ordinal_suffixes;
public PhraselistFile decimal_point_separator;
public PhraselistFile group_separators;
//Number class is needed to store values objects by reference
private class Number {
public Double value;
public Number() {
value = 0.0;
}
public Number(Double v) {
value = v;
}
}
// Magnitude is whatever can have numbers or lower order magnitudes on the left
// A number is whatever distinct to magnitude that cannot be operated with magnitudes on the left
public final static Integer MAX_NUMBERS_ORDER = 99; // Explicit numbers (one token) MAX order of magnitude
// Roman numbers are static since they are not going to change
public static String romans = "IVXLCDM";
public static String romans5 = "VLD";
public final static HashMap<String, Integer> romansMap = new HashMap<>();
static {
romansMap.put("I", 1);
romansMap.put("V", 5);
romansMap.put("X", 10);
romansMap.put("L", 50);
romansMap.put("C", 100);
romansMap.put("D", 500);
romansMap.put("M", 1000);
}
public Numek() {
this(new Locale("en", "US")); // default language
}
public Numek(Locale l) {
this(l,"resources"); // default resources location
}
public Numek(Locale l, String resources_dir) {
locale = l;
String lang = l.toString().replace('_', '-');
String shortlang =lang.substring(0, 2);
String resource_separator=File.separator;
all_keys = new HashSet<>();
repeated_keys = new HashSet<>();
try {
String res_path=FileUtils.getResourcesPath(Numek.class, resources_dir + File.separator + "numbers" + File.separator);
if(res_path.contains("/")){
resource_separator="/";
}
if (!FileUtils.URL_exists(res_path + lang + resource_separator)) {
res_path = res_path + shortlang + resource_separator;
} else {
res_path = res_path + lang + resource_separator;
}
if (!FileUtils.URL_exists(res_path)) {
throw new Exception("Not-supported locale: " + lang + " nor " +shortlang);
} else {
// this can be done dynamically given .conf json file (requiring specific files...)
// I understand more why VLINGO is how it is
// This then could be a FOR loop
if (FileUtils.URL_exists(res_path + "ambiguous.phraselist")) {
ambiguous = new PhraselistFile(res_path + "ambiguous.phraselist",false, locale,true,true,true);
}
delimiters = new PhraselistFile(res_path + "delimiters.phraselist", false, locale,false,false,false);
all_keys.addAll(delimiters.keySet());
units = new PhraselistFile(res_path + "units.phraselist", false, locale,true,false,false);
repeated_keys.addAll(units.intersectPhraselist(all_keys));
all_keys.addAll(units.keySet());
tens = new PhraselistFile(res_path + "tens.phraselist", false, locale,true,false,false);
repeated_keys.addAll(tens.intersectPhraselist(all_keys));
all_keys.addAll(tens.keySet());
magnitudes = new PhraselistFile(res_path + "magnitudes.phraselist", false, locale,true,false,false);
repeated_keys.addAll(magnitudes.intersectPhraselist(all_keys));
all_keys.addAll(magnitudes.keySet());
decimal_point_separator = new PhraselistFile(res_path + "decimal_point_separator.phraselist", false, locale,false,false,false);
repeated_keys.addAll(decimal_point_separator.intersectPhraselist(all_keys));
all_keys.addAll(decimal_point_separator.keySet());
ordinal_units = new PhraselistFile(res_path + "ordinal_units.phraselist", false, locale,true,false,false);
repeated_keys.addAll(ordinal_units.intersectPhraselist(all_keys));
all_keys.addAll(ordinal_units.keySet());
if (FileUtils.URL_exists(res_path + "irregular_tens.phraselist")) {
irregular_tens = new PhraselistFile(res_path + "irregular_tens.phraselist", false, locale,true,false,false);
repeated_keys.addAll(irregular_tens.intersectPhraselist(all_keys));
all_keys.addAll(irregular_tens.keySet());
}
if (FileUtils.URL_exists(res_path + "special_groups.phraselist")) {
special_groups = new PhraselistFile(res_path + "special_groups.phraselist", false, locale,true,false,false);
repeated_keys.addAll(special_groups.intersectPhraselist(all_keys));
all_keys.addAll(special_groups.keySet());
}
if (FileUtils.URL_exists(res_path + "group_separators.phraselist")) {
group_separators = new PhraselistFile(res_path + "group_separators.phraselist", false, locale, false,false,false);
repeated_keys.addAll(group_separators.intersectPhraselist(all_keys));
all_keys.addAll(group_separators.keySet());
}
if (FileUtils.URL_exists(res_path + "ordinal_irregular_tens.phraselist")) {
ordinal_irregular_tens = new PhraselistFile(res_path + "ordinal_irregular_tens.phraselist", false, locale);
repeated_keys.addAll(ordinal_irregular_tens.intersectPhraselist(all_keys));
all_keys.addAll(ordinal_irregular_tens.keySet());
}
if (FileUtils.URL_exists(res_path + "ordinal_tens.phraselist")) {
ordinal_tens = new PhraselistFile(res_path + "ordinal_tens.phraselist", false, locale,true,false,false);
repeated_keys.addAll(ordinal_tens.intersectPhraselist(all_keys));
all_keys.addAll(ordinal_tens.keySet());
}
if (FileUtils.URL_exists(res_path + "ordinal_suffixes.phraselist")) {
ordinal_suffixes = new PhraselistFile(res_path + "ordinal_suffixes.phraselist", false, locale,false,false,false);
repeated_keys.addAll(ordinal_suffixes.intersectPhraselist(all_keys));
all_keys.addAll(ordinal_suffixes.keySet());
}
if (FileUtils.URL_exists(res_path + "restrictions.phraselist")) {
restrictions = new PhraselistFile(res_path + "restrictions.phraselist", false, locale,true,false,false); // do not count for ambiguity
}
if(ambiguous!=null){
for (String akey : ambiguous.keySet()) {
HashSet<String> temp_keys = new HashSet<>(repeated_keys);
for (String key : repeated_keys) {
if (akey.contains(key)) {
temp_keys.remove(key);
}
}
repeated_keys.clear();
repeated_keys.addAll(temp_keys);
}
}
if (!repeated_keys.isEmpty()) {
// there should be a way to check if there is an ambiguity variable (all must have a config.json file)
throw new Exception("This knowledge element has unhandled ambiguity: " + repeated_keys);
}
// part modifiers are not used yet because more resarch is needed... can be language dependent
// all_keys.contains(l);
}
} catch (Exception e) {
System.err.println("Errors found in " + this.getClass().getName() + ":\n\t" + e.toString());
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
}
}
}
/**
* Returns value of a fraction (e.g., 1/2 --> 0.5) or the sum of a number
* and a fraction (e.g., 2 1/2 --> 2.5).
*
* @param snumber string number (separator is whitespace " ")
*
* @return result
*/
public static Double calc_and_sum_frac(String snumber) {
if (snumber.matches("([0-9]+ )?[0-9]+/[0-9]+")) {
String[] temp;
if (snumber.contains(" ")) {
temp = snumber.split(" ");
return Double.parseDouble(temp[0]) + Double.parseDouble(temp[1].substring(0, temp[1].indexOf('/'))) / Double.parseDouble(temp[1].substring(temp[1].indexOf('/') + 1));
} else {
return Double.parseDouble(snumber.substring(0, snumber.indexOf('/'))) / Double.parseDouble(snumber.substring(snumber.indexOf('/') + 1));
}
} else {
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
System.err.println("Warining: error normalizing fraction (" + snumber + ") it has been set to 0.0 by default.");
}
return 0.0;
}
}
/**
* Converts a spelled number to numeric (twenty -> 20) If the input is
* numeric it is normalized. (010.0540 -> 10.054) If the input is not
* normalizable: eight eight, then the same text is returned
*
* @param snumber
* @return
*/
public String text2number(String snumber) {
// UK "and" hundreds/thousands/tens/units separator "five hundred and six"
// UK/US "-" is tens and units separator
// UK magnitudes are singular. Plural (milions) is informal/indefinite (e.g., there were millions of people)
// eleven hundred == 1100 (correct in "informal" English)
Number number = new Number(); // We cannot use Double because objects are not saved by reference
try {
// BASIC CLEANUP
snumber = snumber.trim().toLowerCase(locale); //.replaceAll("\\s+(st|nd|rd|th|o|a)$", "").replaceAll("ord__", "");
//snumber = snumber.replaceAll("^(?:a|an)\\s+((?:" + units.getRE() + "|" + tens.getRE() + "|" + irregular_tens.getRE() + ").*)$", "$1");
Integer magnitude = 0; // order of magnitude
Integer max_magn = MAX_NUMBERS_ORDER;
String[] elements = snumber.split("(\\s*-\\s*|\\s+" + delimiters.getRE() + "\\s+|\\s+)"); // we need to add - because we cannot ensure space separation
//System.out.println(snumber + " - numelems: "+elements.length);
//remove group separator
if (group_separators != null) {
snumber = snumber.replaceAll(group_separators.getRE(), "");
}
//replace decimal separators by . (standardize to en-US)
if (decimal_point_separator != null) {
snumber = snumber.replaceAll(decimal_point_separator.getRE(), ".");
}
// NORMALIZE NUMERIC INPUTS
if (snumber.matches("(-)?[0-9]+(\\.[0-9]+)?")) {
return prettyFormat(Double.parseDouble(snumber));
}
if (snumber.matches("(-)?[0-9]+%")) {
return "" + (Double.parseDouble(snumber.substring(0, snumber.length() - 1)) / 100);
}
if (snumber.matches("([0-9]+ )?[0-9]+/[0-9]+")) {
return "" + calc_and_sum_frac(snumber);
}
// CHECK IF WE KNOW ALL THE ELEMENTS OF THE INPUT STRING
for (int i = 0; i < elements.length; i++) {
//if (i == 0 && !elements[i].matches("[0-9]+") && !units.getMap().containsKey(elements[i]) && !tens.getMap().containsKey(elements[i]) && !irregular_tens.getMap().containsKey(elements[i]) && !magnitudes.getMap().containsKey(elements[i]) && !ordinal_units.getMap().containsKey(elements[i]) && !special_groups.getMap().containsKey(elements[i])) {
if (i == 0 && !elements[i].matches("[0-9]+("+ordinal_suffixes.getRE()+")?") && !all_keys.contains(elements[i])) {
throw new Exception("Unknown element (0): " + elements[i] + " in " + snumber);
}
//if (i != 0 && !units.getMap().containsKey(elements[i]) && !tens.getMap().containsKey(elements[i]) && !irregular_tens.getMap().containsKey(elements[i]) && !magnitudes.getMap().containsKey(elements[i]) && !special_groups.getMap().containsKey(elements[i])) {
if (i != 0 && !all_keys.contains(elements[i])) {
throw new Exception("Unknown element (" + i + "): " + elements[i] + " in " + snumber);
}
}
// ordinals
if (elements.length == 1 && ordinal_units.getMap().containsKey(elements[0])) {
return "" + ordinal_units.getMapValue(elements[0]);
}
if (ordinal_irregular_tens!=null && elements.length == 1 && ordinal_irregular_tens.getMap().containsKey(elements[0])) {
return "" + ordinal_irregular_tens.getMapValue(elements[0]);
}
if (ordinal_tens!=null && elements.length == 2 && ordinal_tens.getMap().containsKey(elements[0]) && ordinal_units.getMap().containsKey(elements[1])) {
return "ord__" + (Integer.parseInt(ordinal_tens.getMapValue(elements[0]).replace("ord__", ""))+Integer.parseInt(ordinal_units.getMapValue(elements[1]).replace("ord__", "")));
}
if(snumber.matches("[0-9]+\\s*"+ordinal_suffixes.getRE())){
return "ord__"+snumber.replaceAll(ordinal_suffixes.getRE(), "");
}
// num grup expression
if (snumber.matches(".*" + special_groups.getRE() + ".*")) {
if (elements.length == 1 && special_groups.getMap().containsKey(elements[0])) {
return "" + special_groups.getMapValue(elements[0]);
}
if (elements.length == 2 && units.getMap().containsKey(elements[0]) && special_groups.getMap().containsKey(elements[1])) {
return "" + (Integer.parseInt(units.getMapValue(elements[0])) * Integer.parseInt(special_groups.getMapValue(elements[1])));
}
if (elements.length == 2 && irregular_tens.getMap().containsKey(elements[0]) && special_groups.getMap().containsKey(elements[1])) {
return "" + (Integer.parseInt(irregular_tens.getMapValue(elements[0])) * Integer.parseInt(special_groups.getMapValue(elements[1])));
}
if (elements.length == 2 && tens.getMap().containsKey(elements[0]) && special_groups.getMap().containsKey(elements[1])) {
return "" + (Integer.parseInt(tens.getMapValue(elements[0])) * Integer.parseInt(special_groups.getMapValue(elements[1])));
}
if (elements.length == 2 && special_groups.getMap().containsKey(elements[0]) && magnitudes.getMap().containsKey(elements[1])) {
return "" + (Integer.parseInt(special_groups.getMapValue(elements[0])) * Integer.parseInt(magnitudes.getMapValue(elements[1])));
}
if (elements.length == 3 && units.getMap().containsKey(elements[0]) && special_groups.getMap().containsKey(elements[1]) && magnitudes.getMap().containsKey(elements[2])) {
return "" + (Integer.parseInt(units.getMapValue(elements[0])) * Integer.parseInt(special_groups.getMapValue(elements[1])) * Integer.parseInt(magnitudes.getMapValue(elements[2])));
}
}
// date spelled number (nineteen eighty == 1980, 2010, 1919,2081,1981). Specific to English?
/*if (locale.getLanguage().equals("en") && (elements.length == 2 || elements.length == 3) && (irregular_tens.getMap().containsKey(elements[0]) || tens.getMap().containsKey(elements[0])) && !magnitudes.getMap().containsKey(elements[1]) && (tens.getMap().containsKey(elements[1]) || irregular_tens.getMap().containsKey(elements[1]))) {
Integer value = 0;
if (irregular_tens.getMap().containsKey(elements[0])) {
value = Integer.parseInt(irregular_tens.getMapValue(elements[0])) * 100;
} else {
value = Integer.parseInt(tens.getMapValue(elements[0])) * 100;
}
if (irregular_tens.getMap().containsKey(elements[1])) {
value += Integer.parseInt(irregular_tens.getMapValue(elements[1]));
} else {
value += Integer.parseInt(tens.getMapValue(elements[1]));
}
if (elements.length == 3 && units.getMap().containsKey(elements[2])) {
value += Integer.parseInt(units.getMapValue(elements[2]));
}
return "" + value;
}*/
// number + magnitude: 20 million
if (elements[0].matches("([0-9]+|([0-9]*.[0-9]+))")) {
if (elements.length != 2) {
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
System.err.println("(UNDER CONSTRUCTION) Only the number and the first magnitude will be normalized: " + snumber);
}
}
Integer magn = Integer.parseInt(magnitudes.getMapValue(elements[1]));
if (magn == null) {
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
System.err.println("In [0-9]+ magnitude numbers the second component must be a valid magnitude. Found: " + elements[1] + " from: " + snumber);
}
magn = 1;
}
// TODO provar a ver si va .5
return prettyFormat((Double.parseDouble(elements[0]) * magn));
} else { // regular spelled number
// the number (Number) object increases while i increases from units to the highest magnitude-unit pair
Integer i = elements.length - 1; // i means the analyzed position from units to the highest magnitude-unit pair
while (i >= 0) {
String element = elements[i].trim();
magnitude = null;
if (magnitudes.getMapValue(element) != null) {
magnitude = Integer.parseInt(magnitudes.getMapValue(element));
}
//System.err.println("i=" + i + " - " + element);
if (magnitude != null) {
//operate magnitude
if (magnitude <= (((int) Math.pow(10, number.value.toString().substring(0, number.value.toString().lastIndexOf('.')).length()) - 1))) {
throw new Exception("Greater magnitude expected in " + element + " (" + snumber + ")");
}
i = operateMagnitude(elements, i, number);
} else {
// operate number (only at rightest position)
if (i != (elements.length - 1)) {
// TODO no exception but warning...,
throw new Exception("Unexpected number when looking for a magnitude. Found: " + element + " (" + snumber + ")");
}
i = getNumber(elements, i, number, max_magn);
}
}
}
} catch (Exception e) {
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
System.err.println("Errors found (NUMEK):\n\t" + e.toString());
e.printStackTrace(System.err);
System.exit(1);
}
//return snumber.replaceAll(" ", "-"); // not null nor textual string because it breaks the application
/*if (number.value != null && number.value > 0) {
return prettyFormat(number.value);
} else {*/ // we should not return partial values
return snumber;
//}
}
// remove useless decimals
return prettyFormat(number.value);
}
private String prettyFormat(Double numeknum) {
//This is the correct use but will break for very big/small numbers, we can loose precision, but is shorter than the alternative
return cleanNumberFormat(String.format(Locale.ENGLISH, "%1$.7f", numeknum));
/*if (numeknum < 0) {
return "-" + prettyFormat(-numeknum);
}
String snumeknum = String.valueOf(numeknum);
int indexOfE = snumeknum.indexOf("E");
if (indexOfE == -1) {
return snumeknum;
}
StringBuilder sb = new StringBuilder();
if (numeknum > 1) {//big number
int exp = Integer.parseInt(snumeknum.substring(indexOfE + 1));
String sciDecimal = snumeknum.substring(2, indexOfE);
int sciDecimalLength = sciDecimal.length();
if (exp == sciDecimalLength) {
sb.append(snumeknum.charAt(0));
sb.append(sciDecimal);
} else if (exp > sciDecimalLength) {
sb.append(snumeknum.charAt(0));
sb.append(sciDecimal);
for (int i = 0; i < exp - sciDecimalLength; i++) {
sb.append('0');
}
} else if (exp < sciDecimalLength) {
sb.append(snumeknum.charAt(0));
sb.append(sciDecimal.substring(0, exp));
sb.append('.');
for (int i = exp; i < sciDecimalLength; i++) {
sb.append(sciDecimal.charAt(i));
}
}
return sb.toString();
} else { //for little numbers use the default or you will loose accuracy
return snumeknum;
}*/
}
private String cleanNumberFormat(String numeknum) {
//numeknum=numeknum.replace(",", ""); // not needed since we specify it in string format
// The pure would be use DecimalFormat in pretty format but...
//if (numeknum.matches(".*\\.(0)+")) { numeknum = numeknum.substring(0, numeknum.lastIndexOf('.')); }
numeknum = numeknum.indexOf(".") < 0 ? numeknum : numeknum.replaceAll("0*$", "").replaceAll("\\.$", ""); // 2 replace all are needed
return numeknum;
}
private Integer getNumber(String[] elements, Integer i, Number number, Integer maxorder) {
Integer ret = i;
try {
if (maxorder > MAX_NUMBERS_ORDER) {
maxorder = MAX_NUMBERS_ORDER;
}
if (maxorder <= 0) {
throw new Exception("Order not in correct range : " + maxorder);
}
if (maxorder <= 10) {
ret = ret - look4units(elements[i], number);
}
if (maxorder > 10) {
ret = i - look4irregular_tens(elements[i], number);
if (ret == i) {
ret = i - look4tens(elements[i], number);
if (ret == i) {
ret = ret - look4units(elements[i], number);
if (ret == i) {
throw new Exception("Malformed number: " + elements[i]);
}
if (i > 0) {
ret = ret - look4tens(elements[i - 1], number);
}
}
}
}
if (ret == i) {
throw new Exception("Malformed number, unexected (max order " + maxorder + "): " + elements[i]);
}
} catch (Exception e) {
System.err.println("Errors found (NUMEK):\n\t" + e.toString());
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
//System.exit(1);
}
return -1;
}
return ret;
}
private Integer operateMagnitude(String[] elements, Integer i, Number number) {
try {
Number magnumber = new Number();
//Double magnumber = 0.0;
String magnitude = elements[i].trim();
Integer magnvalue = Integer.parseInt(magnitudes.getMapValue(magnitude));
if (!magnitudes.getMap().containsKey(magnitude)) {
throw new Exception("Expected magnitude, found " + magnitude);
}
//System.err.println("Magnitude: " + magnitude);
Integer maxmagn = magnvalue - 1;
if (restrictions.getMap().containsKey(magnitude)) {
maxmagn = Integer.parseInt(restrictions.getMapValue(magnitude));
}
i--;
if (i >= 0) {
Integer currentmagn = 0;
while (i >= 0) {
// if there is a number and it can be operated it then operate it
if (!magnitudes.getMap().containsKey(elements[i])) {
if (currentmagn > MAX_NUMBERS_ORDER) {
throw new Exception("Expected magnitude, found " + elements[i]); //instead of this, normalize in n-parts six six six --> 6 6 6
}
i = getNumber(elements, i, magnumber, maxmagn);
currentmagn = ((int) Math.pow(10, number.value.toString().substring(0, number.value.toString().lastIndexOf('.')).length()));
} else {
// If magnitudes can be eelements[i]xpected for the current magnitude
if (Integer.parseInt(magnitudes.getMapValue(elements[i])) < maxmagn) {
i = operateMagnitude(elements, i, magnumber);
} else { // resolver magnitud present y subir arriba para seguir operando
//if no value for magnitude go by
if (magnumber.value == 0L) {
magnumber.value = 1.0;
}
// finally operate whatever and break
number.value += magnumber.value * magnvalue;
magnumber.value = 0.0;
break;
}
}
}
if (magnumber.value != 0L) {
number.value += magnumber.value * magnvalue;
}
} else {
magnumber.value = 1.0;
number.value += magnumber.value * magnvalue;
i--;
}
} catch (Exception e) {
System.err.println("Errors found (NUMEK):\n\t" + e.toString());
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
//System.exit(1);
}
return -1;
}
return i;
}
private Integer look4units(String element, Number number) {
if (units.getMapValue(element) != null) {
number.value += Integer.parseInt(units.getMapValue(element));
return 1;
}
return 0;
}
private Integer look4irregular_tens(String element, Number number) {
if (irregular_tens.getMapValue(element) != null) {
number.value += Integer.parseInt(irregular_tens.getMapValue(element));
return 1;
}
return 0;
}
private Integer look4tens(String element, Number number) {
if (tens.getMapValue(element) != null) {
number.value += Integer.parseInt(tens.getMapValue(element));
return 1;
}
return 0;
}
/**
* Returns an unambiguous semi-text|semi-pattern NOTE: Ambiguous RE must not
* contain multi-word replacements
*
* @param text
*
* @return String: unambiguous text (numbers)
*/
public final String disambiguate(String pattern) {
String pat = pattern;
if (ambiguous != null) {
Pattern pa = Pattern.compile(ambiguous.getRE(), Pattern.CASE_INSENSITIVE);
Matcher ma = pa.matcher(pat);
if (ma.find()) {
for (String key : ambiguous.getMap().keySet()) {
Pattern ambig = Pattern.compile(key, Pattern.CASE_INSENSITIVE);
Matcher m = ambig.matcher(pat);
if (m.find()) {
pat = pat.replaceAll(key, ambiguous.getMapValue(key)); // in the file use TIgnore to force decoding in a subsequent step
// NO NEED TO REPLACE TEXT (because value can merge multi tokens
//THE m.group has to be normalized
}
}
}
}
return pat;
}
/**
* Obtains the Numek normalized text (NormText) and Patter from a given text
* (c_card and c_ord). Ambiguities must be pre-resolved
*
* @param text input text (by default it is case insensitive)
* @param pattern
* @return the feature-values (i.e., normtext|pattern)
*/
public String getNormTextandPattern(String text, String pattern) {
String normtext = "";
try {
pattern=this.disambiguate(pattern);
String[] textarr = text.trim().split(" ");
String[] patternarr = pattern.trim().split(" ");
pattern = ""; // reset to build
// check Nums and magnitudes (e.g., one million or 25 hundred), if after [0-9] there is no spell leave as it is.
String multitokenNum = "";
String multitokenType = ""; // can be c_card or c_ord
String currentPat="";
// Lookup each token in the text in order
for (int i = 0; i < textarr.length; i++) {
// Establish the current pattern
if (patternarr[i].startsWith("c_") || textarr[i].startsWith("v__")) {
currentPat = patternarr[i]; // if there is already a pattern keep it
} else {
// cardinals or cardinal delimiters for initialized spelled nums
if (textarr[i].matches("([0-9]+(?:\\.[0-9]+)?|" + units.getRE() + "|" + tens.getRE() + "|" + irregular_tens.getRE() + "|" + magnitudes.getRE() + "|" + special_groups.getRE() + "|" + units.getRE() + "|" + tens.getRE() + "-" + units.getRE() + ")")
|| (textarr[i].matches(delimiters.getRE()) && multitokenType.equals("c_card") && !multitokenNum.equals("") && !multitokenNum.matches(".*([0-9]).*"))) {
currentPat = "c_card";
} else {
if (textarr[i].matches("[0-9]+"+ordinal_suffixes.getRE()) || textarr[i].matches(ordinal_units.getRE()) || (ordinal_irregular_tens!=null && textarr[i].matches(ordinal_irregular_tens.getRE())) || (ordinal_tens!=null&&textarr[i].matches(ordinal_tens.getRE()+"(-"+ordinal_units.getRE()+")?"))) {
currentPat = "c_ord";
} else {
currentPat = textarr[i];
}
}
}
// check if a multitokenNum ends, if the current token/pattern cannot be combined with the current type
if (!multitokenNum.equals("")
&& ((!currentPat.equals(multitokenType)
|| textarr[i].matches("[0-9]+(?:\\.[0-9]+)?")
|| multitokenNum.matches(ordinal_units.getRE()))
|| (currentPat.equals(multitokenType) && (text2number(multitokenNum + " " + textarr[i])).equals(multitokenNum.trim() + " " + textarr[i])))) {
normtext += " v__" + text2number(multitokenNum.trim());
pattern += " " + multitokenType;
multitokenNum = ""; // initialize
}
// add to normTE or to spelled num
if (currentPat.equals(multitokenType) || multitokenType.equals("") && (currentPat.equals("c_card") ||currentPat.equals("c_ord"))) {
multitokenNum += " " + textarr[i];
multitokenType=currentPat;
} else { // Month/Week could be replaced by a number BUT SINCE THERE ARE DIFFERENT INTERPRETATIONS it is better to leave them as string
normtext += " " + textarr[i];
pattern += " " + currentPat;
}
}
// add last spellednum if exists
if (!multitokenNum.equals("")) {
normtext += " v__" + text2number(multitokenNum.trim());
pattern += " " + multitokenType;
}
} catch (Exception e) {
System.err.println("Errors found:\n\t" + e.toString() + "\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
return null;
}
return (normtext.trim() + "|" + pattern.trim());
}
/**
* Counts how many times a char (pattern) appears in a string (source)
*
* @param source
* @param pattern
* @return
*/
public static int countOccurrencesOf(String source, char pattern) {
int count = 0;
if (source != null) {
int found = -1;
int start = 0;
while ((found = source.indexOf(pattern, start)) != -1) {
start = found + 1;
count++;
}
return count;
} else {
return 0;
}
}
/**
* Returns the decimal representation of a Roman number This function only
* works until 3999 since greater numbers use non-ASCII chars (See
* Wikipedia) Rules: Only 3 consecutive chars, 5-like can not be repeated,
* only multiples of 10 can be used to subtract (and must subtract only the
* next two greater values) ...
*
* @param roman
* @return
*/
public static String Roman2Decimal(String roman) {
try {
int dec = 0;
int ant = 0;
char ant_letter = '\0';
if (roman == null || roman.trim().length() == 0) {
return "0";
}
roman = roman.trim().replaceAll("\\s+", "").toUpperCase();
if (!roman.matches("[" + romans + "]+") || roman.matches(".*(.)\\1{3,}.*") || roman.matches(".*([" + romans5 + "]).*\\1.*")) {
throw new Exception("Invalid roman number: " + roman + ". Must only contain " + romans + " and not more than 3 consecutive equal chars are allowed, non-10 power numbers (" + romans5 + ") can only appear once. " + roman);
}
for (int i = 0; i < roman.length(); i++) {
char letter = roman.charAt(i);
int value = romansMap.get("" + letter);
dec = dec + value;
if (i > 0 && roman.length() > 2 && i < roman.length() - 1 && (ant <= value && value < romansMap.get("" + roman.charAt(i + 1)))) {
throw new Exception("Two consecutive subtractions or more than one equal symbols used to subtract " + roman);
}
if (i > 0 && roman.length() > 2 && i < roman.length() - 1 && (ant < value && ant <= romansMap.get("" + roman.charAt(i + 1)))) {
throw new Exception("Substracting and adding the same symbol or greater " + roman);
}
if (i != 0 && ant < value) { // no need to check if ant is 0 because it means substract nothing
double check5 = Math.log10(ant);
if (i != 0 && check5 != (int) check5) {
throw new Exception("Symbols powers of 5 cannot be used to substract: " + ant);
}
if (romans.indexOf(letter) - 2 > romans.indexOf(ant_letter)) {
throw new Exception("With " + ant_letter + " you can only substract " + romans.substring(romans.indexOf(ant_letter) + 1, romans.indexOf(ant_letter) + 3) + ". Incorrect: " + roman);
}
dec = dec - ant * 2;
}
ant = value;
ant_letter = letter;
}
return "" + dec;
} catch (Exception e) {
System.err.println("Errors found (NUMEK):\n\t" + e.toString());
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
}
return null;
}
}
}