package hu.u_szeged.nlp.pos.guesser; /** * Developed by: * Research Group on Artificial Intelligence of the Hungarian Academy of Sciences * http://www.inf.u-szeged.hu/rgai/ * * Contact: * János Zsibrita * zsibrita@inf.u-szeged.hu * * Licensed by Creative Commons Attribution Share Alike * * http://creativecommons.org/licenses/by-sa/3.0/legalcode */ import hu.u_szeged.nlp.pos.MagyarlancResourceHolder; import hu.u_szeged.nlp.pos.MorAna; import hu.u_szeged.nlp.pos.Util; import java.util.HashSet; import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Minden számmal kezdődő token elemzését a NumberGuesser osztály végzi, reguláris kifeje-zések segítségével. Egy * szólakhoz több elemzés is tartozhat. Egy számmal kezdődő token lehet főnév (N) (pl.: 386-os@Nc-sn), melléknév * (pl.: 16-ai@Afp-sn), számnév (pl. 5.@Mo-snd) vagy nyílt tokenosztályba tartozó (pl.: 20%@Onp-sn). */ public class NumberGuesser { // main number pattern private final static Pattern PATTERN_0 = Pattern.compile("[0-9]+.*"); // 1-es 1.3-as 1,5-ös 1/6-os 16-17-es [Afp-sn, Nn-sn] private static Pattern PATTERN_1 = Pattern.compile("([0-9]+[0-9\\.,%-/]*-(as|es|os|ös)+)([a-zA-Záéíóöőúüű]*)"); // 16-i private static Pattern PATTERN_2 = Pattern.compile("[0-9]+[0-9\\.,-/]*-i"); // 16-(ai/ei/jei) private static Pattern PATTERN_3 = Pattern.compile("([0-9]+-(ai|ei|jei)+)([a-zA-Záéíóöőúüű]*)"); // +12345 private static Pattern PATTERN_4 = Pattern.compile("([\\+|\\-]{1}[0-9]+[0-9\\.,-/]*)-??([a-zA-Záéíóöőúüű]*)"); // 12345-12345 private static Pattern PATTERN_5 = Pattern.compile("([0-9]+-[0-9]+)-??([a-zA-Záéíóöőúüű]*)"); // 12:30 12.30 Ont-sn private static Pattern PATTERN_6 = Pattern.compile("(([0-9]{1,2})[\\.:]({1}[0-9]{2}))-??([a-zA-Záéíóöőúüű]*)"); // 123,45-12345 private static Pattern PATTERN_7 = Pattern.compile("([0-9]+,[0-9]+-[0-9]+)-??([a-zA-Záéíóöőúüű]*)"); // 12345-12345,12345 private static final Pattern PATTERN_8 = Pattern.compile("([0-9]+-[0-9]+,[0-9]+)-??([a-zA-Záéíóöőúüű]*)"); // 12345,12345-12345,12345 private static final Pattern PATTERN_9 = Pattern.compile("([0-9]+,[0-9]+-[0-9]+,[0-9]+)-??([a-zA-Záéíóöőúüű]*)"); // 12345.12345,12345 private static final Pattern PATTERN_10 = Pattern.compile("([0-9]+\\.[0-9]+,[0-9]+)-??([a-zA-Záéíóöőúüű]*)"); // 10:30 private static final Pattern PATTERN_11 = Pattern.compile("([0-9]+:[0-9]+)-??([a-zA-Záéíóöőúüű]*)"); // 12345.12345.1234-. private static final Pattern PATTERN_12 = Pattern.compile("([0-9]+\\.[0-9]+[0-9\\.]*)-??([a-zA-Záéíóöőúüű]*)"); // 12,3-nak private static final Pattern PATTERN_13 = Pattern.compile("([0-9]+,[0-9]+)-??([a-zA-Záéíóöőúüű]*)"); // 20-nak private static final Pattern PATTERN_14 = Pattern.compile("([0-9]+)-??([a-zA-Záéíóöőúüű]*)"); // 20. private static final Pattern PATTERN_15 = Pattern.compile("(([0-9]+-??[0-9]*)\\.)-??([a-zA-Záéíóöőúüű]*)"); // 16-áig private static final Pattern PATTERN_16 = Pattern.compile("(([0-9]{1,2})-(á|é|jé))([a-zA-Záéíóöőúüű]*)"); // 16-a private static final Pattern PATTERN_17 = Pattern.compile("(([0-9]{1,2})-(a|e|je))()"); // 50% private static final Pattern PATTERN_18 = Pattern.compile("([0-9]+,??[0-9]*%)-??([a-zA-Záéíóöőúüű]*)"); private static String nounToNumeral(String nounMsd, String numeralMsd) { StringBuffer msd = null; msd = new StringBuffer(numeralMsd); // szam if (nounMsd.length() > 3) msd.setCharAt(3, nounMsd.charAt(3)); // eset if (nounMsd.length() > 4) msd.setCharAt(4, nounMsd.charAt(4)); // birtokos szama if (nounMsd.length() > 8) msd.setCharAt(10, nounMsd.charAt(8)); // birtokos szemelye if (nounMsd.length() > 9) msd.setCharAt(11, nounMsd.charAt(9)); // birtok(olt) szama if (nounMsd.length() > 10) msd.setCharAt(12, nounMsd.charAt(10)); return MagyarlancResourceHolder.getKRToMSD().cleanMsd(msd.toString()); } private static String nounToOther(String nounMsd, String otherMsd) { StringBuffer msd = null; msd = new StringBuffer(otherMsd); // szam if (nounMsd.length() > 3) msd.setCharAt(4, nounMsd.charAt(3)); // eset if (nounMsd.length() > 4) msd.setCharAt(5, nounMsd.charAt(4)); // birtokos szama if (nounMsd.length() > 8) msd.setCharAt(9, nounMsd.charAt(8)); // birtokos szemelye if (nounMsd.length() > 9) msd.setCharAt(10, nounMsd.charAt(9)); // birtok(olt) szama if (nounMsd.length() > 10) msd.setCharAt(11, nounMsd.charAt(10)); return MagyarlancResourceHolder.getKRToMSD().cleanMsd(msd.toString()); } private static String nounToNoun(String nounMsd, String otherMsd) { StringBuffer msd = null; msd = new StringBuffer(otherMsd); // szam if (nounMsd.length() > 3) msd.setCharAt(3, nounMsd.charAt(3)); // eset if (nounMsd.length() > 4) msd.setCharAt(4, nounMsd.charAt(4)); return MagyarlancResourceHolder.getKRToMSD().cleanMsd(msd.toString()); } private static int romanToArabic(String romanNumber) { char romanChars[] = { 'I', 'V', 'X', 'L', 'C', 'D', 'M' }; int arabicNumbers[] = { 1, 5, 10, 50, 100, 500, 1000 }; int temp[] = new int[20]; int sum = 0; for (int i = 0; i < romanNumber.toCharArray().length; i++) { for (int j = 0; j < romanChars.length; j++) { if (romanNumber.charAt(i) == romanChars[j]) { temp[i] = arabicNumbers[j]; } } } for (int i = 0; i < temp.length; i++) { if (i == temp.length - 1) { sum += temp[i]; } else { if (temp[i] < temp[i + 1]) { sum += (temp[i + 1] - temp[i]); i++; } else { sum += temp[i]; } } } return sum; } /** * számmal kezdődő token elemzése * * @param number * egy (számmal kezdődő) String * @return lehetséges elemzéseket (lemma-msd párok) */ public static Set<MorAna> guess(String number) { Matcher matcher = null; Set<MorAna> stemSet = null; stemSet = new TreeSet<MorAna>(); String root = null; String suffix = null; // base number pattern matcher = PATTERN_0.matcher(number); if (!matcher.matches()) { return stemSet; } matcher = PATTERN_1.matcher(number); if (matcher.matches()) { root = matcher.group(1); // group 3!!! // 386-osok (386-(os))(ok) suffix = matcher.group(3); if (suffix.length() > 0) for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, stem.getMsd())); stemSet.add(new MorAna(root, stem.getMsd().replace("Nc", "Afp"))); } if (stemSet.size() == 0) { stemSet.add(new MorAna(matcher.group(1), "Afp-sn")); stemSet.add(new MorAna(matcher.group(1), "Nc-sn")); } return stemSet; } // 16-i matcher = PATTERN_2.matcher(number); if (matcher.matches()) { stemSet.add(new MorAna(number, "Afp-sn")); stemSet.add(new MorAna(number, "Onf-sn")); return stemSet; } // 16-(ai/ei/1-jei) matcher = PATTERN_3.matcher(number); if (matcher.matches()) { root = matcher.group(1); suffix = matcher.group(3); if (suffix.length() > 0) for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, "Afp-" + stem.getMsd().substring(3))); } if (stemSet.size() == 0) { stemSet.add(new MorAna(matcher.group(1), "Afp-sn")); } return stemSet; } // +/-12345 matcher = PATTERN_4.matcher(number); if (matcher.matches()) { root = matcher.group(1); suffix = matcher.group(2); if (suffix.length() > 0) for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, nounToOther(stem.getMsd(), "Ons----------"))); } if (stemSet.size() == 0) { stemSet.add(new MorAna(number, "Ons-sn")); } return stemSet; } // 12:30 12.30 Ont-sn matcher = PATTERN_6.matcher(number); if (matcher.matches()) { if (Integer.parseInt(matcher.group(2)) < 24 && Integer.parseInt(matcher.group(3)) < 60) { root = matcher.group(1); suffix = matcher.group(4); if (suffix.length() > 0) for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, nounToOther(stem.getMsd(), "Ont---------"))); } if (stemSet.size() == 0) { stemSet.add(new MorAna(number, "Ont-sn")); } } } // 12345-12345-* matcher = PATTERN_5.matcher(number); if (matcher.matches()) { root = matcher.group(1); suffix = matcher.group(2); if (suffix.length() > 0) for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, nounToOther(stem.getMsd(), "Onr---------"))); stemSet.add(new MorAna(root, nounToOther(stem.getMsd(), "Onf----------"))); stemSet.add(new MorAna(root, nounToNumeral(stem.getMsd(), "Mc---d-------"))); } if (stemSet.size() == 0) { stemSet.add(new MorAna(number, "Onr-sn")); stemSet.add(new MorAna(number, "Onf-sn")); stemSet.add(new MorAna(number, "Mc-snd")); } return stemSet; } // 12345,12345-12345,12345-* // 12345-12345,12345-* // 12345,12345-12345-* matcher = PATTERN_7.matcher(number); if (!matcher.matches()) { matcher = PATTERN_8.matcher(number); } if (!matcher.matches()) { matcher = PATTERN_9.matcher(number); } if (matcher.matches()) { root = matcher.group(1); suffix = matcher.group(2); if (suffix.length() > 0) for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, nounToNumeral(stem.getMsd(), "Mf---d-------"))); } if (stemSet.size() == 0) { stemSet.add(new MorAna(number, "Mf-snd")); } return stemSet; } // 12345.12345,12345 matcher = PATTERN_10.matcher(number); if (matcher.matches()) { root = matcher.group(1); suffix = matcher.group(2); if (suffix.length() > 0) for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, nounToOther(stem.getMsd(), "Ond---------"))); } if (stemSet.size() == 0) { stemSet.add(new MorAna(number, "Ond-sn")); } return stemSet; } // 10:30-* matcher = PATTERN_11.matcher(number); if (matcher.matches()) { root = matcher.group(1); suffix = matcher.group(2); if (suffix.length() > 0) { for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, nounToOther(stem.getMsd(), "Onf---------"))); stemSet.add(new MorAna(root, nounToOther(stem.getMsd(), "Onq---------"))); stemSet.add(new MorAna(root, nounToOther(stem.getMsd(), "Onr---------"))); } } if (stemSet.size() == 0) { stemSet.add(new MorAna(number, "Onf-sn")); stemSet.add(new MorAna(number, "Onq-sn")); stemSet.add(new MorAna(number, "Onr-sn")); } return stemSet; } // 12345.12345.1234-. matcher = PATTERN_12.matcher(number); if (matcher.matches()) { root = matcher.group(1); suffix = matcher.group(2); if (suffix.length() > 0) { for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, nounToOther(stem.getMsd(), "Oi----------"))); stemSet.add(new MorAna(root, nounToOther(stem.getMsd(), "Ond---------"))); } } if (stemSet.size() == 0) { stemSet.add(new MorAna(number, "Oi--sn")); stemSet.add(new MorAna(number, "Ond-sn")); } return stemSet; } // 16-a 17-e 16-áig 17-éig 1-je 1-jéig matcher = PATTERN_16.matcher(number); if (!matcher.matches()) { matcher = PATTERN_17.matcher(number); } if (matcher.matches()) { root = matcher.group(2); suffix = matcher.group(4); if (suffix.length() > 0) { for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, nounToNumeral(stem.getMsd(), "Mc---d----s3-"))); if (Util.isDate(matcher.group(2))) { stemSet.add(new MorAna(root + ".", nounToNoun(stem.getMsd(), "Nc------s3-"))); } if (matcher.group(3).equals("é")) { stemSet.add(new MorAna(root, nounToNumeral(stem.getMsd(), "Mc---d------s"))); } } } if (stemSet.size() == 0) { stemSet.add(new MorAna(matcher.group(2), "Mc-snd----s3")); if (Util.isDate(matcher.group(2))) { stemSet.add(new MorAna(matcher.group(2) + ".", "Nc-sn---s3")); } } return stemSet; } // 50% matcher = PATTERN_18.matcher(number); if (matcher.matches()) { root = matcher.group(1); suffix = matcher.group(2); if (suffix.length() > 0) for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, nounToOther(stem.getMsd(), "Onp---------"))); } if (stemSet.size() == 0) { stemSet.add(new MorAna(root, "Onp-sn")); } return stemSet; } // 12,3-nak matcher = PATTERN_13.matcher(number); if (matcher.matches()) { root = matcher.group(1); suffix = matcher.group(2); if (suffix.length() > 0) for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, nounToNumeral(stem.getMsd(), "Mf---d-------"))); } if (stemSet.size() == 0) { stemSet.add(new MorAna(number, "Mf-snd")); } return stemSet; } // 20-nak matcher = PATTERN_14.matcher(number); if (matcher.matches()) { root = matcher.group(1); suffix = matcher.group(2); if (suffix.length() > 0) for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, nounToNumeral(stem.getMsd(), "Mc---d-------"))); } if (stemSet.size() == 0) { stemSet.add(new MorAna(number, "Mc-snd")); } return stemSet; } // 15. matcher = PATTERN_15.matcher(number); if (matcher.matches()) { root = matcher.group(1); suffix = matcher.group(3); if (suffix.length() > 0) for (MorAna stem : MorPhonGuesser.guess(root, suffix)) { stemSet.add(new MorAna(root, nounToNumeral(stem.getMsd(), "Mo---d-------"))); if (Util.isDate(matcher.group(2))) { stemSet.add(new MorAna(root, stem.getMsd())); } } if (stemSet.size() == 0) { stemSet.add(new MorAna(number, "Mo-snd")); if (Util.isDate(matcher.group(2))) { stemSet.add(new MorAna(number, "Nc-sn")); stemSet.add(new MorAna(number, "Nc-sn---s3")); } } return stemSet; } if (stemSet.size() == 0) { stemSet.add(new MorAna(number, "Oi--sn")); } return stemSet; } public static Set<MorAna> guessRomanNumber(String word) { Set<MorAna> stemSet = null; stemSet = new HashSet<MorAna>(); // MCMLXXXIV if (word.matches("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$")) { stemSet.add(new MorAna(String.valueOf(romanToArabic(word)), "Mc-snr")); } // MCMLXXXIV. else if (word.matches("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\\.$")) { stemSet.add(new MorAna(String.valueOf(romanToArabic(word.substring(0, word.length() - 1))) + ".", "Mo-snr")); } // MCMLXXXIV-MMIX else if (word .matches("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})-M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$")) { stemSet.add(new MorAna(String.valueOf(romanToArabic(word.substring(0, word.indexOf("-")))) + "-" + String.valueOf(romanToArabic(word.substring(word.indexOf("-") + 1, word.length()))), "Mc-snr")); } // MCMLXXXIV-MMIX. else if (word .matches("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})-M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\\.$")) { stemSet.add(new MorAna(String.valueOf(romanToArabic(word.substring(0, word.indexOf("-")))) + "-" + String.valueOf(romanToArabic(word.substring(word.indexOf("-") + 1, word.length()))) + ".", "Mo-snr")); } return stemSet; } public static void main(String[] args) { System.out.println(NumberGuesser.guess("386-os")); System.out.println(NumberGuesser.guess("16-ai")); System.out.println(NumberGuesser.guess("5.")); System.out.println(NumberGuesser.guess("20%")); } }