package hu.u_szeged.nlp.pos.converter;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Maps full MSD codes to shorter, reduced codes.
 *
 * <p>The first character of the code selects the reduction rule set
 * ({@code N}, {@code V}, {@code A}, {@code P}, {@code R}, {@code M},
 * {@code O}); codes starting with {@code C} are kept unchanged and every
 * other code is reduced to its first character. Computed reductions are
 * stored in a map-backed cache that is consulted before any rule is applied.
 */
public class MSDReducer {

  // The patterns are applied with Matcher.find(), i.e. they are NOT anchored
  // to the end of the code: a longer code that merely contains the pattern
  // still matches.
  private static final Pattern NOUN_PATTERN_1 = Pattern.compile("N.-..---s3");
  private static final Pattern NOUN_PATTERN_2 = Pattern.compile("N.-..---..s");

  private static final Pattern ADJECTIVE_PATTERN_1 = Pattern.compile("A..-..-.--s3");
  private static final Pattern ADJECTIVE_PATTERN_2 = Pattern.compile("A..-..-.--..s");

  private static final Pattern NUMERAL_PATTERN_1 = Pattern.compile("M.-...-.--s3");
  private static final Pattern NUMERAL_PATTERN_2 = Pattern.compile("M.-...-.--..s");

  private static final Pattern OPEN_PATTERN_1 = Pattern.compile("O..-..---s3");
  private static final Pattern OPEN_PATTERN_2 = Pattern.compile("O..-..---..s");

  private static final Pattern VERB_PATTERN_1 = Pattern.compile("V[^a]cp[12]p---y");
  private static final Pattern VERB_PATTERN_2 = Pattern.compile("V[^a]ip1s---y");
  private static final Pattern VERB_PATTERN_3 = Pattern.compile("V[^a]cp3p---y");
  private static final Pattern VERB_PATTERN_4 = Pattern.compile("V[^a]is1[sp]---y");

  /** Already reduced codes, keyed by the full MSD code. */
  private Map<String, String> cache = null;

  /** Creates a reducer with an empty {@link HashMap} cache. */
  public MSDReducer() {
    this.setCache(new HashMap<String, String>());
  }

  /**
   * Appends the character at {@code index} to {@code tag} when it is one of
   * {@code d}, {@code g} or {@code p} (dative/genitive, superessive/essive
   * according to the original notes). Codes too short to have that position
   * are left alone.
   *
   * @param tag reduced code being built
   * @param msd full MSD code
   * @param index position of the case marker inside {@code msd}
   */
  private static void appendCase(StringBuilder tag, String msd, int index) {
    if (msd.length() <= index) {
      return;
    }
    char marker = msd.charAt(index);
    if (marker == 'd' || marker == 'g' || marker == 'p') {
      tag.append(marker);
    }
  }

  /**
   * Appends {@code s} when {@code first} is found in the code and {@code z}
   * when {@code second} is found; both may apply to the same code.
   * NOTE(review): the {@code s3} / {@code ..s} positions presumably encode
   * possession attributes of the MSD scheme - confirm against the tagset.
   *
   * @param tag reduced code being built
   * @param msd full MSD code
   * @param first pattern whose match adds {@code s}
   * @param second pattern whose match adds {@code z}
   */
  private static void appendPatternSuffixes(StringBuilder tag, String msd,
      Pattern first, Pattern second) {
    if (first.matcher(msd).find()) {
      tag.append('s');
    }
    if (second.matcher(msd).find()) {
      tag.append('z');
    }
  }

  /**
   * Reduces a noun code: {@code N}, the case marker at position 4 (if
   * {@code d}/{@code g}/{@code p}), then the {@code s}/{@code z} suffixes for
   * {@code N.-..---s3} and {@code N.-..---..s}.
   *
   * @param msd full MSD code starting with {@code N}
   * @return the reduced code
   */
  private String reduceN(String msd) {
    StringBuilder tag = new StringBuilder("N");
    appendCase(tag, msd, 4);
    appendPatternSuffixes(tag, msd, NOUN_PATTERN_1, NOUN_PATTERN_2);
    return tag.toString();
  }

  /**
   * Reduces a code starting with {@code O}: {@code O}, the case marker at
   * position 5 (if {@code d}/{@code g}/{@code p}), then the
   * {@code s}/{@code z} suffixes for {@code O..-..---s3} and
   * {@code O..-..---..s}.
   *
   * @param msd full MSD code starting with {@code O}
   * @return the reduced code
   */
  private String reduceO(String msd) {
    StringBuilder tag = new StringBuilder("O");
    appendCase(tag, msd, 5);
    appendPatternSuffixes(tag, msd, OPEN_PATTERN_1, OPEN_PATTERN_2);
    return tag.toString();
  }

  /**
   * Reduces a verb code. The rules are tried in order and the first one that
   * applies wins:
   * <ol>
   * <li>codes starting with {@code Va} become {@code Va};</li>
   * <li>{@code V[^a]cp[12]p---y} (e.g. "olvasnánk") becomes {@code Vcp};</li>
   * <li>{@code V[^a]ip1s---y} (e.g. "eszek", "eszem") becomes {@code Vip};</li>
   * <li>{@code V[^a]cp3p---y} (e.g. "festetnék") becomes {@code V3p};</li>
   * <li>{@code V[^a]is1[sp]---y} (e.g. "festettem") becomes {@code Vy};</li>
   * <li>{@code m} at position 2 (imperative mood) becomes {@code Vm};</li>
   * <li>{@code p} at position 3 (present tense, whose word form can coincide
   * with the past tense) becomes {@code Vp};</li>
   * <li>anything else becomes {@code V}.</li>
   * </ol>
   *
   * @param msd full MSD code starting with {@code V}
   * @return the reduced code
   */
  private String reduceV(String msd) {
    if (msd.startsWith("Va")) {
      return "Va";
    }
    if (VERB_PATTERN_1.matcher(msd).find()) {
      return "Vcp";
    }
    if (VERB_PATTERN_2.matcher(msd).find()) {
      return "Vip";
    }
    if (VERB_PATTERN_3.matcher(msd).find()) {
      return "V3p";
    }
    if (VERB_PATTERN_4.matcher(msd).find()) {
      return "Vy";
    }
    if (msd.length() > 2 && msd.charAt(2) == 'm') {
      return "Vm";
    }
    if (msd.length() > 3 && msd.charAt(3) == 'p') {
      return "Vp";
    }
    return "V";
  }

  /**
   * Reduces an adjective code: {@code A}, the case marker at position 5 (if
   * {@code d}/{@code g}/{@code p}), then the {@code s}/{@code z} suffixes for
   * {@code A..-..-.--s3} and {@code A..-..-.--..s}.
   *
   * @param msd full MSD code starting with {@code A}
   * @return the reduced code
   */
  private String reduceA(String msd) {
    StringBuilder tag = new StringBuilder("A");
    appendCase(tag, msd, 5);
    appendPatternSuffixes(tag, msd, ADJECTIVE_PATTERN_1, ADJECTIVE_PATTERN_2);
    return tag.toString();
  }

  /**
   * Reduces a pronoun code: {@code P}, then the subtype at position 1
   * ({@code q} and {@code r} are kept, {@code p} is rewritten to {@code e},
   * every other subtype is dropped), then the case marker at position 5 (if
   * {@code d}/{@code g}/{@code p}).
   *
   * @param msd full MSD code starting with {@code P}
   * @return the reduced code
   */
  private String reduceP(String msd) {
    StringBuilder tag = new StringBuilder("P");
    if (msd.length() > 1) {
      char subtype = msd.charAt(1);
      if (subtype == 'p') {
        tag.append('e');
      } else if (subtype == 'q' || subtype == 'r') {
        tag.append(subtype);
      }
    }
    appendCase(tag, msd, 5);
    return tag.toString();
  }

  /**
   * Reduces an adverb code: {@code R} followed by the subtype at position 1
   * when it is one of {@code q}, {@code r}, {@code p} or {@code l}.
   *
   * @param msd full MSD code starting with {@code R}
   * @return the reduced code
   */
  private String reduceR(String msd) {
    StringBuilder tag = new StringBuilder("R");
    if (msd.length() > 1) {
      char subtype = msd.charAt(1);
      if (subtype == 'q' || subtype == 'r' || subtype == 'p' || subtype == 'l') {
        tag.append(subtype);
      }
    }
    return tag.toString();
  }

  /**
   * Reduces a numeral code: {@code M}, then {@code f} when position 1 is
   * {@code f}, then the case marker at position 4 (if
   * {@code d}/{@code g}/{@code p}), then the {@code s}/{@code z} suffixes for
   * {@code M.-...-.--s3} and {@code M.-...-.--..s}.
   *
   * @param msd full MSD code starting with {@code M}
   * @return the reduced code
   */
  private String reduceM(String msd) {
    StringBuilder tag = new StringBuilder("M");
    if (msd.length() > 1 && msd.charAt(1) == 'f') {
      tag.append('f');
    }
    appendCase(tag, msd, 4);
    appendPatternSuffixes(tag, msd, NUMERAL_PATTERN_1, NUMERAL_PATTERN_2);
    return tag.toString();
  }

  /**
   * Returns the reduced form of an MSD code.
   *
   * <p>A value already present in the cache is returned as is. Codes of at
   * most one character (including the empty string) are returned unchanged
   * and are not cached. All other results are stored in the cache before
   * being returned.
   *
   * @param msd full MSD code; must not be {@code null}
   * @return the reduced code
   * @throws NullPointerException if {@code msd} is {@code null}
   */
  public String reduce(String msd) {
    if (this.getCache().containsKey(msd)) {
      return this.getCache().get(msd);
    }

    // Nothing to reduce; this also keeps charAt(0) below from failing on "".
    if (msd.length() <= 1) {
      return msd;
    }

    final String reduced;
    switch (msd.charAt(0)) {
      case 'N':
        reduced = reduceN(msd);
        break;
      case 'V':
        reduced = reduceV(msd);
        break;
      case 'A':
        reduced = reduceA(msd);
        break;
      case 'P':
        reduced = reduceP(msd);
        break;
      case 'R':
        reduced = reduceR(msd);
        break;
      case 'M':
        reduced = reduceM(msd);
        break;
      case 'O':
        reduced = reduceO(msd);
        break;
      case 'C':
        // Codes starting with C are kept in full.
        reduced = msd;
        break;
      default:
        // T, S, I, X, Y, Z and anything else: keep only the first character.
        reduced = String.valueOf(msd.charAt(0));
        break;
    }

    this.getCache().put(msd, reduced);
    return reduced;
  }

  /**
   * Replaces the cache used by {@link #reduce(String)}.
   *
   * @param cache map from full MSD code to reduced code
   */
  public void setCache(Map<String, String> cache) {
    this.cache = cache;
  }

  /**
   * Returns the cache used by {@link #reduce(String)}; the map itself is
   * returned, not a copy.
   *
   * @return map from full MSD code to reduced code
   */
  public Map<String, String> getCache() {
    return cache;
  }

  /** Prints every cache entry to standard output as {@code key<TAB>value}. */
  public void printCache() {
    for (Map.Entry<String, String> entry : this.getCache().entrySet()) {
      System.out.println(entry.getKey() + "\t" + entry.getValue());
    }
  }

  /**
   * Small demo: prints the reduced form of a few sample codes.
   *
   * @param args ignored
   * @throws IOException never thrown by the current body; kept for
   *     signature compatibility
   */
  public static void main(String args[]) throws IOException {
    MSDReducer reducer = new MSDReducer();

    System.out.println(reducer.reduce("Pp3-sp"));
    System.out.println(reducer.reduce("Ccsw"));
    System.out.println(reducer.reduce("Pq3-ses--------3"));
    System.out.println(reducer.reduce("Px3-sn----------s"));
  }
}