/** * Copyright (C) 2012 cogroo <cogroo@cogroo.org> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.cogroo.interpreters; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import opennlp.tools.util.Cache; import org.apache.log4j.Logger; import org.cogroo.entities.impl.ChunkTag; import org.cogroo.entities.impl.MorphologicalTag; import org.cogroo.entities.impl.SyntacticTag; import org.cogroo.tools.checker.rules.model.TagMask.Case; import org.cogroo.tools.checker.rules.model.TagMask.ChunkFunction; import org.cogroo.tools.checker.rules.model.TagMask.Class; import org.cogroo.tools.checker.rules.model.TagMask.Gender; import org.cogroo.tools.checker.rules.model.TagMask.Mood; import org.cogroo.tools.checker.rules.model.TagMask.Number; import org.cogroo.tools.checker.rules.model.TagMask.Person; import org.cogroo.tools.checker.rules.model.TagMask.Punctuation; import org.cogroo.tools.checker.rules.model.TagMask.SyntacticFunction; import org.cogroo.tools.checker.rules.model.TagMask.Tense; public class JspellTagInterpreter implements TagInterpreter { private static final Map<Enum<?>, String> ENUM_MTAG_PARTS; private static final Map<String, List<Enum<?>>> MTAG_PARTS_ENUM; protected static final Logger LOGGER = Logger .getLogger(JspellTagInterpreter.class); private static final String SEP = "\\|"; private static final List<String> MOOD_INDICATIVE; static { /* ******************************** * Morphologic****************************** */ Map<Enum<?>, String> menumElements = new HashMap<Enum<?>, String>(); /* Class */ menumElements.put(Class.NOUN, "CAT:nc"); menumElements.put(Class.NOUN_ADJECTIVE, "CAT:a_nc"); menumElements.put(Class.PROPER_NOUN, "CAT:np"); menumElements.put(Class.ARTICLE, "CAT:art"); menumElements.put(Class.PREPOSITION, "CAT:prep"); menumElements.put(Class.ADJECTIVE, "CAT:adj"); menumElements.put(Class.ADVERB, "CAT:adv"); menumElements.put(Class.SUBORDINATING_CONJUNCTION, "CAT:conj-s"); menumElements.put(Class.COORDINATING_CONJUNCTION, "CAT:conj-c"); menumElements.put(Class.INTERJECTION, "CAT:in"); menumElements.put(Class.PREFIX, "CAT:pref"); menumElements.put(Class.PERSONAL_PRONOUN, "CAT:ppes"); /* Gender */ menumElements.put(Gender.MALE, "G:m"); menumElements.put(Gender.FEMALE, "G:f"); menumElements.put(Gender.NEUTRAL, "G:n"); /* Number */ menumElements.put(Number.SINGULAR, "N:s"); menumElements.put(Number.PLURAL, "N:p"); menumElements.put(Number.NEUTRAL, "N:n"); /* Case */ menumElements.put(Case.ACCUSATIVE, "C:a"); menumElements.put(Case.DATIVE, "C:d"); menumElements.put(Case.NOMINATIVE, "C:n"); menumElements.put(Case.PREPOSITIVE, "C:g"); // menumElements.put(Case.ACCUSATIVE_DATIVE, "ACC/DAT"); // menumElements.put(Case.NOMINATIVE_PREPOSITIVE, "NOM/PIV"); /* Person */ menumElements.put(Person.FIRST, "P:1"); menumElements.put(Person.SECOND, "P:2"); menumElements.put(Person.THIRD, "P:3"); menumElements.put(Person.FIRST_THIRD, "P:1_3"); /* Tense */ menumElements.put(Tense.PRESENT, "T:p"); menumElements.put(Tense.PRETERITO_IMPERFEITO, "T:pi"); menumElements.put(Tense.PRETERITO_PERFEITO, "T:pp"); menumElements.put(Tense.PRETERITO_MAIS_QUE_PERFEITO, "T:pmp"); menumElements.put(Tense.FUTURE, "T:f"); menumElements.put(Tense.CONDITIONAL, "T:c"); // menumElements.put(Tense.PRETERITO_PERFEITO_MAIS_QUE_PERFEITO, "T:pmp"); /* Mood */ // menumElements.put(Mood.INDICATIVE, "IND"); // menumElements.put(Mood.SUBJUNCTIVE, "SUBJ"); menumElements.put(Mood.IMPERATIVE, "T:i"); /* Punctuation */ // menumElements.put(Punctuation.ABS, "ABS"); // menumElements.put(Punctuation.NSEP, "NSEP"); // menumElements.put(Punctuation.BIN, "BIN"); // menumElements.put(Punctuation.REL, "REL"); ENUM_MTAG_PARTS = Collections.unmodifiableMap(menumElements); Set<Enum<?>> k2 = ENUM_MTAG_PARTS.keySet(); Map<String, List<Enum<?>>> stringMElements = new HashMap<String, List<Enum<?>>>( 60); for (Enum<?> tagE : k2) { ArrayList<Enum<?>> values = new ArrayList<Enum<?>>(); values.add(tagE); stringMElements.put(ENUM_MTAG_PARTS.get(tagE), Collections.unmodifiableList(values)); } // gender ArrayList<Enum<?>> _Gn = new ArrayList<Enum<?>>(); _Gn.add(Gender.NEUTRAL); stringMElements.put("G:2", Collections.unmodifiableList(_Gn)); stringMElements.put("G:_", Collections.unmodifiableList(_Gn)); // number ArrayList<Enum<?>> _Nn = new ArrayList<Enum<?>>(); _Nn.add(Number.NEUTRAL); stringMElements.put("N:_", Collections.unmodifiableList(_Nn)); ArrayList<Enum<?>> _Np = new ArrayList<Enum<?>>(); _Np.add(Number.PLURAL); stringMElements.put("DN:p", Collections.unmodifiableList(_Np)); ArrayList<Enum<?>> _Ns = new ArrayList<Enum<?>>(); _Ns.add(Number.SINGULAR); stringMElements.put("DN:s", Collections.unmodifiableList(_Ns)); // person ArrayList<Enum<?>> _P1 = new ArrayList<Enum<?>>(); _P1.add(Person.FIRST); stringMElements.put("AP:1", Collections.unmodifiableList(_P1)); ArrayList<Enum<?>> _P2 = new ArrayList<Enum<?>>(); _P2.add(Person.SECOND); stringMElements.put("AP:2", Collections.unmodifiableList(_P2)); ArrayList<Enum<?>> _P3 = new ArrayList<Enum<?>>(); _P3.add(Person.THIRD); stringMElements.put("AP:3", Collections.unmodifiableList(_P3)); stringMElements.put("DP:3", Collections.unmodifiableList(_P3)); // Tense ArrayList<Enum<?>> _Tfc = new ArrayList<Enum<?>>(); _Tfc.add(Tense.FUTURE); _Tfc.add(Mood.SUBJUNCTIVE); stringMElements.put("T:fc", Collections.unmodifiableList(_Tfc)); ArrayList<Enum<?>> _Tpc = new ArrayList<Enum<?>>(); _Tpc.add(Tense.PRESENT); _Tpc.add(Mood.SUBJUNCTIVE); stringMElements.put("T:pc", Collections.unmodifiableList(_Tpc)); ArrayList<Enum<?>> _Tpic = new ArrayList<Enum<?>>(); _Tpic.add(Tense.PRETERITO_IMPERFEITO); _Tpic.add(Mood.SUBJUNCTIVE); stringMElements.put("T:pic", Collections.unmodifiableList(_Tpic)); // indicative we to using software String[] ind = { "T:f", "T:p", "T:pi", "T:pmp", "T:pp" }; MOOD_INDICATIVE = Collections.unmodifiableList(Arrays.asList(ind)); MTAG_PARTS_ENUM = Collections.unmodifiableMap(stringMElements); } public JspellTagInterpreter() { } // private final Map<String, MorphologicalTag> cache = new HashMap<String, // MorphologicalTag>(); private final Cache cache = new Cache(200); public MorphologicalTag parseMorphologicalTag(String tagString) { if (tagString == null) { return null; } synchronized (cache) { if (cache.containsKey(tagString)) { return ((MorphologicalTag) cache.get(tagString)).clone(); } } MorphologicalTag m = new MorphologicalTag(); String[] tags = tagString.split(SEP); for (String tag : tags) { if (MTAG_PARTS_ENUM.containsKey(tag)) { List<Enum<?>> tagE = MTAG_PARTS_ENUM.get(tag); for (Enum<?> t : tagE) { if (t instanceof Class) { m.setClazz((Class) t); } else if (t instanceof Gender) { m.setGender((Gender) t); } else if (t instanceof Number) { m.setNumber((Number) t); } else if (t instanceof Case) { m.setCase((Case) t); } else if (t instanceof Person) { m.setPerson((Person) t); } else if (t instanceof Tense) { if (MOOD_INDICATIVE.contains(tag)) { m.setMood(Mood.INDICATIVE); } m.setTense((Tense) t); } else if (t instanceof Mood) { m.setMood((Mood) t); } else if (t instanceof Punctuation) { m.setPunctuation((Punctuation) t); } } } else { if (tag.startsWith("CAT:")) { if (tag.startsWith("CAT:punct")) { m.setClazz(Class.PUNCTUATION_MARK); } else if ("CAT:v".equals(tag) && m.getClazzE() == null) { m.setClazz(Class.FINITIVE_VERB); } else if ("CAT:ppos".equals(tag) || "CAT:pind".equals(tag) || "CAT:pdem".equals(tag) || "CAT:pint".equals(tag) || "CAT:prel".equals(tag)) { m.setClazz(Class.PRONOUN); } else if ("CAT:card".equals(tag) || "CAT:nord".equals(tag)) { m.setClazz(Class.NUMERAL); } else if (tag.startsWith("CAT:adj")) { m.setClazz(Class.ADJECTIVE); } else if ("CAT:cp".equals(tag) || "CAT:pass".equals(tag)) { return null; // ignore this tag } } else if (tag.startsWith("T:")) { if (MOOD_INDICATIVE.contains(tag)) { m.setMood(Mood.INDICATIVE); } if ((Class.FINITIVE_VERB.equals(m.getClazzE()) || m.getClazzE() == null)) { if ("T:inf".equals(tag) || "T:ip".equals(tag)) { m.setClazz(Class.INFINITIVE_VERB); } else if ("T:ppa".equals(tag)) { m.setClazz(Class.PARTICIPLE_VERB); } else if ("T:g".equals(tag)) { m.setClazz(Class.GERUND_VERB); } } } else if (tag.length() == 1 || "--".equals(tag) || "...".equals(tag)) { m.setClazz(Class.PUNCTUATION_MARK); } else if ("n:".equals(tag)) { m.setClazz(Class.NOUN); } else if ("intj".equals(tag)) { m.setClazz(Class.INTERJECTION); } else if ("pp".equals(tag)) { m.setClazz(Class.PREPOSITION); } else if ("np".equals(tag)) { m.setClazz(Class.NOUN); } else { System.out.println(tag); } } } if (m.toString() == null || m.toString().length() == 0) { LOGGER.error("Invalid MorphologicalTag: " + tagString); } // post process if (m.getGenderE() == null && m.getNumberE() != null) { if (Class.NOUN.equals(m.getClazzE()) || Class.NOUN_ADJECTIVE.equals(m.getClazzE()) || Class.NUMERAL.equals(m.getClazzE())) { m.setGender(Gender.NEUTRAL); } else if (Class.PROPER_NOUN.equals(m.getClazzE())) { m.setGender(Gender.MALE); } } if (m.getNumberE() == null && m.getGenderE() != null) { if (Class.NOUN.equals(m.getClazzE()) || Class.NOUN_ADJECTIVE.equals(m.getClazzE()) || Class.NUMERAL.equals(m.getClazzE())) { m.setNumber(Number.NEUTRAL); } else if (Class.PROPER_NOUN.equals(m.getClazzE())) { m.setNumber(Number.SINGULAR); } } if (m.getTense() != null && Class.NOUN.equals(m.getClazzE())) { m.setClazz(Class.INFINITIVE_VERB); } if(m == null || m.getClazzE() == null) { LOGGER.warn("something wrong with tag: " + tagString); } removeInvalidFeatures(m); synchronized (cache) { if (!cache.containsKey(tagString)) { cache.put(tagString, m.clone()); } } return m; } private void removeInvalidFeatures(MorphologicalTag m) { if(m != null && m.getClazzE() != null) { switch (m.getClazzE()) { case ADVERB: m.setCase(null); m.setGender(null); m.setMood(null); m.setNumber(null); // bastantes, quantos m.setPerson(null); m.setPunctuation(null); m.setTense(null); break; case NOUN: case PRONOUN: m.setPerson(null); m.setMood(null); m.setCase(null); break; case FINITIVE_VERB: m.setGender(null); break; case INFINITIVE_VERB: m.setCase(null); m.setGender(null); m.setMood(null); m.setPunctuation(null); m.setTense(null); default: break; } } } public ChunkTag parseChunkTag(String tagString) { return null; } public SyntacticTag parseSyntacticTag(String tagString) { return null; } public String serialize(MorphologicalTag tag) { StringBuilder res = new StringBuilder(); if (tag.getClazzE() != null) { res.append(serializer(tag.getClazzE()) + SEP); } if (tag.getGenderE() != null) { res.append(serializer(tag.getGenderE()) + SEP); } if (tag.getTense() != null) { res.append(serializer(tag.getTense()) + SEP); } if (tag.getMood() != null && tag.getTense() == null) { res.append(serializer(tag.getMood()) + SEP); } if (tag.getPersonE() != null && tag.getNumberE() != null) { String s = serializer(tag.getPersonE()); if (!(s.contains("S") || s.contains("P"))) { res.append(s + serializer(tag.getNumberE()) + SEP); } else { res.append(s + SEP); } } else if (tag.getNumberE() != null) { res.append(serializer(tag.getNumberE()) + SEP); } if (tag.getCase() != null) { res.append(serializer(tag.getCase()) + SEP); } if (tag.getMood() != null && tag.getTense() != null) { res.append(serializer(tag.getMood()) + SEP); } // if(tag.getFinitenessE() != null) { // res.append(serializer(tag.getFinitenessE()) + SEP); // } if (tag.getPunctuation() != null) { res.append(serializer(tag.getPunctuation()) + SEP); } if (res.length() == 0) { LOGGER.error("Unable to serialize MorphologicalTag: " + tag); } if (res.length() > 1) { return res.substring(0, res.length() - 1); } else { return null; } } public String serialize(ChunkTag tag) { String value = serializer(tag.getChunkFunction()); return value; } public String serialize(SyntacticTag tag) { String value = serializer(tag.getSyntacticFunction()); return value; } public String serialize(SyntacticFunction tag) { return serializer(tag); } public String serialize(ChunkFunction tag) { return serializer(tag); } public String serialize(Class tag) { return serializer(tag); } public String serialize(Gender tag) { return serializer(tag); } public String serialize(Number tag) { return serializer(tag); } public String serialize(Case tag) { return serializer(tag); } public String serialize(Person tag) { return serializer(tag); } public String serialize(Tense tag) { return serializer(tag); } public String serialize(Mood tag) { return serializer(tag); } public String serialize(Punctuation tag) { return serializer(tag); } private String serializer(Enum<?> value) { if (ENUM_MTAG_PARTS.containsKey(value)) { return ENUM_MTAG_PARTS.get(value); } return ""; } }