/**
* Copyright (C) 2012 cogroo <cogroo@cogroo.org>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.cogroo.interpreters;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import opennlp.tools.util.Cache;
import org.apache.log4j.Logger;
import org.cogroo.entities.impl.ChunkTag;
import org.cogroo.entities.impl.MorphologicalTag;
import org.cogroo.entities.impl.SyntacticTag;
import org.cogroo.tools.checker.rules.model.TagMask.Case;
import org.cogroo.tools.checker.rules.model.TagMask.ChunkFunction;
import org.cogroo.tools.checker.rules.model.TagMask.Class;
import org.cogroo.tools.checker.rules.model.TagMask.Gender;
import org.cogroo.tools.checker.rules.model.TagMask.Mood;
import org.cogroo.tools.checker.rules.model.TagMask.Number;
import org.cogroo.tools.checker.rules.model.TagMask.Person;
import org.cogroo.tools.checker.rules.model.TagMask.Punctuation;
import org.cogroo.tools.checker.rules.model.TagMask.SyntacticFunction;
import org.cogroo.tools.checker.rules.model.TagMask.Tense;
public class FlorestaTagInterpreter implements TagInterpreter {
/** Forward map: morphological enum constant (Class, Gender, Number, ...) to its tag string. */
private static final Map<Enum<?>, String> ENUM_MTAG_PARTS;
/**
 * Reverse map: tag string to the enum constant(s) it stands for. One string
 * may expand to several constants (for example person plus number).
 */
private static final Map<String, List<Enum<?>>> MTAG_PARTS_ENUM;
/** Forward map: chunk function to its tag string. */
private static final Map<ChunkFunction, String> ENUM_CTAG_PARTS;
/** Reverse map: chunk tag string to chunk function(s), including aliases. */
private static final Map<String, List<ChunkFunction>> CTAG_PARTS_ENUM;
/** Forward map: syntactic function to its tag string. */
private static final Map<SyntacticFunction, String> ENUM_STAG_PARTS;
/** Reverse map: syntactic tag string to syntactic function(s), including aliases. */
private static final Map<String, List<SyntacticFunction>> STAG_PARTS_ENUM;
/** Logger shared by this class; protected, so subclasses can use it as well. */
protected static final Logger LOGGER = Logger
.getLogger(FlorestaTagInterpreter.class);
/** Separator written between the parts of a serialized morphological tag. */
private static final String SEP = "=";
// Builds the six lookup tables. Each section first fills a forward map
// (enum constant -> tag string), derives the reverse map from it with one
// single-element list per string, and then adds extra strings that only
// exist in the reverse direction. Statement order matters: later puts
// overwrite entries created by the derivation loops.
static {
/* ********************************
* Chunk
* ******************************* */
Map<ChunkFunction, String> cenumElements = new HashMap<ChunkFunction, String>(
7);
cenumElements.put(ChunkFunction.OTHER, "O");
cenumElements.put(ChunkFunction.BOUNDARY_NOUN_PHRASE, "B-NP");
cenumElements.put(ChunkFunction.BOUNDARY_NOUN_PHRASE_MAIN, "B-NP*");
cenumElements.put(ChunkFunction.INTERMEDIARY_NOUN_PHRASE, "I-NP");
cenumElements.put(ChunkFunction.INTERMEDIARY_NOUN_PHRASE_MAIN, "I-NP*");
cenumElements.put(ChunkFunction.BOUNDARY_VERB_PHRASE_MAIN, "B-VP*");
cenumElements.put(ChunkFunction.INTERMEDIARY_VERB_PHRASE, "I-VP");
ENUM_CTAG_PARTS = Collections.unmodifiableMap(cenumElements);
Set<ChunkFunction> k = ENUM_CTAG_PARTS.keySet();
// Reverse map: one single-element list per forward entry.
Map<String, List<ChunkFunction>> stringCElements = new HashMap<String, List<ChunkFunction>>(
7);
for (ChunkFunction tagE : k) {
ArrayList<ChunkFunction> values = new ArrayList<ChunkFunction>();
values.add(tagE);
stringCElements.put(ENUM_CTAG_PARTS.get(tagE),
Collections.unmodifiableList(values));
}
// Parse-only alias: "B-VP" (without the '*') resolves to the same constant as "B-VP*".
stringCElements.put("B-VP", Collections.singletonList(ChunkFunction.BOUNDARY_VERB_PHRASE_MAIN));
CTAG_PARTS_ENUM = Collections.unmodifiableMap(stringCElements);
/* ********************************
* Syntactic
* ******************************* */
// The capacity hint (3) is smaller than the 8 entries added; the map simply grows.
Map<SyntacticFunction, String> senumElements = new HashMap<SyntacticFunction, String>(
3);
senumElements.put(SyntacticFunction.NONE, "-");
senumElements.put(SyntacticFunction.SUBJECT, "SUBJ");
senumElements.put(SyntacticFunction.VERB, "P");
senumElements.put(SyntacticFunction.INDIRECT_OBJECT, "PIV");
senumElements.put(SyntacticFunction.DIRECT_OBJECT, "ACC");
senumElements.put(SyntacticFunction.SUBJECT_PREDICATIVE, "SC");
senumElements.put(SyntacticFunction.IDENTIFYING_APPOSITION, "APP");
senumElements.put(SyntacticFunction.ADVERBIAL_ADJUNCT, "ADVL");
ENUM_STAG_PARTS = Collections.unmodifiableMap(senumElements);
Set<SyntacticFunction> k1 = ENUM_STAG_PARTS.keySet();
// Reverse map: one single-element list per forward entry.
Map<String, List<SyntacticFunction>> stringSElements = new HashMap<String, List<SyntacticFunction>>(
3);
for (SyntacticFunction tagE : k1) {
ArrayList<SyntacticFunction> values = new ArrayList<SyntacticFunction>();
values.add(tagE);
stringSElements.put(ENUM_STAG_PARTS.get(tagE),
Collections.unmodifiableList(values));
}
// Parse-only aliases: these four strings all resolve to VERB, like "P".
ArrayList<SyntacticFunction> _OtherVerbs = new ArrayList<SyntacticFunction>();
_OtherVerbs.add(SyntacticFunction.VERB);
stringSElements.put("AUX", Collections.unmodifiableList(_OtherVerbs));
stringSElements.put("PAUX", Collections.unmodifiableList(_OtherVerbs));
stringSElements.put("MV", Collections.unmodifiableList(_OtherVerbs));
stringSElements.put("PMV", Collections.unmodifiableList(_OtherVerbs));
STAG_PARTS_ENUM = Collections.unmodifiableMap(stringSElements);
/* ********************************
* Morphologic
* ******************************* */
// A single forward map holds constants of all morphological enum types.
Map<Enum<?>, String> menumElements = new HashMap<Enum<?>, String>();
/* Class */
menumElements.put(Class.NOUN, "n");
menumElements.put(Class.PROPER_NOUN, "prop");
menumElements.put(Class.ARTICLE, "art");// collision
menumElements.put(Class.PREPOSITION, "prp");
menumElements.put(Class.ADJECTIVE, "adj");
menumElements.put(Class.ADVERB, "adv");
menumElements.put(Class.NUMERAL, "num");
menumElements.put(Class.SUBORDINATING_CONJUNCTION, "conj-s");
menumElements.put(Class.COORDINATING_CONJUNCTION, "conj-c");
menumElements.put(Class.INTERJECTION, "intj");
menumElements.put(Class.PUNCTUATION_MARK, "pnt"); // ?
// added
menumElements.put(Class.FINITIVE_VERB, "v-fin");
menumElements.put(Class.INFINITIVE_VERB, "v-inf");
menumElements.put(Class.PARTICIPLE_VERB, "v-pcp");
menumElements.put(Class.GERUND_VERB, "v-ger");
menumElements.put(Class.PREFIX, "ec");
menumElements.put(Class.NOUN_ADJECTIVE, "n-adj");
menumElements.put(Class.PRONOUN, "pron"); // many
menumElements.put(Class.PERSONAL_PRONOUN, "pron-pers");
// Removed mappings, kept for reference; these Class constants have no
// forward entry here:
// menumElements.put(Class.UNIT, "uni");//?
// menumElements.put(Class.HYPHEN_SEPARATED_PREFIX, "ec");
// menumElements.put(Class.VERB, "v-"); //? v-*
// menumElements.put(Class.DETERMINER_PRONOUN, "pron-det");//collision
// menumElements.put(Class.SPECIFIER, "pron-indp");//?
// menumElements.put(Class.DETERMINER, "det");//collision
/* Gender */
menumElements.put(Gender.MALE, "M");
menumElements.put(Gender.FEMALE, "F");
menumElements.put(Gender.NEUTRAL, "M/F");
/* Number */
menumElements.put(Number.SINGULAR, "S");
menumElements.put(Number.PLURAL, "P");
menumElements.put(Number.NEUTRAL, "S/P");
/* Case */
menumElements.put(Case.ACCUSATIVE, "ACC");
menumElements.put(Case.DATIVE, "DAT");
menumElements.put(Case.NOMINATIVE, "NOM");
menumElements.put(Case.PREPOSITIVE, "PIV");
menumElements.put(Case.ACCUSATIVE_DATIVE, "ACC/DAT");
menumElements.put(Case.NOMINATIVE_PREPOSITIVE, "NOM/PIV");
/* Person */
// Combined person+number strings such as "1S" or "3P" have no forward
// entry; they are registered only in the reverse map further down.
menumElements.put(Person.FIRST, "1");
menumElements.put(Person.SECOND, "2");
menumElements.put(Person.THIRD, "3");
menumElements.put(Person.FIRST_THIRD, "1/3S");
menumElements.put(Person.NONE_FIRST_THIRD, "0/1/3S");
/* Tense */
menumElements.put(Tense.PRESENT, "PR");
menumElements.put(Tense.PRETERITO_IMPERFEITO, "IMPF");
menumElements.put(Tense.PRETERITO_PERFEITO, "PS");
menumElements.put(Tense.PRETERITO_MAIS_QUE_PERFEITO, "MQP");
menumElements.put(Tense.FUTURE, "FUT");
menumElements.put(Tense.CONDITIONAL, "COND");
menumElements.put(Tense.PRETERITO_PERFEITO_MAIS_QUE_PERFEITO, "PS/MQP");
/* Mood */
menumElements.put(Mood.INDICATIVE, "IND");
menumElements.put(Mood.SUBJUNCTIVE, "SUBJ");
menumElements.put(Mood.IMPERATIVE, "IMP");
/* Punctuation */
menumElements.put(Punctuation.ABS, "ABS");
menumElements.put(Punctuation.NSEP, "NSEP");
menumElements.put(Punctuation.BIN, "BIN");
menumElements.put(Punctuation.REL, "REL");
ENUM_MTAG_PARTS = Collections.unmodifiableMap(menumElements);
Set<Enum<?>> k2 = ENUM_MTAG_PARTS.keySet();
// Reverse map: one single-element list per forward entry.
Map<String, List<Enum<?>>> stringMElements = new HashMap<String, List<Enum<?>>>(
60);
for (Enum<?> tagE : k2) {
ArrayList<Enum<?>> values = new ArrayList<Enum<?>>();
values.add(tagE);
stringMElements.put(ENUM_MTAG_PARTS.get(tagE),
Collections.unmodifiableList(values));
}
// Combined person+number strings: each expands to a Person and a Number.
// The puts for "1/3S" and "0/1/3S" replace the single-element lists the
// loop above created for Person.FIRST_THIRD and Person.NONE_FIRST_THIRD.
ArrayList<Enum<?>> _1S = new ArrayList<Enum<?>>();
_1S.add(Person.FIRST);
_1S.add(Number.SINGULAR);
stringMElements.put("1S", Collections.unmodifiableList(_1S));
ArrayList<Enum<?>> _1P = new ArrayList<Enum<?>>();
_1P.add(Person.FIRST);
_1P.add(Number.PLURAL);
stringMElements.put("1P", Collections.unmodifiableList(_1P));
ArrayList<Enum<?>> _2S = new ArrayList<Enum<?>>();
_2S.add(Person.SECOND);
_2S.add(Number.SINGULAR);
stringMElements.put("2S", Collections.unmodifiableList(_2S));
ArrayList<Enum<?>> _2P = new ArrayList<Enum<?>>();
_2P.add(Person.SECOND);
_2P.add(Number.PLURAL);
stringMElements.put("2P", Collections.unmodifiableList(_2P));
ArrayList<Enum<?>> _3S = new ArrayList<Enum<?>>();
_3S.add(Person.THIRD);
_3S.add(Number.SINGULAR);
stringMElements.put("3S", Collections.unmodifiableList(_3S));
ArrayList<Enum<?>> _3P = new ArrayList<Enum<?>>();
_3P.add(Person.THIRD);
_3P.add(Number.PLURAL);
stringMElements.put("3P", Collections.unmodifiableList(_3P));
ArrayList<Enum<?>> _13S = new ArrayList<Enum<?>>();
_13S.add(Person.FIRST_THIRD);
_13S.add(Number.SINGULAR);
stringMElements.put("1/3S", Collections.unmodifiableList(_13S));
ArrayList<Enum<?>> _3SP = new ArrayList<Enum<?>>();
_3SP.add(Person.THIRD);
_3SP.add(Number.NEUTRAL);
stringMElements.put("3S/P", Collections.unmodifiableList(_3SP));
ArrayList<Enum<?>> _013S = new ArrayList<Enum<?>>();
_013S.add(Person.NONE_FIRST_THIRD);
_013S.add(Number.SINGULAR);
stringMElements.put("0/1/3S", Collections.unmodifiableList(_013S));
/* Parse-only strings starting with '$': each sets the class to
 * PUNCTUATION_MARK together with a punctuation kind. */
// "$--" -> REL
ArrayList<Enum<?>> hifen = new ArrayList<Enum<?>>();
hifen.add(Class.PUNCTUATION_MARK);
hifen.add(Punctuation.REL);
stringMElements.put("$--", Collections.unmodifiableList(hifen));
// "$`" and "$´" -> BIN
ArrayList<Enum<?>> ap = new ArrayList<Enum<?>>();
ap.add(Class.PUNCTUATION_MARK);
ap.add(Punctuation.BIN);
stringMElements.put("$`", Collections.unmodifiableList(ap));
stringMElements.put("$´", Collections.unmodifiableList(ap));
// remaining '$' strings -> NSEP
ArrayList<Enum<?>> others = new ArrayList<Enum<?>>();
others.add(Class.PUNCTUATION_MARK);
others.add(Punctuation.NSEP);
stringMElements.put("$+", Collections.unmodifiableList(others));
stringMElements.put("$±", Collections.unmodifiableList(others));
stringMElements.put("$=", Collections.unmodifiableList(others));
stringMElements.put("$$", Collections.unmodifiableList(others));
stringMElements.put("$\\", Collections.unmodifiableList(others));
// "pron-det" and "pron-indp" are parse-only aliases of the generic PRONOUN class.
ArrayList<Enum<?>> pronouns = new ArrayList<Enum<?>>();
pronouns.add(Class.PRONOUN);
stringMElements.put("pron-det", Collections.unmodifiableList(pronouns));
stringMElements.put("pron-indp", Collections.unmodifiableList(pronouns));
// Upper-case "PP" is a parse-only alias of PREPOSITION.
ArrayList<Enum<?>> pp = new ArrayList<Enum<?>>();
pp.add(Class.PREPOSITION);
stringMElements.put("PP", Collections.unmodifiableList(pp));
MTAG_PARTS_ENUM = Collections.unmodifiableMap(stringMElements);
}
/**
 * Creates an interpreter. The lookup tables are static and already built by
 * the class initializer, so there is nothing to set up per instance.
 */
public FlorestaTagInterpreter() {
  super();
}
/**
 * Cache of parsed morphological tags, keyed by the normalised tag string
 * (trailing {@code "#-"} removed). Every access happens inside
 * {@code synchronized (cache)}; it is created with a size argument of 200.
 */
private final Cache cache = new Cache(200);

/**
 * Parses a morphological tag string into a {@link MorphologicalTag}.
 * <p>
 * A trailing {@code "#-"} is dropped, then the string is split on
 * {@code '#'} and {@code '='}. Each part is resolved through
 * {@code MTAG_PARTS_ENUM}; parts without a mapping go through a small set of
 * fallback rules (punctuation and a few phrase labels). Parts that still
 * cannot be interpreted are logged and ignored.
 * <p>
 * Results are cached. The cache is consulted <em>after</em> normalisation so
 * that the lookup key and the stored key are always the same string.
 *
 * @param tagString the tag text to parse; may be {@code null}
 * @return a tag owned by the caller (never the cached instance itself), or
 *         {@code null} when {@code tagString} is {@code null}
 */
public MorphologicalTag parseMorphologicalTag(String tagString) {
  if (tagString == null) {
    return null;
  }

  // Normalise first: the cache must be read and written with the same key.
  if (tagString.endsWith("#-")) {
    tagString = tagString.substring(0, tagString.length() - 2);
  }

  synchronized (cache) {
    Object cached = cache.get(tagString);
    if (cached != null) {
      // Hand out a copy so callers cannot modify the cached instance.
      return ((MorphologicalTag) cached).clone();
    }
  }

  MorphologicalTag m = new MorphologicalTag();
  for (String part : tagString.split("[#=]")) {
    if (part.length() == 0) {
      // Produced by adjacent or leading separators; carries no information.
      continue;
    }
    List<Enum<?>> mapped = MTAG_PARTS_ENUM.get(part);
    if (mapped != null) {
      for (Enum<?> value : mapped) {
        applyMappedPart(m, value);
      }
    } else {
      applyUnmappedPart(m, part);
    }
  }

  String rendered = m.toString();
  if (rendered == null || rendered.length() == 0) {
    LOGGER.error("Invalid MorphologicalTag: " + tagString);
  }

  synchronized (cache) {
    // Another thread may have stored the same key in the meantime.
    if (!cache.containsKey(tagString)) {
      cache.put(tagString, m.clone());
    }
  }
  return m;
}

/** Stores one mapped enum constant in the field of {@code target} that matches its type. */
private static void applyMappedPart(MorphologicalTag target, Enum<?> value) {
  if (value instanceof Class) {
    target.setClazz((Class) value);
  } else if (value instanceof Gender) {
    target.setGender((Gender) value);
  } else if (value instanceof Number) {
    target.setNumber((Number) value);
  } else if (value instanceof Case) {
    target.setCase((Case) value);
  } else if (value instanceof Person) {
    target.setPerson((Person) value);
  } else if (value instanceof Tense) {
    target.setTense((Tense) value);
  } else if (value instanceof Mood) {
    target.setMood((Mood) value);
  } else if (value instanceof Punctuation) {
    target.setPunctuation((Punctuation) value);
  }
}

/** Applies the fallback rules for a non-empty part that has no entry in {@code MTAG_PARTS_ENUM}. */
private void applyUnmappedPart(MorphologicalTag target, String part) {
  // Any single character, "--" and "..." are treated as punctuation marks.
  if (part.length() == 1 || "--".equals(part) || "...".equals(part)) {
    target.setClazz(Class.PUNCTUATION_MARK);
    if (",".equals(part)) {
      target.setPunctuation(Punctuation.NSEP);
    } else if (".".equals(part) || "!".equals(part) || "?".equals(part)) {
      target.setPunctuation(Punctuation.ABS);
    } else if ("(".equals(part) || ")".equals(part)) {
      target.setPunctuation(Punctuation.BIN);
    } else {
      target.setPunctuation(Punctuation.REL);
    }
    return;
  }
  if ("n:".equals(part) || "np".equals(part)) {
    target.setClazz(Class.NOUN);
  } else if ("pp".equals(part)) {
    target.setClazz(Class.PREPOSITION);
  } else if ("vp".equals(part)) {
    target.setClazz(Class.INFINITIVE_VERB);
  } else if (part.contains("<") || "P.vp".equals(part) || "GER".equals(part)) {
    // Known noise in the input; ignored on purpose.
    return;
  } else {
    // Report through the logger instead of printing to standard output.
    LOGGER.warn("Unknown morphological tag part: " + part);
  }
}
/**
 * Parses a chunk tag string into a {@link ChunkTag}.
 * <p>
 * The string is looked up in {@code CTAG_PARTS_ENUM}. When it is unknown
 * (or {@code null}) the tag falls back to {@link ChunkFunction#OTHER} and
 * the problem is reported at INFO level, the same level its guard checks
 * and the same level {@code parseSyntacticTag} uses.
 *
 * @param tagString the chunk tag text, for example {@code "B-NP"}
 * @return a new chunk tag; never {@code null}
 */
public ChunkTag parseChunkTag(String tagString) {
  ChunkTag ct = new ChunkTag();
  List<ChunkFunction> functions = CTAG_PARTS_ENUM.get(tagString);
  if (functions != null && !functions.isEmpty()) {
    ct.setChunkFunction(functions.get(0));
  } else {
    ct.setChunkFunction(ChunkFunction.OTHER);
    // Guard and call now use the same level (the call used to be error()).
    if (LOGGER.isInfoEnabled()) {
      LOGGER.info("Invalid ChunkTag: " + tagString);
    }
  }
  return ct;
}
/**
 * Parses a syntactic tag string into a {@link SyntacticTag}.
 * <p>
 * The string is looked up in {@code STAG_PARTS_ENUM}. When it is unknown
 * (or {@code null}) the tag falls back to {@link SyntacticFunction#NONE}
 * and the problem is reported at INFO level.
 *
 * @param tagString the syntactic tag text, for example {@code "SUBJ"}
 * @return a new syntactic tag; never {@code null}
 */
public SyntacticTag parseSyntacticTag(String tagString) {
  SyntacticTag st = new SyntacticTag();
  List<SyntacticFunction> functions = STAG_PARTS_ENUM.get(tagString);
  if (functions != null && !functions.isEmpty()) {
    st.setSyntacticFunction(functions.get(0));
  } else {
    st.setSyntacticFunction(SyntacticFunction.NONE);
    if (LOGGER.isInfoEnabled()) {
      // The message used to say "ChunkTag", a leftover from parseChunkTag.
      LOGGER.info("Invalid SyntacticTag: " + tagString);
    }
  }
  return st;
}
/**
 * Builds the string form of a morphological tag.
 * <p>
 * Parts are written in this order, each followed by {@code SEP}: class,
 * gender, tense, mood (here only when there is no tense), person/number,
 * case, mood (here only when there is a tense), punctuation. The final
 * separator is removed before returning.
 * <p>
 * Person is written only when a number is present too; the number text is
 * appended to it unless the person text already contains {@code "S"} or
 * {@code "P"}.
 *
 * @param tag the tag to serialize
 * @return the serialized text, or {@code null} when at most one character
 *         was produced (an error is logged when nothing was produced)
 */
public String serialize(MorphologicalTag tag) {
  final StringBuilder out = new StringBuilder();
  final boolean hasTense = tag.getTense() != null;
  final boolean hasMood = tag.getMood() != null;
  final boolean hasNumber = tag.getNumberE() != null;

  if (tag.getClazzE() != null) {
    out.append(serializer(tag.getClazzE())).append(SEP);
  }
  if (tag.getGenderE() != null) {
    out.append(serializer(tag.getGenderE())).append(SEP);
  }
  if (hasTense) {
    out.append(serializer(tag.getTense())).append(SEP);
  }
  if (hasMood && !hasTense) {
    out.append(serializer(tag.getMood())).append(SEP);
  }

  if (hasNumber) {
    if (tag.getPersonE() != null) {
      String person = serializer(tag.getPersonE());
      boolean personCarriesNumber = person.contains("S") || person.contains("P");
      out.append(person);
      if (!personCarriesNumber) {
        out.append(serializer(tag.getNumberE()));
      }
      out.append(SEP);
    } else {
      out.append(serializer(tag.getNumberE())).append(SEP);
    }
  }

  if (tag.getCase() != null) {
    out.append(serializer(tag.getCase())).append(SEP);
  }
  if (hasMood && hasTense) {
    out.append(serializer(tag.getMood())).append(SEP);
  }
  if (tag.getPunctuation() != null) {
    out.append(serializer(tag.getPunctuation())).append(SEP);
  }

  final int length = out.length();
  if (length == 0) {
    LOGGER.error("Unable to serialize MorphologicalTag: " + tag);
  }
  if (length <= 1) {
    return null;
  }
  // Drop the trailing separator.
  return out.substring(0, length - 1);
}
/**
 * Returns the tag string for the chunk function held by {@code tag}.
 *
 * @param tag the chunk tag to serialize
 * @return the mapped string, or {@code ""} when the function has no mapping
 */
public String serialize(ChunkTag tag) {
  return serializer(tag.getChunkFunction());
}
/**
 * Returns the tag string for the syntactic function held by {@code tag}.
 *
 * @param tag the syntactic tag to serialize
 * @return the mapped string, or {@code ""} when the function has no mapping
 */
public String serialize(SyntacticTag tag) {
  return serializer(tag.getSyntacticFunction());
}
/**
 * Returns the tag string mapped to a syntactic function.
 *
 * @return the mapped string, or {@code ""} when there is no mapping
 */
public String serialize(SyntacticFunction tag) {
return serializer(tag);
}
/**
 * Returns the tag string mapped to a chunk function.
 *
 * @return the mapped string, or {@code ""} when there is no mapping
 */
public String serialize(ChunkFunction tag) {
return serializer(tag);
}
/**
 * Returns the tag string mapped to a word class.
 *
 * @return the mapped string, or {@code ""} when there is no mapping
 */
public String serialize(Class tag) {
return serializer(tag);
}
/**
 * Returns the tag string mapped to a gender.
 *
 * @return the mapped string, or {@code ""} when there is no mapping
 */
public String serialize(Gender tag) {
return serializer(tag);
}
/**
 * Returns the tag string mapped to a number.
 *
 * @return the mapped string, or {@code ""} when there is no mapping
 */
public String serialize(Number tag) {
return serializer(tag);
}
/**
 * Returns the tag string mapped to a case.
 *
 * @return the mapped string, or {@code ""} when there is no mapping
 */
public String serialize(Case tag) {
return serializer(tag);
}
/**
 * Returns the tag string mapped to a person.
 *
 * @return the mapped string, or {@code ""} when there is no mapping
 */
public String serialize(Person tag) {
return serializer(tag);
}
/**
 * Returns the tag string mapped to a tense.
 *
 * @return the mapped string, or {@code ""} when there is no mapping
 */
public String serialize(Tense tag) {
return serializer(tag);
}
/**
 * Returns the tag string mapped to a mood.
 *
 * @return the mapped string, or {@code ""} when there is no mapping
 */
public String serialize(Mood tag) {
return serializer(tag);
}
/**
 * Returns the tag string mapped to a punctuation kind.
 *
 * @return the mapped string, or {@code ""} when there is no mapping
 */
public String serialize(Punctuation tag) {
return serializer(tag);
}
/**
 * Looks up the tag string of a morphological enum constant.
 *
 * @param value the constant to translate; may be {@code null}
 * @return the string from {@code ENUM_MTAG_PARTS}, or {@code ""} when the
 *         constant has no entry
 */
private String serializer(Enum<?> value) {
  // The map holds no null values, so a null result means "no entry".
  final String mapped = ENUM_MTAG_PARTS.get(value);
  return mapped != null ? mapped : "";
}
/**
 * Looks up the tag string of a syntactic function.
 *
 * @param value the function to translate; may be {@code null}
 * @return the string from {@code ENUM_STAG_PARTS}, or {@code ""} when the
 *         function has no entry
 */
private String serializer(SyntacticFunction value) {
  // The map holds no null values, so a null result means "no entry".
  final String mapped = ENUM_STAG_PARTS.get(value);
  return mapped != null ? mapped : "";
}
/**
 * Looks up the tag string of a chunk function.
 *
 * @param value the function to translate; may be {@code null}
 * @return the string from {@code ENUM_CTAG_PARTS}, or {@code ""} when the
 *         function has no entry
 */
private String serializer(ChunkFunction value) {
  // The map holds no null values, so a null result means "no entry".
  final String mapped = ENUM_CTAG_PARTS.get(value);
  return mapped != null ? mapped : "";
}
}