/**
* Copyright (C) 2012 cogroo <cogroo@cogroo.org>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cogroo.uima.interpreters;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import opennlp.tools.util.Cache;
import org.apache.log4j.Logger;
import br.usp.pcs.lta.cogroo.entity.impl.runtime.ChunkTag;
import br.usp.pcs.lta.cogroo.entity.impl.runtime.MorphologicalTag;
import br.usp.pcs.lta.cogroo.entity.impl.runtime.SyntacticTag;
import br.usp.pcs.lta.cogroo.tag.TagInterpreterI;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.Case;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.ChunkFunction;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.Class;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.Finiteness;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.Gender;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.Mood;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.Number;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.Person;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.Punctuation;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.SyntacticFunction;
import br.usp.pcs.lta.cogroo.tools.checker.rules.model.TagMask.Tense;
public class FlorestaTagInterpreter implements TagInterpreterI {
/** Morphological feature enum -> textual tag part used when serializing. */
private static final Map<Enum<?>, String> ENUM_MTAG_PARTS;
/**
 * Textual tag part -> morphological feature(s) it stands for. One part may
 * expand to several features (e.g. "v-fin" carries both a class and a
 * finiteness, "1S" both a person and a number).
 */
private static final Map<String, List<Enum<?>>> MTAG_PARTS_ENUM;
/** Chunk function -> textual chunk tag. */
private static final Map<ChunkFunction, String> ENUM_CTAG_PARTS;
/** Textual chunk tag -> chunk function (single-element lists). */
private static final Map<String, List<ChunkFunction>> CTAG_PARTS_ENUM;
/** Syntactic function -> textual syntactic tag. */
private static final Map<SyntacticFunction, String> ENUM_STAG_PARTS;
/** Textual syntactic tag -> syntactic function (single-element lists). */
private static final Map<String, List<SyntacticFunction>> STAG_PARTS_ENUM;
/** Logger used to report tags that cannot be parsed or serialized. */
protected static final Logger LOGGER = Logger
.getLogger(FlorestaTagInterpreter.class);
/** Separator between the parts of a serialized morphological tag. */
private static final String SEP = "=";
static {
  // ------------------------------------------------------------------
  // Chunk tags
  // ------------------------------------------------------------------
  Map<ChunkFunction, String> chunkToText = new HashMap<ChunkFunction, String>(7);
  chunkToText.put(ChunkFunction.OTHER, "O");
  chunkToText.put(ChunkFunction.BOUNDARY_NOUN_PHRASE, "B-NP");
  chunkToText.put(ChunkFunction.BOUNDARY_NOUN_PHRASE_MAIN, "*B-NP");
  chunkToText.put(ChunkFunction.INTERMEDIARY_NOUN_PHRASE, "I-NP");
  chunkToText.put(ChunkFunction.INTERMEDIARY_NOUN_PHRASE_MAIN, "*I-NP");
  chunkToText.put(ChunkFunction.BOUNDARY_VERB_PHRASE_MAIN, "*B-VP");
  chunkToText.put(ChunkFunction.INTERMEDIARY_VERB_PHRASE, "I-VP");
  ENUM_CTAG_PARTS = Collections.unmodifiableMap(chunkToText);

  Map<String, List<ChunkFunction>> textToChunk = invertMapping(ENUM_CTAG_PARTS, 7);
  CTAG_PARTS_ENUM = Collections.unmodifiableMap(textToChunk);

  // ------------------------------------------------------------------
  // Syntactic tags
  // ------------------------------------------------------------------
  Map<SyntacticFunction, String> syntacticToText = new HashMap<SyntacticFunction, String>(3);
  syntacticToText.put(SyntacticFunction.NONE, "-");
  syntacticToText.put(SyntacticFunction.SUBJECT, "SUBJ");
  syntacticToText.put(SyntacticFunction.VERB, "VERB");
  ENUM_STAG_PARTS = Collections.unmodifiableMap(syntacticToText);

  Map<String, List<SyntacticFunction>> textToSyntactic = invertMapping(ENUM_STAG_PARTS, 3);
  STAG_PARTS_ENUM = Collections.unmodifiableMap(textToSyntactic);

  // ------------------------------------------------------------------
  // Morphological tags: feature -> text
  // ------------------------------------------------------------------
  Map<Enum<?>, String> morphToText = new HashMap<Enum<?>, String>();

  // Class. ARTICLE ("art") and DETERMINER_PRONOUN ("pron-det") are left
  // unmapped because they were marked as collisions with DETERMINER;
  // VERB is left unmapped because its text depends on the finiteness (v-*).
  morphToText.put(Class.NOUN, "n");
  morphToText.put(Class.PROPER_NOUN, "prop");
  morphToText.put(Class.SPECIFIER, "pron-indp"); // uncertain mapping
  morphToText.put(Class.DETERMINER, "art"); // collision, see note above
  morphToText.put(Class.PERSONAL_PRONOUN, "pron-pers");
  morphToText.put(Class.PREPOSITION, "prp");
  morphToText.put(Class.ADJECTIVE, "adj");
  morphToText.put(Class.ADVERB, "adv");
  morphToText.put(Class.NUMERAL, "num");
  morphToText.put(Class.SUBORDINATING_CONJUNCTION, "conj-s");
  morphToText.put(Class.COORDINATING_CONJUNCTION, "conj-c");
  morphToText.put(Class.INTERJECTION, "intj");
  morphToText.put(Class.HYPHEN_SEPARATED_PREFIX, "ec");
  morphToText.put(Class.PUNCTUATION_MARK, "pnt"); // uncertain mapping
  morphToText.put(Class.UNIT, "uni"); // uncertain mapping

  // Gender
  morphToText.put(Gender.MALE, "M");
  morphToText.put(Gender.FEMALE, "F");
  morphToText.put(Gender.NEUTRAL, "M/F");

  // Number
  morphToText.put(Number.SINGULAR, "S");
  morphToText.put(Number.PLURAL, "P");
  morphToText.put(Number.NEUTRAL, "S/P");

  // Case
  morphToText.put(Case.ACCUSATIVE, "ACC");
  morphToText.put(Case.DATIVE, "DAT");
  morphToText.put(Case.NOMINATIVE, "NOM");
  morphToText.put(Case.PREPOSITIVE, "PIV");
  morphToText.put(Case.ACCUSATIVE_DATIVE, "ACC/DAT");
  morphToText.put(Case.NOMINATIVE_PREPOSITIVE, "NOM/PIV");

  // Person. The combined person+number texts ("1S", "3P", ...) only exist in
  // the text -> feature direction and are registered further below.
  morphToText.put(Person.FIRST, "1");
  morphToText.put(Person.SECOND, "2");
  morphToText.put(Person.THIRD, "3");
  morphToText.put(Person.FIRST_THIRD, "1/3S");
  morphToText.put(Person.NONE_FIRST_THIRD, "0/1/3S");

  // Tense
  morphToText.put(Tense.PRESENT, "PR");
  morphToText.put(Tense.PRETERITO_IMPERFEITO, "IMPF");
  morphToText.put(Tense.PRETERITO_PERFEITO, "PS");
  morphToText.put(Tense.PRETERITO_MAIS_QUE_PERFEITO, "MQP");
  morphToText.put(Tense.FUTURE, "FUT");
  morphToText.put(Tense.CONDITIONAL, "COND");
  morphToText.put(Tense.PRETERITO_PERFEITO_MAIS_QUE_PERFEITO, "PS/MQP");

  // Mood
  morphToText.put(Mood.INDICATIVE, "IND");
  morphToText.put(Mood.SUBJUNCTIVE, "SUBJ");
  morphToText.put(Mood.IMPERATIVE, "IMP");

  // Finiteness has no feature -> text entries; it is only reachable through
  // the combined "v-*" texts registered below.

  // Punctuation
  morphToText.put(Punctuation.ABS, "ABS");
  morphToText.put(Punctuation.NSEP, "NSEP");
  morphToText.put(Punctuation.BIN, "BIN");
  morphToText.put(Punctuation.REL, "REL");

  ENUM_MTAG_PARTS = Collections.unmodifiableMap(morphToText);

  // ------------------------------------------------------------------
  // Morphological tags: text -> feature(s)
  // ------------------------------------------------------------------
  // Start from the plain inversion, then add/override the texts that expand
  // to more than one feature or that have no feature -> text counterpart.
  Map<String, List<Enum<?>>> textToMorph = invertMapping(ENUM_MTAG_PARTS, 60);

  textToMorph.put("pron-det", partsOf(Class.DETERMINER));

  // Verb texts carry class and finiteness together.
  textToMorph.put("v-pcp", partsOf(Class.VERB, Finiteness.PARTICIPLE));
  textToMorph.put("v-inf", partsOf(Class.VERB, Finiteness.INFINITIVE));
  textToMorph.put("v-ger", partsOf(Class.VERB, Finiteness.GERUND));
  textToMorph.put("v-fin", partsOf(Class.VERB, Finiteness.FINITE));

  textToMorph.put("n-adj", partsOf(Class.ADJECTIVE));

  // Person texts carry person and number together. "1/3S" and "0/1/3S"
  // deliberately replace the single-feature entries produced by the inversion.
  textToMorph.put("1S", partsOf(Person.FIRST, Number.SINGULAR));
  textToMorph.put("1P", partsOf(Person.FIRST, Number.PLURAL));
  textToMorph.put("2S", partsOf(Person.SECOND, Number.SINGULAR));
  textToMorph.put("2P", partsOf(Person.SECOND, Number.PLURAL));
  textToMorph.put("3S", partsOf(Person.THIRD, Number.SINGULAR));
  textToMorph.put("3P", partsOf(Person.THIRD, Number.PLURAL));
  textToMorph.put("1/3S", partsOf(Person.FIRST_THIRD, Number.SINGULAR));
  textToMorph.put("3S/P", partsOf(Person.THIRD, Number.NEUTRAL));
  textToMorph.put("0/1/3S", partsOf(Person.NONE_FIRST_THIRD, Number.SINGULAR));

  // Punctuation texts that appear literally in the corpus.
  List<Enum<?>> relMark = partsOf(Class.PUNCTUATION_MARK, Punctuation.REL);
  List<Enum<?>> binMark = partsOf(Class.PUNCTUATION_MARK, Punctuation.BIN);
  List<Enum<?>> nsepMark = partsOf(Class.PUNCTUATION_MARK, Punctuation.NSEP);
  List<Enum<?>> absMark = partsOf(Class.PUNCTUATION_MARK, Punctuation.ABS);

  registerAll(textToMorph, relMark, "$--");
  registerAll(textToMorph, binMark, "$`", "$´");
  registerAll(textToMorph, nsepMark, ",", "$\\", "«", "»", "\"", "\'");
  registerAll(textToMorph, absMark, ".", "!", "?");
  registerAll(textToMorph, relMark, ";", ":", "...", "-");
  registerAll(textToMorph, binMark, "(", ")", "--");

  textToMorph.put("PP", partsOf(Class.PREPOSITION));

  MTAG_PARTS_ENUM = Collections.unmodifiableMap(textToMorph);
}

/**
 * Builds the reverse of a feature -> text table, wrapping every feature in a
 * read-only single-element list.
 *
 * @param forward the feature -> text table to invert
 * @param initialCapacity initial capacity of the returned map
 * @return a new, mutable text -> feature-list map
 */
private static <E> Map<String, List<E>> invertMapping(Map<E, String> forward,
    int initialCapacity) {
  Map<String, List<E>> inverted = new HashMap<String, List<E>>(initialCapacity);
  for (Map.Entry<E, String> entry : forward.entrySet()) {
    inverted.put(entry.getValue(), Collections.singletonList(entry.getKey()));
  }
  return inverted;
}

/**
 * Packs the given features, in order, into a read-only list.
 *
 * @param features the features one textual tag part expands to
 * @return an unmodifiable list holding exactly those features
 */
private static List<Enum<?>> partsOf(Enum<?>... features) {
  List<Enum<?>> collected = new ArrayList<Enum<?>>(features.length);
  for (Enum<?> feature : features) {
    collected.add(feature);
  }
  return Collections.unmodifiableList(collected);
}

/**
 * Registers the same feature list under every one of the given texts.
 *
 * @param target the text -> feature-list map being populated
 * @param features the feature list shared by all texts
 * @param texts the textual tag parts to register
 */
private static void registerAll(Map<String, List<Enum<?>>> target,
    List<Enum<?>> features, String... texts) {
  for (String text : texts) {
    target.put(text, features);
  }
}
/**
 * Creates an interpreter. The lookup tables are static and filled by the
 * class initializer, so the constructor itself has nothing to set up.
 */
public FlorestaTagInterpreter() {
}
// Earlier, unbounded variant kept for reference:
// private final Map<String, MorphologicalTag> cache = new HashMap<String,
// MorphologicalTag>();
// Per-instance cache of parsed morphological tags, keyed by the raw tag
// string. parseMorphologicalTag only touches it inside synchronized (cache)
// blocks and stores/returns clones, so cached instances are never handed out.
// NOTE(review): 200 is presumably the maximum number of cached entries -
// confirm against the opennlp.tools.util.Cache constructor.
private final Cache cache = new Cache(200);
/**
 * Parses a textual morphological tag (parts separated by {@code "="}) into a
 * {@link MorphologicalTag}.
 *
 * Results are cached per tag string; callers always receive an instance that
 * is distinct from the cached one (a clone on a cache hit, the freshly built
 * tag on a miss while a clone is stored).
 *
 * @param tagString the textual tag, may be {@code null}
 * @return the parsed tag, or {@code null} when {@code tagString} is
 *         {@code null}
 */
public MorphologicalTag parseMorphologicalTag(String tagString) {
  if (tagString == null) {
    return null;
  }

  synchronized (cache) {
    if (cache.containsKey(tagString)) {
      MorphologicalTag cached = (MorphologicalTag) cache.get(tagString);
      return cached.clone();
    }
  }

  MorphologicalTag parsed = new MorphologicalTag();
  for (String part : tagString.split(SEP)) {
    List<Enum<?>> features = MTAG_PARTS_ENUM.get(part);
    if (features == null) {
      applyUnmappedPart(parsed, part);
      continue;
    }
    for (Enum<?> feature : features) {
      applyFeature(parsed, feature);
    }
  }

  String rendered = parsed.toString();
  if (rendered == null || rendered.length() == 0) {
    LOGGER.error("Invalid MorphologicalTag: " + tagString);
  }

  synchronized (cache) {
    // Another thread may have parsed the same string in the meantime.
    if (!cache.containsKey(tagString)) {
      cache.put(tagString, parsed.clone());
    }
  }
  return parsed;
}

/**
 * Stores one feature in the slot of {@code target} that matches the
 * feature's enum type; features of any other type are ignored.
 */
private static void applyFeature(MorphologicalTag target, Enum<?> feature) {
  if (feature instanceof Class) {
    target.setClazz((Class) feature);
  } else if (feature instanceof Gender) {
    target.setGender((Gender) feature);
  } else if (feature instanceof Number) {
    target.setNumber((Number) feature);
  } else if (feature instanceof Case) {
    target.setCase((Case) feature);
  } else if (feature instanceof Person) {
    target.setPerson((Person) feature);
  } else if (feature instanceof Tense) {
    target.setTense((Tense) feature);
  } else if (feature instanceof Mood) {
    target.setMood((Mood) feature);
  } else if (feature instanceof Finiteness) {
    target.setFiniteness((Finiteness) feature);
  } else if (feature instanceof Punctuation) {
    target.setPunctuation((Punctuation) feature);
  }
}

/**
 * Handles a tag part that has no entry in the lookup table: a handful of
 * known spellings set the class, known garbage is dropped silently, and
 * anything else is reported with a warning.
 */
private static void applyUnmappedPart(MorphologicalTag target, String part) {
  if (part.length() == 1) {
    // Any single unknown character is taken to be a punctuation mark.
    target.setClazz(Class.PUNCTUATION_MARK);
    return;
  }
  if ("n:".equals(part) || "np".equals(part)) {
    target.setClazz(Class.NOUN);
    return;
  }
  if ("pp".equals(part)) {
    target.setClazz(Class.PREPOSITION);
    return;
  }
  if ("vp".equals(part)) {
    target.setClazz(Class.VERB);
    return;
  }
  boolean garbage = part.contains("<") || "P.vp".equals(part)
      || "GER".equals(part);
  if (!garbage) {
    LOGGER.warn("Could not parse tag: " + part);
  }
}
/**
 * Converts a textual chunk tag into a {@link ChunkTag}.
 *
 * @param tagString the textual chunk tag, e.g. {@code "B-NP"}
 * @return a tag carrying the matching chunk function; unknown texts are
 *         logged as errors and yield {@link ChunkFunction#OTHER}
 */
public ChunkTag parseChunkTag(String tagString) {
  ChunkTag result = new ChunkTag();
  List<ChunkFunction> matches = CTAG_PARTS_ENUM.get(tagString);

  if (matches == null || matches.isEmpty()) {
    result.setChunkFunction(ChunkFunction.OTHER);
    LOGGER.error("Invalid ChunkTag: " + tagString);
    return result;
  }

  result.setChunkFunction(matches.get(0));
  return result;
}
/**
 * Converts a textual syntactic tag into a {@link SyntacticTag}.
 *
 * @param tagString the textual syntactic tag, e.g. {@code "SUBJ"}
 * @return a tag carrying the matching syntactic function; unknown texts are
 *         logged as errors and yield {@link SyntacticFunction#NONE}
 */
public SyntacticTag parseSyntacticTag(String tagString) {
  SyntacticTag st = new SyntacticTag();
  List<SyntacticFunction> tag = STAG_PARTS_ENUM.get(tagString);
  if (tag != null && !tag.isEmpty()) {
    st.setSyntacticFunction(tag.get(0));
  } else {
    st.setSyntacticFunction(SyntacticFunction.NONE);
    // The message names the syntactic tag; it previously said "ChunkTag",
    // which made log entries point at the wrong parser.
    LOGGER.error("Invalid SyntacticTag: " + tagString);
  }
  return st;
}
/**
 * Serializes a {@link MorphologicalTag} into its textual form, with the
 * parts joined by {@code "="}.
 *
 * Parts are emitted in this order: class (verbs as {@code v-fin},
 * {@code v-ger}, {@code v-inf} or {@code v-pcp}), gender, tense, mood (here
 * only when there is no tense), person+number (or number alone), case, mood
 * (here when there is a tense), punctuation.
 *
 * @param tag the tag to serialize, must not be {@code null}
 * @return the textual tag without trailing separator, or {@code null} when
 *         nothing meaningful could be produced
 * @throws RuntimeException if the tag is a verb whose finiteness is missing
 *         or not one of the four supported values
 */
public String serialize(MorphologicalTag tag) {
  StringBuilder res = new StringBuilder();

  if (tag.getClazzE() != null) {
    if (Class.VERB.equals(tag.getClazzE())) {
      res.append(verbPrefix(tag.getFinitenessE())).append(SEP);
    } else {
      res.append(serializer(tag.getClazzE())).append(SEP);
    }
  }

  if (tag.getGenderE() != null) {
    res.append(serializer(tag.getGenderE())).append(SEP);
  }

  if (tag.getTense() != null) {
    res.append(serializer(tag.getTense())).append(SEP);
  }

  // Without a tense the mood goes here; with a tense it goes after the case.
  if (tag.getMood() != null && tag.getTense() == null) {
    res.append(serializer(tag.getMood())).append(SEP);
  }

  if (tag.getPersonE() != null && tag.getNumberE() != null) {
    String s = serializer(tag.getPersonE());
    // Person texts such as "1/3S" already include the number.
    if (!(s.contains("S") || s.contains("P"))) {
      res.append(s).append(serializer(tag.getNumberE())).append(SEP);
    } else {
      res.append(s).append(SEP);
    }
  } else if (tag.getNumberE() != null) {
    res.append(serializer(tag.getNumberE())).append(SEP);
  }

  if (tag.getCase() != null) {
    res.append(serializer(tag.getCase())).append(SEP);
  }

  if (tag.getMood() != null && tag.getTense() != null) {
    res.append(serializer(tag.getMood())).append(SEP);
  }

  if (tag.getPunctuation() != null) {
    res.append(serializer(tag.getPunctuation())).append(SEP);
  }

  if (res.length() == 0) {
    LOGGER.error("Unable to serialize MorphologicalTag: " + tag);
  }

  // Every appended part ends with SEP, so a length of 0 or 1 means there is
  // no actual content; otherwise drop the trailing separator.
  if (res.length() > 1) {
    return res.substring(0, res.length() - 1);
  } else {
    return null;
  }
}

/**
 * Maps a verb's finiteness to its textual class prefix.
 *
 * The comparison is null-safe so that a verb without finiteness fails with
 * the descriptive exception below instead of a bare NullPointerException.
 *
 * @throws RuntimeException if {@code finiteness} is {@code null} or not one
 *         of FINITE, GERUND, INFINITIVE, PARTICIPLE
 */
private static String verbPrefix(Finiteness finiteness) {
  if (finiteness == Finiteness.FINITE) {
    return "v-fin";
  }
  if (finiteness == Finiteness.GERUND) {
    return "v-ger";
  }
  if (finiteness == Finiteness.INFINITIVE) {
    return "v-inf";
  }
  if (finiteness == Finiteness.PARTICIPLE) {
    return "v-pcp";
  }
  throw new RuntimeException("Missing finiteness");
}
/**
 * Serializes the chunk function carried by a {@link ChunkTag}.
 *
 * @param tag the chunk tag, must not be {@code null}
 * @return the textual chunk tag, or {@code ""} when the function has no
 *         textual form
 */
public String serialize(ChunkTag tag) {
  return serializer(tag.getChunkFunction());
}
/**
 * Serializes the syntactic function carried by a {@link SyntacticTag}.
 *
 * @param tag the syntactic tag, must not be {@code null}
 * @return the textual syntactic tag, or {@code ""} when the function has no
 *         textual form
 */
public String serialize(SyntacticTag tag) {
  return serializer(tag.getSyntacticFunction());
}
/**
 * Returns the textual form of a syntactic function, or {@code ""} when it has
 * none in the syntactic table.
 */
public String serialize(SyntacticFunction tag) {
return serializer(tag);
}
/**
 * Returns the textual form of a chunk function, or {@code ""} when it has
 * none in the chunk table.
 */
public String serialize(ChunkFunction tag) {
return serializer(tag);
}
/**
 * Returns the textual form of a morphological class, or {@code ""} when it
 * has none. Note that {@code Class.VERB} has no entry in the morphological
 * table (verbs are written as v-fin/v-inf/v-pcp/v-ger when a whole tag is
 * serialized), so it yields {@code ""} here.
 */
public String serialize(Class tag) {
return serializer(tag);
}
/** Returns the textual form of a gender, or {@code ""} when it has none. */
public String serialize(Gender tag) {
return serializer(tag);
}
/** Returns the textual form of a number, or {@code ""} when it has none. */
public String serialize(Number tag) {
return serializer(tag);
}
/** Returns the textual form of a case, or {@code ""} when it has none. */
public String serialize(Case tag) {
return serializer(tag);
}
/** Returns the textual form of a person, or {@code ""} when it has none. */
public String serialize(Person tag) {
return serializer(tag);
}
/** Returns the textual form of a tense, or {@code ""} when it has none. */
public String serialize(Tense tag) {
return serializer(tag);
}
/** Returns the textual form of a mood, or {@code ""} when it has none. */
public String serialize(Mood tag) {
return serializer(tag);
}
/**
 * Looks the finiteness up in the morphological table. No Finiteness value is
 * registered there (the entries are commented out in the static
 * initializer), so this currently yields {@code ""} for every input.
 */
public String serialize(Finiteness tag) {
return serializer(tag);
}
/**
 * Returns the textual form of a punctuation kind, or {@code ""} when it has
 * none.
 */
public String serialize(Punctuation tag) {
return serializer(tag);
}
/**
 * Looks up the textual form of a morphological feature.
 *
 * @param value the feature to look up
 * @return the mapped text, or {@code ""} when the feature is not in the
 *         morphological table
 */
private String serializer(Enum<?> value) {
  String text = ENUM_MTAG_PARTS.get(value);
  return text != null ? text : "";
}
/**
 * Looks up the textual form of a syntactic function.
 *
 * @param value the function to look up
 * @return the mapped text, or {@code ""} when the function is not in the
 *         syntactic table
 */
private String serializer(SyntacticFunction value) {
  String text = ENUM_STAG_PARTS.get(value);
  return text != null ? text : "";
}
/**
 * Looks up the textual form of a chunk function.
 *
 * @param value the function to look up
 * @return the mapped text, or {@code ""} when the function is not in the
 *         chunk table
 */
private String serializer(ChunkFunction value) {
  String text = ENUM_CTAG_PARTS.get(value);
  return text != null ? text : "";
}
// public static void main(String[] args) throws IOException {
// ADPOSSampleStream sampleStream = new ADPOSSampleStream(
// CmdLineUtil.openInFile(new
// File("/Users/wcolen/Documents/wrks/corpus/FlorestaVirgem/FlorestaVirgem_CF_3.0_ad.txt")),
// "ISO-8859-1");
// FlorestaTagInterpreter inter = new FlorestaTagInterpreter();
// POSSample sample = sampleStream.read();
// while(sample != null) {
// for(int i = 0; i < sample.getTags().length; i++) {
// String ori = sample.getTags()[i];
// String con = inter.serialize(inter.parseMorphologicalTag(ori));
// if(con == null) {
// System.err.println("null: " + ori);
// } else if(!con.equals(ori) && !con.contains("pnt") &&
// !ori.contains("n-adj")) {
// System.out.println(ori + " > " + con);
// }
// }
// sample = sampleStream.read();
// }
// }
}