package edu.stanford.nlp.international.spanish.pipeline;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.*;
import java.util.*;
import java.util.regex.Pattern;
import edu.stanford.nlp.international.spanish.SpanishVerbStripper;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeNormalizer;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory;
import edu.stanford.nlp.trees.international.spanish.SpanishTreeNormalizer;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
/**
* Clean up an AnCora treebank which has been processed to expand multi-word
* tokens into separate leaves. (This prior splitting task is performed by
* {@link SpanishTreeNormalizer} through the
* {@link edu.stanford.nlp.trees.international.spanish.SpanishXMLTreeReader}
* class.)
*
* @author Jon Gauthier
* @author Spence Green (original French version)
*/
public final class MultiWordPreprocessor {
/** Redwood logging channel for this class. */
private static final Redwood.RedwoodChannels log = Redwood.channels(MultiWordPreprocessor.class);

/** Number of pre-terminals seen by {@code traverseAndFix} that still carried the multi-word placeholder tag. */
private static int nMissingPOS;

/** Number of constituents seen by {@code traverseAndFix} whose label started with the multi-word phrase placeholder. */
private static int nMissingPhrasal;

/** Number of placeholder pre-terminals for which a part-of-speech tag could be inferred. */
private static int nFixedPOS;

/** Number of placeholder constituents for which a phrasal category could be inferred. */
private static int nFixedPhrasal;
/**
 * Maps the part-of-speech tag of a multi-word token to the label that the
 * constituent heading its split expression should receive.
 *
 * For instance, because the pair (rg, grup.adv) is registered here,
 *
 *     (rg cerca_de)
 *
 * is eventually rewritten as
 *
 *     (grup.adv (rg cerca) (sp000 de))
 */
private static final Map<String, String> phrasalCategoryMap = new HashMap<>();

static {
  // Each row reads: { phrasal category, POS tag, POS tag, ... }
  final String[][] categoryRows = {
    {"grup.a", "ao0000", "aq0000", "aqo000"},
    {"spec", "da0000", "dn0000", "dt0000"},
    {"sn", "di0000"},
    {"interjeccio", "i", "i00"},
    {"grup.adv", "rg", "rn"}, // rn: "no sólo"
    {"grup.verb",
      "vaip000", "vmg0000", "vmic000", "vmii000", "vmif000", "vmip000",
      "vmis000", "vmm0000", "vmn0000", "vmp0000", "vmsi000", "vmsp000"},
    {"grup.nom", "zm"},

    // New groups (not from AnCora specification)
    {"grup.cc", "cc"},
    {"grup.cs", "cs"},
    {"grup.nom", "pn000000"},
    {"grup.pron", "pi000000", "pr000000", "pt000000", "px000000"},
    {"grup.prep", "sp000"},
    {"grup.w", "w"},
    {"grup.z", "z", "z0", "zp", "zu"},
  };

  for (String[] row : categoryRows) {
    final String category = row[0];
    for (int i = 1; i < row.length; i++) {
      phrasalCategoryMap.put(row[i], category);
    }
  }
}
private static class ManualUWModel {
/**
 * Hand-assigned part-of-speech tags, keyed by exact (case-sensitive) word
 * form. Consulted by {@code getTag} for words that contain no digit.
 */
private static final Map<String, String> posMap = new HashMap<>();

/** Registers each of {@code words} in {@code posMap} with the given {@code tag}. */
private static void assignTag(String tag, String... words) {
  for (String word : words) {
    posMap.put(word, tag);
  }
}

static {
  // i.e., "metros cúbicos"
  assignTag("aq0000",
      "cúbico", "cúbicos", "diagonal", "diestro",
      "llevados", // llevados a cabo
      "llevadas", // llevadas a cabo
      "menudo", "obstante",
      "rapadas", // cabezas rapadas
      "rasa", "súbito", "temática");
  assignTag("px000000", "tuya");

  // foreign words
  assignTag("nc0s000",
      "alter", "ego", "Jet", "lag", "line", "lord", "model",
      "mortem", // post-mortem
      "pater", // pater familias
      "pipe", "play", "pollastre", "post", "power", "priori", "rock", "roll",
      "salubritatis", "savoir", "service", "status", "stem", "street", "task",
      "trio", "zigzag");

  // foreign words (invariable)
  assignTag("nc0n000", "mass", "media");

  // foreign words (plural)
  assignTag("nc0p000", "options");

  // compound words, other invariables
  assignTag("nc0n000",
      "regañadientes",
      "sabiendas"); // a sabiendas (de)

  // common gender
  assignTag("nc0s000", "virgen");

  assignTag("ncfs000", "merced", "miel", "torera", "ultranza", "vísperas");

  assignTag("ncms000",
      "acecho", "alzamiento", "bordo", "cápita", "ciento", "cuño", "pairo",
      "pese", // pese a
      "pique", "pos", "postre", "pro", "ralentí", "ras", "rebato", "torno",
      "través");

  assignTag("ncfp000", "creces", "cuestas", "oídas", "tientas", "trizas", "veras");
  assignTag("ncmp000", "abuelos", "ambages", "modos", "pedazos");

  assignTag("sps00", "A");
  assignTag("rg", "amén"); // amén de

  assignTag("vmg0000", "Bailando", "Soñando", "Teniendo");
  assignTag("vmif000", "echaremos");
  assignTag("vmii000", "formaba", "Formabas");
  assignTag("vmip000", "Forman", "perece", "PONE");
  assignTag("vmm0000", "suicídate");
  assignTag("vmn0000", "tardar");

  assignTag("z0", "seiscientas", "trescientas");
  assignTag("zu", "cc", "km", "kms");
}

// Must stay below the static initializer above: it captures the size of the
// fully populated map.
private static int nUnknownWordTypes = posMap.size();
// One or more consecutive digits. getTag() applies this with find(), so any
// word that contains a digit anywhere is matched.
private static final Pattern digit = Pattern.compile("\\d+");
// Word ending in -ado / -ada / -ido / -ida (participle-like ending); getTag()
// applies this with find().
private static final Pattern participle = Pattern.compile("[ai]d[oa]$");
/**
* Names which would be mistakenly marked as function words by
* unigram tagger (and which never appear as function words in
* multi-word tokens).
*
* getOverrideTag() tags any word found in this set as a proper noun
* ("np00000"). Lookup is case-sensitive.
*/
private static final Set<String> actuallyNames = new HashSet<>(Arrays.asList(
"Avenida",
"Contra",
"Gracias", // interjection
"in", // preposition; only appears in corpus as "in extremis" (preposition)
"Mercado",
"Jesús", // interjection
"Salvo",
"Van" // verb
));
// Name-looking word that isn't "Al": either "Al" followed by at least one
// more word character, "A" followed by any single character other than 'l'
// (note: [^l] also accepts non-letters such as a space), or a capital B-Z
// followed by at least one word character. getOverrideTag() searches the
// containing phrase with this (find()) to decide whether "Al" is part of a name.
private static final Pattern otherNamePattern = Pattern.compile("\\b(Al\\w+|A[^l]\\w*|[B-Z]\\w+)");
// Name-looking word that isn't "A": a capital A-Z followed by at least one
// more word character, so a bare "A" never matches. getOverrideTag() searches
// the containing phrase with this (find()) when tagging the word "A".
private static final Pattern otherNamePattern2 = Pattern.compile("\\b(A\\w+|[B-Z]\\w+)");
// Determiners which may also appear as pronouns: tod-/otr-/un- followed by
// o or a and an optional plural s (e.g. "todo", "otras", "unos"). Applied to
// the whole word with matches() in getOverrideTag().
private static final Pattern pPronounDeterminers = Pattern.compile("(tod|otr|un)[oa]s?");
/**
 * Returns a hand-written part-of-speech tag for {@code word} in contexts
 * that need special treatment before any other tagging strategy is tried.
 *
 * The rules below are evaluated in order; the first one that applies
 * determines the result.
 *
 * @param word the word to tag; must not be null
 * @param containingPhrase text of the phrase in which the word occurs, or
 *        null if it is not known
 * @return the override tag, or null if {@code containingPhrase} is null or
 *         no rule applies to this word/phrase combination
 */
public static String getOverrideTag(String word, String containingPhrase) {
  if (containingPhrase == null)
    return null;

  if (word.equalsIgnoreCase("este") && !containingPhrase.startsWith(word))
    return "np00000";
  if (word.equals("contra")
      && (containingPhrase.startsWith("en contra") || containingPhrase.startsWith("En contra")))
    return "nc0s000";
  if (word.equals("total") && containingPhrase.startsWith("ese"))
    return "nc0s000";
  if (word.equals("DEL"))
    // Uses of "Del" in corpus are proper nouns, but uses of "DEL" are
    // prepositions.. convenient for our purposes
    return "sp000";
  // The phrase alternatives are grouped explicitly. Without the inner
  // parentheses `&&` binds tighter than `||`, so *every* word of a phrase
  // containing "fuera de sí" (e.g. "fuera", "de") was tagged as a pronoun.
  if (word.equals("sí")
      && (containingPhrase.contains("por sí") || containingPhrase.contains("fuera de sí")))
    return "pp000000";
  if (pPronounDeterminers.matcher(word).matches() && containingPhrase.endsWith(word))
    // Determiners tailing a phrase are pronouns: "sobre todo," "al otro", etc.
    return "pi000000";
  if (word.equals("cuando") && containingPhrase.endsWith(word))
    return "pi000000";
  if (word.equalsIgnoreCase("contra") && containingPhrase.endsWith(word))
    return "nc0s000";
  if (word.equals("salvo") && containingPhrase.endsWith("salvo"))
    return "aq0000";
  if (word.equals("mira") && containingPhrase.endsWith(word))
    return "nc0s000";
  if (word.equals("pro") && containingPhrase.startsWith("en pro"))
    return "nc0s000";
  if (word.equals("espera") && containingPhrase.endsWith("espera de"))
    return "nc0s000";
  if (word.equals("Paso") && containingPhrase.equals("El Paso"))
    return "np00000";
  // contains("por medio") already covers the phrase ending in "por medio",
  // so no separate endsWith check is needed for it.
  if (word.equals("medio")
      && (containingPhrase.endsWith("medio de")
          || containingPhrase.endsWith("ambiente")
          || containingPhrase.contains("por medio")
          || containingPhrase.endsWith("medio")))
    return "nc0s000";
  if (word.equals("Medio") && containingPhrase.contains("Ambiente"))
    return "nc0s000";
  if (word.equals("Medio") && containingPhrase.equals("Oriente Medio"))
    return "aq0000";
  if (word.equals("media") && containingPhrase.equals("mass media"))
    return "nc0n000";
  if (word.equals("cuenta")) // tomar en cuenta, darse cuenta de, ...
    return "nc0s000";
  if (word.equals("h") && containingPhrase.startsWith("km"))
    return "zu";
  if (word.equals("A")
      && (containingPhrase.contains("-")
          || containingPhrase.contains(",")
          || otherNamePattern2.matcher(containingPhrase).find()
          || containingPhrase.equals("terminal A")))
    return "np00000";
  if (word.equals("forma") && containingPhrase.startsWith("forma parte"))
    return "vmip000";
  if (word.equals("Sin") && containingPhrase.contains("Jaime"))
    return "np00000";
  if (word.equals("di") && containingPhrase.contains("di cuenta"))
    return "vmis000";
  if (word.equals("demos") && containingPhrase.contains("demos cuenta"))
    return "vmsp000";
  if ((word.equals("van") || word.equals("den")) && containingPhrase.contains("van den"))
    return "np00000";

  if (word.equals("Al")) {
    // "Al" is sometimes a part of name phrases: Arabic names, Al Gore, etc.
    // Mark it a noun if its containing phrase has some other capitalized word
    return otherNamePattern.matcher(containingPhrase).find() ? "np00000" : "sp000";
  }

  if (actuallyNames.contains(word))
    return "np00000";

  if (word.equals("sino") && containingPhrase.endsWith(word))
    return "nc0s000";
  if (word.equals("mañana") || word.equals("paso") || word.equals("monta")
      || word.equals("deriva") || word.equals("visto"))
    return "nc0s000";
  if (word.equals("frente") && containingPhrase.startsWith("al frente"))
    return "nc0s000";

  return null;
}
/**
 * Match phrases for which unknown words should be assumed to be
 * common nouns
 *
 * - a trancas y barrancas
 * - en vez de, en pos de
 * - sin embargo
 * - merced a
 * - pese a que
 *
 * Every alternative contains a space, so this pattern is only meaningful
 * when searched for (find()) inside a whole phrase, never a single word.
 */
private static final Pattern commonPattern =
    Pattern.compile("^al? |^en .+ de$|sin | al?$| que$",
                    Pattern.CASE_INSENSITIVE);

/**
 * Guesses a part-of-speech tag for a word using, in order: a few exact
 * symbol matches, a digit check, the manual {@code posMap} table, a
 * participle-ending heuristic, and finally the shape of the containing
 * phrase.
 *
 * @param word the word to tag; must not be null
 * @param containingPhrase text of the phrase in which the word occurs; may
 *        be null, in which case the phrase-based heuristic is skipped
 * @return a tag; never null ("np00000" is the final fallback)
 */
public static String getTag(String word, String containingPhrase) {
  // Exact matches
  if (word.equals("%"))
    return "ft";
  if (word.equals("+"))
    return "fz";
  if (word.equals("&") || word.equals("@"))
    return "f0";

  if (digit.matcher(word).find())
    return "z0";

  String manualTag = posMap.get(word);
  if (manualTag != null)
    return manualTag;

  // Fallbacks
  if (participle.matcher(word).find())
    return "aq0000";

  // One last hint: is the phrase one which we have designated to
  // contain mostly common nouns?
  //
  // The pattern describes phrases, so it has to be searched for in the
  // containing phrase. Applying it to the single word with matches() could
  // never succeed and left this heuristic dead.
  if (containingPhrase != null && commonPattern.matcher(containingPhrase).find())
    return "ncms000";

  // Now make an educated guess.
  return "np00000";
}
}
/**
 * Feeds the (word, tag) pairs of the given tree into a unigram tagger.
 *
 * Tokens whose tag is the multi-word placeholder
 * ({@code SpanishTreeNormalizer.MW_TAG}) are skipped, so the tagger only
 * learns from tags that are actually present in the data.
 *
 * @param tagger counter indexed by word, then tag; incremented in place
 * @param t tree whose tagged yield is read
 */
public static void updateTagger(TwoDimensionalCounter<String,String> tagger,
                                Tree t) {
  for (CoreLabel token : t.taggedLabeledYield()) {
    final String tag = token.tag();
    if (!tag.equals(SpanishTreeNormalizer.MW_TAG)) {
      tagger.incrementCount(token.word(), tag);
    }
  }
}
/**
 * Walks the tree rooted at {@code t} and replaces placeholder labels left
 * behind by multi-word token expansion.
 *
 * Pre-terminals labelled with {@code SpanishTreeNormalizer.MW_TAG} receive
 * an inferred part-of-speech tag; after all children have been handled
 * (post-order), constituents whose label starts with
 * {@code SpanishTreeNormalizer.MW_PHRASE_TAG} receive an inferred phrasal
 * category. Labels for which nothing can be inferred are left unchanged.
 * The class-level missing/fixed counters are updated along the way.
 *
 * @param t subtree to process; modified in place
 * @param parent parent of {@code t}, or null when {@code t} is the root
 * @param unigramTagger word-to-tag counts used for POS inference
 * @param retainNER passed through to phrasal category inference
 */
public static void traverseAndFix(Tree t,
                                  Tree parent,
                                  TwoDimensionalCounter<String, String> unigramTagger,
                                  boolean retainNER) {
  if (!t.isPreTerminal()) {
    for (Tree child : t.children()) {
      traverseAndFix(child, t, unigramTagger, retainNER);
    }

    // Post-order visit
    if (!t.value().startsWith(SpanishTreeNormalizer.MW_PHRASE_TAG)) {
      return;
    }
    nMissingPhrasal++;
    final String category = inferPhrasalCategory(t, retainNER);
    if (category != null) {
      t.setValue(category);
      nFixedPhrasal++;
    }
    return;
  }

  // Pre-terminal: only placeholder tags need attention
  if (!t.value().equals(SpanishTreeNormalizer.MW_TAG)) {
    return;
  }
  nMissingPOS++;
  final String inferredTag = inferPOS(t, parent, unigramTagger);
  if (inferredTag != null) {
    t.setValue(inferredTag);
    nFixedPOS++;
  }
}
/**
 * Get a string representation of the immediate phrase which contains the
 * given node: the values of the labels in the parent's yield, joined by
 * single spaces.
 *
 * @param t the node of interest (not consulted; the phrase is derived from
 *        {@code parent} alone)
 * @param parent the parent of {@code t}, or null if there is none
 * @return the space-separated yield of {@code parent}; null if
 *         {@code parent} is null; the empty string if the yield is empty
 */
private static String getContainingPhrase(Tree t, Tree parent) {
  if (parent == null)
    return null;

  StringBuilder containingPhrase = new StringBuilder();
  boolean first = true;
  for (Label l : parent.yield()) {
    // Emit the separator *before* every element but the first. This avoids
    // trimming a trailing space afterwards, which threw a
    // StringIndexOutOfBoundsException when the yield was empty.
    if (!first)
      containingPhrase.append(' ');
    containingPhrase.append(l.value());
    first = false;
  }
  return containingPhrase.toString();
}
/** Shared verb stripper used to test the "verb plus clitic pronouns" hypothesis. */
private static final SpanishVerbStripper verbStripper = SpanishVerbStripper.getInstance();

/**
 * Attempt to infer the part of speech of the given preterminal node, which
 * was created during the expansion of a multi-word token.
 *
 * Strategies, in order of precedence:
 * 1. manual overrides for a few special cases;
 * 2. the word treated as a verb with its clitic pronouns stripped, if the
 *    stripped form is known to the unigram tagger and most often a verb;
 * 3. the most frequent tag of the word itself in the unigram tagger
 *    (ties resolved by {@code POSTieBreaker});
 * 4. the manual unknown-word model.
 *
 * @param t preterminal whose first child holds the word
 * @param parent parent of {@code t}; may be null
 * @param unigramTagger word-to-tag counts
 * @return the inferred tag
 */
private static String inferPOS(Tree t, Tree parent,
                               TwoDimensionalCounter<String, String> unigramTagger) {
  final String word = t.firstChild().value();
  final String phrase = getContainingPhrase(t, parent);

  // Overrides: let the manual POS model handle a few special cases first
  final String manualOverride = ManualUWModel.getOverrideTag(word, phrase);
  if (manualOverride != null) {
    return manualOverride;
  }

  final Set<String> knownWords = unigramTagger.firstKeySet();

  // Verb hypothesis: strip any clitic pronouns and see whether the
  // remaining stem is known to the tagger as a verb
  final SpanishVerbStripper.StrippedVerb stripped = verbStripper.separatePronouns(word);
  if (stripped != null) {
    final String stem = stripped.getStem();
    if (knownWords.contains(stem)) {
      final String stemTag = Counters.argmax(unigramTagger.getCounter(stem));
      if (stemTag.startsWith("v")) {
        return stemTag;
      }
    }
  }

  if (knownWords.contains(word)) {
    return Counters.argmax(unigramTagger.getCounter(word), new POSTieBreaker());
  }

  return ManualUWModel.getTag(word, phrase);
}
/**
 * Resolves "ties" between candidate part-of-speech tags encountered by the
 * unigram tagger.
 *
 * The only policy is that a noun tag (one starting with "n") orders before
 * any non-noun tag; all other pairs compare as equal.
 */
private static class POSTieBreaker implements Comparator<String> {
  /**
   * @return -1 if only {@code o1} is a noun tag, 1 if only {@code o2} is,
   *         and 0 otherwise
   */
  @Override
  public int compare(String o1, String o2) {
    final boolean firstIsNoun = o1.startsWith("n");
    final boolean secondIsNoun = o2.startsWith("n");

    if (firstIsNoun == secondIsNoun) {
      // Both nouns or both non-nouns: no other policies at the moment
      return 0;
    }
    // Prefer nouns over everything
    return firstIsNoun ? -1 : 1;
  }
}
/**
 * Attempt to infer the phrasal category of the given node, which
 * heads words which were expanded from a multi-word token.
 *
 * The part-of-speech tag of the original multi-word token is read from the
 * node label (everything after the last underscore) and looked up in
 * {@code phrasalCategoryMap}. Tags starting with 'n' that are not in the map
 * become nominal groups, optionally refined by the last character of the
 * label when {@code retainNER} is set.
 *
 * @param t the placeholder constituent
 * @param retainNER whether to keep the NER-specific nominal group variants
 * @return the inferred category, or null (after logging) if none was found
 */
private static String inferPhrasalCategory(Tree t, boolean retainNER) {
  final String label = t.value();

  // Retrieve the part-of-speech assigned to the original multi-word token
  final String originalPos = label.substring(label.lastIndexOf('_') + 1);

  final String mappedCategory = phrasalCategoryMap.get(originalPos);
  if (mappedCategory != null) {
    return mappedCategory;
  }

  if (!originalPos.isEmpty() && originalPos.charAt(0) == 'n') {
    // TODO may lead to some funky trees if a child somehow gets an
    // incorrect tag -- e.g. we may have a `grup.nom` head a `vmis000`
    if (!retainNER) {
      return "grup.nom";
    }

    final char nerTag = label.charAt(label.length() - 1);
    switch (nerTag) {
      case 'l': return "grup.nom.lug";
      case 'o': return "grup.nom.org";
      case 'p': return "grup.nom.pers";
      case '0': return "grup.nom.otros";
      default:  return "grup.nom";
    }
  }

  // Nothing matched: report the part-of-speech sequence formed by the
  // constituents so the gap can be diagnosed
  final StringBuilder childTags = new StringBuilder();
  for (Tree child : t.children()) {
    childTags.append(child.value()).append(" ");
  }
  final String posSequence = childTags.toString().trim();
  log.info("No phrasal cat for: " + posSequence + " (original POS of MWE: " + originalPos + ")");

  // Give up.
  return null;
}
/**
 * Reads every tree from {@code treeFile}, replaces the placeholder labels
 * left by multi-word token expansion, expands the resulting phrases and
 * writes one tree per line to a sibling file named
 * {@code <treeFile>.fixed} (UTF-8).
 *
 * I/O failures are reported on standard error via a stack trace; they do
 * not propagate to the caller.
 *
 * @param treeFile treebank file to read (UTF-8)
 * @param unigramTagger word-to-tag counts used for POS inference
 * @param retainNER whether NER-specific phrasal categories are kept
 * @param tn normalizer applied to each output tree, or null to skip
 *        whole-tree normalization
 */
private static void resolveDummyTags(File treeFile,
                                     TwoDimensionalCounter<String, String> unigramTagger,
                                     boolean retainNER, TreeNormalizer tn) {
  TreeFactory tf = new LabeledScoredTreeFactory();
  MultiWordTreeExpander expander = new MultiWordTreeExpander();
  TreeReaderFactory trf = new SpanishTreeReaderFactory();
  File outputFile = new File(treeFile + ".fixed");

  // try-with-resources guarantees that the reader and the writer are closed
  // even when reading or processing a tree fails part-way through.
  try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
       TreeReader tr = trf.newTreeReader(br);
       PrintWriter pw = new PrintWriter(new PrintStream(new FileOutputStream(outputFile), false, "UTF-8"))) {

    int nTrees = 0;
    for (Tree t; (t = tr.readTree()) != null; nTrees++) {
      traverseAndFix(t, null, unigramTagger, retainNER);

      // Now "decompress" further the expanded trees formed by
      // multiword token splitting
      t = expander.expandPhrases(t, tn, tf);

      if (tn != null)
        t = tn.normalizeWholeTree(t, tf);

      pw.println(t.toString());
    }

    // PrintWriter never throws on write errors; ask it explicitly so that a
    // truncated output file does not go unnoticed.
    if (pw.checkError())
      throw new IOException("Error while writing " + outputFile);

    System.out.println("Processed " +nTrees+ " trees");
  } catch (IOException e) {
    // Also covers UnsupportedEncodingException and FileNotFoundException,
    // which are both subclasses of IOException.
    e.printStackTrace();
  }
}
/**
 * Builds the command-line help text: a usage line naming this class,
 * followed by one line per supported option. Every line is terminated by
 * the platform line separator.
 *
 * @return the help text
 */
private static String usage() {
  final String nl = System.getProperty("line.separator");
  final String header = String.format("Usage: java %s [OPTIONS] treebank-file%n",
                                      MultiWordPreprocessor.class.getName());
  return header
      + "Options:" + nl
      + " -help: Print this message" + nl
      + " -ner: Retain NER information in tree constituents (pre-pre-terminal nodes)" + nl
      + " -normalize {true, false}: Run the Spanish tree normalizer (non-aggressive) on the output of the main routine (true by default)" + nl;
}
/**
 * Command-line option definitions handed to
 * {@code StringUtils.argsToProperties}: option name mapped to the number of
 * arguments that option takes.
 */
private static final Map<String, Integer> argOptionDefs = Generics.newHashMap();

static {
  // Flags (no argument)
  argOptionDefs.put("help", 0);
  argOptionDefs.put("ner", 0);
  // Options taking exactly one argument
  argOptionDefs.put("normalize", 1);
}
/**
 * Command-line entry point.
 *
 * Makes two passes over the treebank file: the first trains a unigram
 * tagger from the tags already present, the second resolves the placeholder
 * tags and writes the result next to the input (see
 * {@code resolveDummyTags}). Summary statistics are printed to standard
 * output afterwards.
 *
 * @param args command-line arguments; see {@code usage()} for the
 *        supported options. The treebank file is the positional argument.
 */
public static void main(String[] args) {
  Properties options = StringUtils.argsToProperties(args, argOptionDefs);
  if (!options.containsKey("") || options.containsKey("help")) {
    log.info(usage());
    return;
  }

  boolean retainNER = PropertiesUtils.getBool(options, "ner", false);
  boolean normalize = PropertiesUtils.getBool(options, "normalize", true);

  final File treeFile = new File(options.getProperty(""));
  TwoDimensionalCounter<String,String> unigramTagger =
      new TwoDimensionalCounter<>();

  // First pass: collect unigram tag counts. try-with-resources closes the
  // reader even if a tree cannot be read.
  TreeReaderFactory trf = new SpanishTreeReaderFactory();
  try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
       TreeReader tr = trf.newTreeReader(br)) {
    for (Tree t; (t = tr.readTree()) != null;) {
      updateTagger(unigramTagger, t);
    }
  } catch (IOException e) {
    // Also covers UnsupportedEncodingException and FileNotFoundException.
    // Without tagger statistics the second pass is pointless, so stop here.
    e.printStackTrace();
    return;
  }

  // Second pass: resolve the placeholder tags
  System.out.println("Resolving DUMMY tags");
  resolveDummyTags(treeFile, unigramTagger, retainNER,
                   normalize ? new SpanishTreeNormalizer(true, false, false) : null);

  System.out.println("#Unknown Word Types: " + ManualUWModel.nUnknownWordTypes);
  System.out.println(String.format("#Missing POS: %d (fixed: %d, %.2f%%)",
                                   nMissingPOS, nFixedPOS,
                                   percentage(nFixedPOS, nMissingPOS)));
  System.out.println(String.format("#Missing Phrasal: %d (fixed: %d, %.2f%%)",
                                   nMissingPhrasal, nFixedPhrasal,
                                   percentage(nFixedPhrasal, nMissingPhrasal)));

  System.out.println("Done!");
}

/**
 * Returns {@code part} as a percentage of {@code total}, or 0 when
 * {@code total} is 0 (instead of the NaN a plain division would yield).
 */
private static double percentage(int part, int total) {
  if (total == 0)
    return 0.0;
  return (double) part / total * 100;
}
}