package edu.stanford.nlp.parser.lexparser;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification.MorphoFeatureType;
import edu.stanford.nlp.international.morph.MorphoFeatures;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.process.SerializableFunction;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.MemoryTreebank;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.TreeTransformer;
import edu.stanford.nlp.trees.international.french.DybroFrenchHeadFinder;
import edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory;
import edu.stanford.nlp.trees.international.french.FrenchTreebankLanguagePack;
import edu.stanford.nlp.trees.international.french.FrenchXMLTreeReaderFactory;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexParseException;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.TregexPatternCompiler;
import edu.stanford.nlp.util.Function;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.Pair;
/**
* TreebankLangParserParams for the French Treebank corpus. This package assumes that the FTB
* has been transformed into PTB-format trees encoded in UTF-8. The "-xmlFormat" option can
* be used to read the raw FTB trees.
*
* @author Marie-Catherine de Marneffe
* @author Spence Green
*
*/
public class FrenchTreebankParserParams extends AbstractTreebankParserParams {
// Serialization version identifier for this params class.
private static final long serialVersionUID = -6976724734594763986L;
// Textual description of this configuration; the constructor seeds it with the class name.
private final StringBuilder optionsString;
// Created lazily by headFinder(); replaced through setHeadFinder().
private HeadFinder headFinder;
// Every known annotation, keyed by its option flag (e.g. "-markInf"). Each value pairs the
// tregex pattern that selects nodes with the function that produces the category suffix.
private final Map<String,Pair<TregexPattern,Function<TregexMatcher,String>>> annotationPatterns;
// The annotations that transformTree() actually applies, in application order.
private final List<Pair<TregexPattern,Function<TregexMatcher,String>>> activeAnnotations;
//The treebank is distributed in XML format.
//Use -xmlFormat below to enable reading the raw files.
// (treeReaderFactory() picks the Penn-format or the XML reader based on this flag.)
private boolean readPennFormat = true;
// Handed to the TreeCollinizer built by collinizerEvalb().
private boolean collinizerRetainsPunctuation = false;
//Controls the MW annotation feature
// Counts indexed by (MW category, space-separated POS sequence); AddPOSSequenceFunction
// throws if this is still null when it is applied.
private TwoDimensionalCounter<String, String> mwCounter;
// When non-null, lex() builds a FactoredLexicon from this specification instead of a BaseLexicon.
private MorphoFeatureSpecification morphoSpec;
// For adding the CC tagset as annotations.
// (When non-null, transformTree() rewrites POS tags using this specification.)
private MorphoFeatureSpecification tagSpec;
/**
 * Builds parser parameters backed by a {@link FrenchTreebankLanguagePack}, with
 * UTF-8 as the input encoding, and registers every annotation pattern.
 */
public FrenchTreebankParserParams() {
  super(new FrenchTreebankLanguagePack());
  setInputEncoding("UTF-8");

  optionsString = new StringBuilder("FrenchTreebankParserParams\n");

  // Both containers have to exist before the pattern table is filled in.
  activeAnnotations = new ArrayList<Pair<TregexPattern,Function<TregexMatcher,String>>>();
  annotationPatterns = Generics.newHashMap();
  initializeAnnotationPatterns();
}
// Option flags of the annotations in the baseline feature set, in the order
// in which setHeadFinder() re-activates them.
private final List<String> baselineFeatures = new ArrayList<String>();
{
  final String[] baselineFlags = {
    "-tagPAFr",
    "-markInf",
    "-markPart",
    "-markVN",
    "-coord1",
    "-de2",
    "-markP1",
    //MWE features...don't help overall parsing, but help MWE categories
    "-MWAdvS",
    "-MWADVSel1",
    "-MWADVSel2",
    "-MWNSel1",
    "-MWNSel2",
    // New features for CL submission
    "-splitPUNC"
  };
  for (String flag : baselineFlags) {
    baselineFeatures.add(flag);
  }
}
// Option flags activated after the baseline ones by setHeadFinder(); starts out empty.
private final List<String> additionalFeatures = new ArrayList<String>();
/**
 * Fills {@code annotationPatterns} with every supported manual state split.
 * Each entry maps an option flag (e.g. "-markInf") to a tregex pattern, compiled
 * with the current head finder, and the function that yields the suffix to append
 * to a matching node's category.
 *
 * A pattern that fails to compile aborts the remaining registrations; the failure
 * is reported on stderr rather than thrown.
 */
private void initializeAnnotationPatterns() {
  try {
    TregexPatternCompiler tregexPatternCompiler = new TregexPatternCompiler(headFinder());
    /***************************************************************************
     * BASELINE FEATURES
     ***************************************************************************/
    // Incremental delta improvements are over the previous feature (dev set, <= 40)
    //
    // POS Splitting for verbs
    annotationPatterns.put("-markInf",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@V > (@VN > @VPinf)"),new SimpleStringFunction("-infinitive")));
    annotationPatterns.put("-markPart",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@V > (@VN > @VPpart)"),new SimpleStringFunction("-participle")));
    annotationPatterns.put("-markVN",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ << @VN"),new SimpleStringFunction("-withVN")));
    // +1.45 F1 (Helps MWEs significantly)
    annotationPatterns.put("-tagPAFr", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("!@PUNC < (__ !< __) > __=parent"),new AddRelativeNodeFunction("-","parent", true)));
    // +.14 F1
    annotationPatterns.put("-coord1",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@COORD <2 __=word"), new AddRelativeNodeFunction("-","word", true)));
    // +.70 F1 -- de c-commands other stuff dominated by NP, PP, and COORD
    annotationPatterns.put("-de2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@P < /^([Dd]es?|du|d')$/"),new SimpleStringFunction("-de2")));
    annotationPatterns.put("-de3", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP|PP|COORD >+(@NP|PP) (@PP <, (@P < /^([Dd]es?|du|d')$/))"),new SimpleStringFunction("-de3")));
    // +.31 F1
    annotationPatterns.put("-markP1",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@P > (@PP > @NP)"),new SimpleStringFunction("-n")));
    //MWEs
    //(for MWADV 75.92 -> 77.16)
    annotationPatterns.put("-MWAdvS", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWADV > /S/"),new SimpleStringFunction("-mwadv-s")));
    annotationPatterns.put("-MWADVSel1", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWADV <1 @P <2 @N !<3 __"),new SimpleStringFunction("-mwadv1")));
    annotationPatterns.put("-MWADVSel2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWADV <1 @P <2 @D <3 @N !<4 __"),new SimpleStringFunction("-mwadv2")));
    annotationPatterns.put("-MWNSel1", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWN <1 @N <2 @A !<3 __"),new SimpleStringFunction("-mwn1")));
    annotationPatterns.put("-MWNSel2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWN <1 @N <2 @P <3 @N !<4 __"),new SimpleStringFunction("-mwn2")));
    annotationPatterns.put("-MWNSel3", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWN <1 @N <2 @- <3 @N !<4 __"),new SimpleStringFunction("-mwn3")));
    annotationPatterns.put("-splitPUNC",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@PUNC < __=" + AnnotatePunctuationFunction.key),new AnnotatePunctuationFunction()));
    /***************************************************************************
     * TEST FEATURES
     ***************************************************************************/
    // Mark MWE tags only
    annotationPatterns.put("-mweTag", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("!@PUNC < (__ !< __) > /MW/=parent"),new AddRelativeNodeFunction("-","parent", true)));
    annotationPatterns.put("-sq",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@SENT << /\\?/"),new SimpleStringFunction("-Q")));
    //New phrasal splits
    annotationPatterns.put("-hasVP", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("!@ROOT|SENT << /^VP/"),new SimpleStringFunction("-hasVP")));
    annotationPatterns.put("-hasVP2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ << /^VP/"),new SimpleStringFunction("-hasVP")));
    annotationPatterns.put("-npCOORD", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP < @COORD"),new SimpleStringFunction("-coord")));
    annotationPatterns.put("-npVP", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP < /VP/"),new SimpleStringFunction("-vp")));
    //NPs
    annotationPatterns.put("-baseNP1", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP <1 @D <2 @N !<3 __"),new SimpleStringFunction("-np1")));
    annotationPatterns.put("-baseNP2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP <1 @D <2 @MWN !<3 __"),new SimpleStringFunction("-np2")));
    annotationPatterns.put("-baseNP3", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP <1 @MWD <2 @N !<3 __ "),new SimpleStringFunction("-np3")));
    //MWEs
    annotationPatterns.put("-npMWN1", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP < (@MWN < @A)"),new SimpleStringFunction("-mwna")));
    annotationPatterns.put("-npMWN2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP <1 @D <2 @MWN <3 @PP !<4 __"),new SimpleStringFunction("-mwn2")));
    annotationPatterns.put("-npMWN3", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP <1 @D <2 (@MWN <1 @N <2 @A !<3 __) !<3 __"),new SimpleStringFunction("-mwn3")));
    annotationPatterns.put("-npMWN4", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@PP <, @P <2 (@NP <1 @D <2 (@MWN <1 @N <2 @A !<3 __) !<3 __) !<3 __"),new SimpleStringFunction("-mwn3")));
    //The whopper....
    annotationPatterns.put("-MWNSel", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWN"),new AddPOSSequenceFunction("-",600,true)));
    annotationPatterns.put("-MWADVSel", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWADV"),new AddPOSSequenceFunction("-",500,true)));
    annotationPatterns.put("-MWASel", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWA"),new AddPOSSequenceFunction("-",100,true)));
    annotationPatterns.put("-MWCSel", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWC"),new AddPOSSequenceFunction("-",400,true)));
    annotationPatterns.put("-MWDSel", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWD"),new AddPOSSequenceFunction("-",100,true)));
    annotationPatterns.put("-MWPSel", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWP"),new AddPOSSequenceFunction("-",600,true)));
    annotationPatterns.put("-MWPROSel", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWPRO"),new AddPOSSequenceFunction("-",60,true)));
    annotationPatterns.put("-MWVSel", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWV"),new AddPOSSequenceFunction("-",200,true)));
    //MWN
    annotationPatterns.put("-mwn1", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWN <1 @N <2 @A !<3 __"),new SimpleStringFunction("-na")));
    annotationPatterns.put("-mwn2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWN <1 @N <2 @P <3 @N !<4 __"),new SimpleStringFunction("-npn")));
    annotationPatterns.put("-mwn3", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWN <1 @N <2 @- <3 @N !<4 __"),new SimpleStringFunction("-n-n")));
    annotationPatterns.put("-mwn4", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWN <1 @N <2 @N !<3 __"),new SimpleStringFunction("-nn")));
    annotationPatterns.put("-mwn5", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWN <1 @D <2 @N !<3 __"),new SimpleStringFunction("-dn")));
    //wh words
    annotationPatterns.put("-hasWH", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ < /^(qui|quoi|comment|quel|quelle|quels|quelles|où|combien|que|pourquoi|quand)$/"),new SimpleStringFunction("-wh")));
    //POS splitting
    annotationPatterns.put("-markNNP2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@N < /^[A-Z]/"),new SimpleStringFunction("-nnp")));
    annotationPatterns.put("-markD1",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@D > (__ > @PP)"),new SimpleStringFunction("-p")));
    annotationPatterns.put("-markD2",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@D > (__ > @NP)"),new SimpleStringFunction("-n")));
    annotationPatterns.put("-markD3",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@D > (__ > /^VP/)"),new SimpleStringFunction("-v")));
    annotationPatterns.put("-markD4",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@D > (__ > /^S/)"),new SimpleStringFunction("-s")));
    annotationPatterns.put("-markD5",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@D > (__ > @COORD)"),new SimpleStringFunction("-c")));
    //Appositives?
    annotationPatterns.put("-app1", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP < /[,]/"),new SimpleStringFunction("-app1")));
    annotationPatterns.put("-app2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("/[^,\\-:;\"]/ > (@NP < /^[,]$/) $,, /^[,]$/"),new SimpleStringFunction("-app2")));
    //COORD
    annotationPatterns.put("-coord2",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@COORD !< @C"), new SimpleStringFunction("-nonC")));
    annotationPatterns.put("-hasCOORD",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ < @COORD"), new SimpleStringFunction("-hasCOORD")));
    annotationPatterns.put("-hasCOORDLS",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@SENT <, @COORD"), new SimpleStringFunction("-hasCOORDLS")));
    annotationPatterns.put("-hasCOORDNonS",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ < @COORD !<, @COORD"), new SimpleStringFunction("-hasCOORDNonS")));
    // PP / VPInf
    annotationPatterns.put("-pp1",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@P < /^(du|des|au|aux)$/=word"), new AddRelativeNodeFunction("-","word", false)));
    annotationPatterns.put("-vpinf1",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@VPinf <, __=word"), new AddRelativeNodeFunction("-","word", false)));
    annotationPatterns.put("-vpinf2",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@VPinf <, __=word"), new AddRelativeNodeFunction("-","word", true)));
    // PP splitting (subsumed by the de2-3 features)
    annotationPatterns.put("-splitIN",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@PP <, (P < /^([Dd]e|[Dd]'|[Dd]es|[Dd]u|à|[Aa]u|[Aa]ux|[Ee]n|[Dd]ans|[Pp]ar|[Ss]ur|[Pp]our|[Aa]vec|[Ee]ntre)$/=word)"), new AddRelativeNodeFunction("-","word", false,true)));
    annotationPatterns.put("-splitP",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@P < /^([Dd]e|[Dd]'|[Dd]es|[Dd]u|à|[Aa]u|[Aa]ux|[Ee]n|[Dd]ans|[Pp]ar|[Ss]ur|[Pp]our|[Aa]vec|[Ee]ntre)$/=word"), new AddRelativeNodeFunction("-","word", false,true)));
    //de features
    annotationPatterns.put("-hasde", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP|PP <+(@NP|PP) (P < de)"),new SimpleStringFunction("-hasDE")));
    annotationPatterns.put("-hasde2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@PP < de"),new SimpleStringFunction("-hasDE2")));
    //NPs
    annotationPatterns.put("-np1", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP < /^,$/"),new SimpleStringFunction("-np1")));
    annotationPatterns.put("-np2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP <, (@D < le|la|les)"),new SimpleStringFunction("-np2")));
    annotationPatterns.put("-np3", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@D < le|la|les"),new SimpleStringFunction("-def")));
    annotationPatterns.put("-baseNP", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP <, @D <- (@N , @D)"),new SimpleStringFunction("-baseNP")));
    // PP environment
    annotationPatterns.put("-markP2",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@P > (@PP > @AP)"),new SimpleStringFunction("-a")));
    annotationPatterns.put("-markP3",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@P > (@PP > @SENT|Ssub|VPinf|VPpart)"),new SimpleStringFunction("-v")));
    annotationPatterns.put("-markP4",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@P > (@PP > @Srel)"),new SimpleStringFunction("-r")));
    annotationPatterns.put("-markP5",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@P > (@PP > @COORD)"),new SimpleStringFunction("-c")));
    annotationPatterns.put("-markP6",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@P > @VPinf"),new SimpleStringFunction("-b")));
    annotationPatterns.put("-markP7",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@P > @VPpart"),new SimpleStringFunction("-b")));
    annotationPatterns.put("-markP8",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@P > /^MW|NP/"),new SimpleStringFunction("-internal")));
    annotationPatterns.put("-markP9",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@P > @COORD"),new SimpleStringFunction("-c")));
    /***************************************************************************
     * DIDN'T WORK
     ***************************************************************************/
    //MWEs
    annotationPatterns.put("-hasMWP", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("!/S/ < @MWP"),new SimpleStringFunction("-mwp")));
    annotationPatterns.put("-hasMWP2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@PP < @MWP"),new SimpleStringFunction("-mwp2")));
    annotationPatterns.put("-hasMWN2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@PP <+(@NP) @MWN"),new SimpleStringFunction("-hasMWN2")));
    annotationPatterns.put("-hasMWN3", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP < @MWN"),new SimpleStringFunction("-hasMWN3")));
    annotationPatterns.put("-hasMWADV", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("/^A/ < @MWADV"),new SimpleStringFunction("-hasmwadv")));
    annotationPatterns.put("-hasC1", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ < @MWC"),new SimpleStringFunction("-hasc1")));
    annotationPatterns.put("-hasC2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@MWC > /S/"),new SimpleStringFunction("-hasc2")));
    annotationPatterns.put("-hasC3", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@COORD < @MWC"),new SimpleStringFunction("-hasc3")));
    annotationPatterns.put("-uMWN", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP <: @MWN"),new SimpleStringFunction("-umwn")));
    //POS splitting
    annotationPatterns.put("-splitC", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@C < __=word"),new AddRelativeNodeFunction("-","word", false)));
    annotationPatterns.put("-splitD",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@D < /^[^\\d+]{1,4}$/=word"), new AddRelativeNodeFunction("-","word", false)));
    annotationPatterns.put("-de1", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@D < /^([Dd]es?|du|d')$/"),new SimpleStringFunction("-de1")));
    annotationPatterns.put("-markNNP1", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP < (N < /^[A-Z]/) !< /^[^NA]/"),new SimpleStringFunction("-nnp")));
    //PP environment
    annotationPatterns.put("-markPP1",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@PP > @NP"),new SimpleStringFunction("-n")));
    annotationPatterns.put("-markPP2",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@PP > @AP"),new SimpleStringFunction("-a")));
    annotationPatterns.put("-markPP3",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@PP > @SENT|Ssub|VPinf|VPpart"),new SimpleStringFunction("-v")));
    annotationPatterns.put("-markPP4",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@PP > @Srel"),new SimpleStringFunction("-r")));
    annotationPatterns.put("-markPP5",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@PP > @COORD"),new SimpleStringFunction("-c")));
    annotationPatterns.put("-dominateCC",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ << @COORD"),new SimpleStringFunction("-withCC")));
    annotationPatterns.put("-dominateIN",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ << @PP"),new SimpleStringFunction("-withPP")));
    //Klein and Manning style features
    annotationPatterns.put("-markContainsVP", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ << /^VP/"),new SimpleStringFunction("-hasV")));
    annotationPatterns.put("-markContainsVP2",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ << /^VP/=word"), new AddRelativeNodeFunction("-hasV-","word", false)));
    annotationPatterns.put("-markVNArgs",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@VN $+ __=word1"), new AddRelativeNodeFunction("-","word1", false)));
    annotationPatterns.put("-markVNArgs2",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@VN > __=word1 $+ __=word2"), new AddRelativeNodeFunction("-","word1","word2", false)));
    annotationPatterns.put("-markContainsMW", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ << /^MW/"),new SimpleStringFunction("-hasMW")));
    annotationPatterns.put("-markContainsMW2",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ << /^MW/=word"), new AddRelativeNodeFunction("-has-","word", false)));
    //MWE Sequence features
    annotationPatterns.put("-mwStart", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ >, /^MW/"),new SimpleStringFunction("-mwStart")));
    annotationPatterns.put("-mwMiddle", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ !>- /^MW/ !>, /^MW/ > /^MW/"),new SimpleStringFunction("-mwMid")));
    annotationPatterns.put("-mwMiddle2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ !>- /^MW/ !>, /^MW/ > /^MW/ , __=pos"),new AddRelativeNodeFunction("-","pos", true)));
    annotationPatterns.put("-mwEnd", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("__ >- /^MW/"),new SimpleStringFunction("-mwEnd")));
    //AP Features
    annotationPatterns.put("-nonNAP",new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@AP !$, @N|AP"), new SimpleStringFunction("-nap")));
    //Phrasal splitting
    annotationPatterns.put("-markNPTMP", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP < (@N < /^(lundi|mardi|mercredi|jeudi|vendredi|samedi|dimanche|Lundi|Mardi|Mercredi|Jeudi|Vendredi|Samedi|Dimanche|janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|Janvier|Février|Mars|Avril|Mai|Juin|Juillet|Août|Septembre|Octobre|Novembre|Décembre)$/)"),new SimpleStringFunction("-tmp")));
    //Singular
    annotationPatterns.put("-markSing1", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP < (D < /^(ce|cette|une|la|le|un|sa|son|ma|mon|ta|ton)$/)"),new SimpleStringFunction("-sing")));
    annotationPatterns.put("-markSing2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@AP < (A < (/[^sx]$/ !< __))"),new SimpleStringFunction("-sing")));
    annotationPatterns.put("-markSing3", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@VPpart < (V < /(e|é)$/)"),new SimpleStringFunction("-sing")));
    //Plural
    annotationPatterns.put("-markPl1", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@NP < (D < /s$/)"),new SimpleStringFunction("-pl")));
    annotationPatterns.put("-markPl2", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@AP < (A < /[sx]$/)"),new SimpleStringFunction("-pl")));
    annotationPatterns.put("-markPl3", new Pair<TregexPattern,Function<TregexMatcher,String>>(tregexPatternCompiler.compile("@VPpart < (V < /(es|és)$/)"),new SimpleStringFunction("-pl")));
  } catch (TregexParseException e) {
    // The map size is used as a hint for which registration failed. It is only
    // reliable on the first call: when this method runs again (from setHeadFinder)
    // the map already holds entries from the previous run.
    int nth = annotationPatterns.size() + 1;
    // English ordinal suffix: 11th-13th are irregular, otherwise the last digit decides.
    int lastTwoDigits = nth % 100;
    int lastDigit = nth % 10;
    String suffix;
    if (lastTwoDigits >= 11 && lastTwoDigits <= 13) {
      suffix = "th";
    } else if (lastDigit == 1) {
      suffix = "st";
    } else if (lastDigit == 2) {
      suffix = "nd";
    } else if (lastDigit == 3) {
      suffix = "rd";
    } else {
      suffix = "th";
    }
    String nthStr = nth + suffix;
    System.err.println("Parse exception on " + nthStr + " annotation pattern initialization:" + e);
  }
}
/**
 * Splits punctuation tags by the token they dominate. The token is read from
 * the tregex node variable named by {@code key}; tokens that are not listed
 * below yield the empty suffix.
 */
private static class AnnotatePunctuationFunction implements SerializableFunction<TregexMatcher,String> {
  private static final long serialVersionUID = 1L;

  // Name of the tregex node variable bound to the punctuation token.
  static final String key = "term";

  public String apply(TregexMatcher m) {
    final String token = m.getNode(key).value();
    String suffix = "";
    if (token.equals(".")) {
      suffix = "-fs";
    } else if (token.equals("?")) {
      suffix = "-quest";
    } else if (token.equals(",")) {
      suffix = "-comma";
    } else if (token.equals(":") || token.equals(";")) {
      suffix = "-colon";
    }
    // Disabled splits, kept for reference:
    //   "-LRB-" -> "-lrb", "-RRB-" -> "-rrb", "-" -> "-dash",
    //   quote pattern -> "-quote", "/" -> "-slash", "%" -> "-perc",
    //   contains ".." -> "-ellipses"
    return suffix;
  }

  @Override
  public String toString() {
    return "AnnotatePunctuationFunction";
  }
}
/**
 * Constant annotation: every node matched by the tregex query receives the
 * same suffix, regardless of what was matched.
 */
private static class SimpleStringFunction implements SerializableFunction<TregexMatcher,String> {
  private static final long serialVersionUID = 1L;

  private String annotationMark;

  public SimpleStringFunction(String mark) {
    annotationMark = mark;
  }

  /** Returns the fixed suffix; the matcher is not consulted. */
  public String apply(TregexMatcher unusedMatch) {
    return annotationMark;
  }

  @Override
  public String toString() {
    StringBuilder description = new StringBuilder("SimpleStringFunction[");
    description.append(annotationMark).append(']');
    return description.toString();
  }
}
/**
 * Annotates all nodes that match the tregex query with annotationMark + the label
 * of the node bound to key1 (usually annotationMark = "-").
 * Optionally, a second named node can be given; its label is appended the same way,
 * giving annotationMark + label1 + annotationMark + label2.
 *
 * When basicCategory is set, labels are reduced with tlp.basicCategory() first.
 * When toLower is set, the resulting suffix is lower-cased with a fixed locale so
 * that the generated state names do not depend on the JVM's default locale.
 */
private class AddRelativeNodeFunction implements SerializableFunction<TregexMatcher,String> {
  private String annotationMark;
  private String key;
  private String key2;
  private boolean doBasicCat = false;
  private boolean toLower = false;

  public AddRelativeNodeFunction(String annotationMark, String key, boolean basicCategory) {
    this.annotationMark = annotationMark;
    this.key = key;
    this.key2 = null;
    doBasicCat = basicCategory;
  }

  public AddRelativeNodeFunction(String annotationMark, String key1, String key2, boolean basicCategory) {
    this(annotationMark,key1,basicCategory);
    this.key2 = key2;
  }

  public AddRelativeNodeFunction(String annotationMark, String key1, boolean basicCategory, boolean toLower) {
    this(annotationMark,key1,basicCategory);
    this.toLower = toLower;
  }

  public String apply(TregexMatcher m) {
    String tag = annotationMark + labelOf(m, key);
    if (key2 != null) {
      tag = tag + annotationMark + labelOf(m, key2);
    }
    // Locale.ENGLISH instead of the default locale: under e.g. a Turkish locale
    // "I" would lower-case to a dotless i and produce different state names.
    return (toLower) ? tag.toLowerCase(Locale.ENGLISH) : tag;
  }

  // Label value of the named node, reduced to its basic category when requested.
  private String labelOf(TregexMatcher m, String nodeName) {
    String value = m.getNode(nodeName).label().value();
    return (doBasicCat) ? tlp.basicCategory(value) : value;
  }

  @Override
  public String toString() {
    if(key2 == null)
      return "AddRelativeNodeFunction[" + annotationMark + ',' + key + ']';
    else
      return "AddRelativeNodeFunction[" + annotationMark + ',' + key + ',' + key2 + ']';
  }

  private static final long serialVersionUID = 1L;
}
/**
 * Annotates a matched node with the sequence of its children's POS tags, but only
 * when that sequence was counted more than {@code cutoff} times for the node's
 * category in {@code mwCounter}. The counter is looked up with the tags joined by
 * single spaces; the emitted suffix is annotationMark followed by the tags
 * concatenated and lower-cased.
 *
 * Lower-casing uses a fixed locale so that the generated state names do not
 * depend on the JVM's default locale.
 */
private class AddPOSSequenceFunction implements SerializableFunction<TregexMatcher,String> {
  private final String annotationMark;
  private final boolean doBasicCat;
  private final double cutoff;

  public AddPOSSequenceFunction(String annotationMark, int cutoff, boolean basicCategory) {
    this.annotationMark = annotationMark;
    doBasicCat = basicCategory;
    this.cutoff = cutoff;
  }

  /**
   * @throws RuntimeException if no POS sequence counter has been loaded, or if a
   *         child of the matched node is not a preterminal
   */
  public String apply(TregexMatcher m) {
    if(mwCounter == null)
      throw new RuntimeException("Cannot enable POSSequence features without POS sequence map. Use option -frenchMWMap.");

    Tree t = m.getMatch();
    StringBuilder sb = new StringBuilder();
    for(Tree kid : t.children()) {
      if( ! kid.isPreTerminal())
        throw new RuntimeException("Not POS sequence for tree: " + t.toString());
      String tag = doBasicCat ? tlp.basicCategory(kid.value()) : kid.value();
      sb.append(tag).append(" ");
    }

    if(mwCounter.getCount(t.value(), sb.toString().trim()) > cutoff) {
      // Locale.ENGLISH instead of the default locale: under e.g. a Turkish locale
      // the tag "I" would lower-case to a dotless i and produce different state names.
      return annotationMark + sb.toString().replaceAll("\\s+", "").toLowerCase(Locale.ENGLISH);
    }
    return "";
  }

  @Override
  public String toString() {
    return "AddPOSSequenceFunction[" + annotationMark + ',' + cutoff + ',' + doBasicCat + ']';
  }

  private static final long serialVersionUID = 1L;
}
/**
 * Returns the head finder for this treebank, creating a
 * {@link DybroFrenchHeadFinder} on first use.
 */
@Override
public HeadFinder headFinder() {
  if (headFinder != null) {
    return headFinder;
  }
  //Superior for vanilla PCFG over Arun's headfinding rules
  headFinder = new DybroFrenchHeadFinder(treebankLanguagePack());
  return headFinder;
}
/**
 * Returns the head finder for typed dependencies. This implementation simply
 * delegates to {@link #headFinder()}, so both calls yield the same instance.
 */
@Override
public HeadFinder typedDependencyHeadFinder() {
return headFinder();
}
/**
 * Installs a new head finder, recompiles all annotation patterns against it,
 * and rebuilds the active annotation list: baseline features first, then the
 * additional ones.
 *
 * @param hf the head finder to use; must not be null
 * @throws IllegalArgumentException if {@code hf} is null
 */
private void setHeadFinder(HeadFinder hf) {
  if (hf == null) {
    throw new IllegalArgumentException();
  }
  headFinder = hf;

  // Need to re-initialize all patterns due to the new headFinder
  initializeAnnotationPatterns();

  final List<String> enabledFlags = new ArrayList<String>(baselineFeatures);
  enabledFlags.addAll(additionalFeatures);

  activeAnnotations.clear();
  for (String flag : enabledFlags) {
    activeAnnotations.add(annotationPatterns.get(flag));
  }
}
/**
 * Creates the lexicon. If no unknown-word model trainer has been configured on
 * {@code op}, the French trainer is set as a side effect. A
 * {@link FactoredLexicon} is returned when a morphological specification is
 * present, otherwise a {@link BaseLexicon}.
 *
 * @param op Lexicon options
 * @param wordIndex index passed through to the lexicon
 * @param tagIndex index passed through to the lexicon
 * @return A Lexicon
 */
@Override
public Lexicon lex(Options op, Index<String> wordIndex, Index<String> tagIndex) {
  if (op.lexOptions.uwModelTrainer == null) {
    op.lexOptions.uwModelTrainer = "edu.stanford.nlp.parser.lexparser.FrenchUnknownWordModelTrainer";
  }

  final Lexicon lexicon;
  if (morphoSpec == null) {
    lexicon = new BaseLexicon(op, wordIndex, tagIndex);
  } else {
    lexicon = new FactoredLexicon(op, morphoSpec, wordIndex, tagIndex);
  }
  return lexicon;
}
/**
 * Returns the sister-split annotations; this implementation defines none and
 * always returns a fresh empty array.
 */
@Override
public String[] sisterSplitters() {
return new String[0];
}
/** Returns a new TreeCollinizer built from this class's treebank language pack. */
@Override
public TreeTransformer collinizer() {
  final TreeTransformer collinizer = new TreeCollinizer(treebankLanguagePack());
  return collinizer;
}
/**
 * Returns a new TreeCollinizer for evalb-style evaluation. The
 * collinizerRetainsPunctuation setting is passed as the second constructor
 * argument and false as the third.
 */
@Override
public TreeTransformer collinizerEvalb() {
  final TreeTransformer evalbCollinizer =
      new TreeCollinizer(treebankLanguagePack(), collinizerRetainsPunctuation, false);
  return evalbCollinizer;
}
/** Returns a new DiskTreebank using treeReaderFactory() and the current input encoding. */
@Override
public DiskTreebank diskTreebank() {
  final TreeReaderFactory readerFactory = treeReaderFactory();
  return new DiskTreebank(readerFactory, inputEncoding);
}
/** Returns a new MemoryTreebank using treeReaderFactory() and the current input encoding. */
@Override
public MemoryTreebank memoryTreebank() {
  final TreeReaderFactory readerFactory = treeReaderFactory();
  return new MemoryTreebank(readerFactory, inputEncoding);
}
/**
 * Chooses the tree reader factory for the configured input format: a
 * FrenchTreeReaderFactory when readPennFormat is set, otherwise a
 * FrenchXMLTreeReaderFactory constructed with false.
 */
public TreeReaderFactory treeReaderFactory() {
  if (readPennFormat) {
    return new FrenchTreeReaderFactory();
  }
  return new FrenchXMLTreeReaderFactory(false);
}
/** Returns a short French sentence ("Ceci est seulement un test .") as a word list. */
public List<HasWord> defaultTestSentence() {
  final String[] tokens = new String[] {"Ceci", "est", "seulement", "un", "test", "."};
  List<HasWord> words = Sentence.toWordList(tokens);
  return words;
}
/**
 * Rewrites the category of node t in place. The new category is the base
 * category (replaced by a morphologically derived tag for preterminals when
 * a tag specification is configured) followed by the output of every active
 * annotation whose pattern matches at t.
 *
 * @param t The node to relabel
 * @param root The root of the tree containing t; patterns are matched against it
 * @return t, with its value (and its tag, for HasTag preterminal labels) updated
 * @throws RuntimeException if a tag specification is configured and a
 *         preterminal's word label is not a CoreLabel or has no original text
 */
@Override
public Tree transformTree(Tree t, Tree root) {
  String category = t.value();

  // Manual state splits: collect the annotation of every pattern matching at t
  StringBuilder stateSplits = new StringBuilder();
  for (Pair<TregexPattern,Function<TregexMatcher,String>> annotation : activeAnnotations) {
    TregexMatcher matcher = annotation.first().matcher(root);
    if (matcher.matchesAt(t)) {
      stateSplits.append(annotation.second().apply(matcher));
    }
  }

  // Morphosyntactic features apply to POS tags only
  if (t.isPreTerminal() && tagSpec != null) {
    boolean hasMorphAnalysis = (t.firstChild().label() instanceof CoreLabel)
        && ((CoreLabel) t.firstChild().label()).originalText() != null;
    if (!hasMorphAnalysis) {
      throw new RuntimeException(String.format("%s: Term lacks morpho analysis: %s",this.getClass().getName(),t.toString()));
    }
    // The morphological analysis is read from the word label's original text
    CoreLabel wordLabel = (CoreLabel) t.firstChild().label();
    Pair<String,String> lemmaAndMorph = MorphoFeatureSpecification.splitMorphString("", wordLabel.originalText());
    MorphoFeatures features = tagSpec.strToFeatures(lemmaAndMorph.second());
    category = features.getTag(category);
  }

  // Write the new category back to the node's label(s)
  String annotatedCategory = category + stateSplits.toString();
  t.setValue(annotatedCategory);
  if (t.isPreTerminal() && t.label() instanceof HasTag) {
    ((HasTag) t.label()).setTag(annotatedCategory);
  }
  return t;
}
/**
 * Loads the MWE counter from a UTF-8 text file. Each line must hold exactly
 * three tab-separated fields: the two counter keys followed by a numeric count.
 * The counter is always replaced with a fresh one, so it is left empty or
 * partially filled if the file cannot be read.
 *
 * I/O failures are reported on standard error and do not abort the caller.
 * The reader is closed on every exit path.
 *
 * @param filename Path of the file to read
 * @throws RuntimeException if a line does not have exactly three fields
 * @throws NumberFormatException if the third field of a line is not a number
 */
private void loadMWMap(String filename) {
  mwCounter = new TwoDimensionalCounter<String,String>();
  BufferedReader br = null;
  try {
    br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filename)), "UTF-8"));
    int nLines = 0;
    for (String line; (line = br.readLine()) != null; nLines++) {
      String[] toks = line.split("\t");
      // Checked explicitly: an assert is a no-op unless the JVM runs with -ea,
      // which let malformed lines through or failed with a bare index error.
      if (toks.length != 3) {
        throw new RuntimeException(String.format("%s: Expected 3 tab-separated fields but found %d at line %d of %s",
            this.getClass().getName(), toks.length, nLines + 1, filename));
      }
      mwCounter.setCount(toks[0].trim(), toks[1].trim(), Double.parseDouble(toks[2].trim()));
    }
    System.err.printf("%s: Loaded %d lines from %s into MWE counter%n", this.getClass().getName(),nLines,filename);
  } catch (IOException e) {
    // Also covers FileNotFoundException and UnsupportedEncodingException,
    // which are both subclasses of IOException.
    System.err.printf("%s: Could not load MWE counter from %s%n", this.getClass().getName(), filename);
    e.printStackTrace();
  } finally {
    if (br != null) {
      try {
        br.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}
/**
 * Configures morpho-syntactic annotations for POS tags. The feature
 * specification is obtained from the language pack and each listed feature
 * type is activated on it.
 *
 * @param activeFeats A comma-separated list of feature values with names according
 * to MorphoFeatureType.
 * @return The string form of the configured feature specification
 */
private String setupMorphoFeatures(String activeFeats) {
  final String[] featureNames = activeFeats.split(",");
  morphoSpec = tlp.morphFeatureSpec();
  for (int idx = 0; idx < featureNames.length; idx++) {
    // Names may be padded with whitespace around the commas
    morphoSpec.activate(MorphoFeatureType.valueOf(featureNames[idx].trim()));
  }
  return morphoSpec.toString();
}
/**
 * Drops a feature from the baseline set and from the active annotations.
 * Does nothing if the feature is not currently a baseline feature.
 *
 * @param featName The option name of the feature to remove
 */
private void removeBaselineFeature(String featName) {
  if (!baselineFeatures.contains(featName)) {
    return;
  }
  baselineFeatures.remove(featName);
  activeAnnotations.remove(annotationPatterns.get(featName));
}
/** Prints the accumulated option description to standard error. */
@Override
public void display() {
  final String summary = optionsString.toString();
  System.err.println(summary);
}
/**
 * Processes the option at args[i], consuming its argument too if it takes one.
 *
 * Options that take an argument (-headFinder, -frenchMWMap, -factlex) are
 * only recognized when that argument is actually present.
 *
 * @param args The full argument array
 * @param i Index of the option to process
 * @return The index of the first argument not consumed; equal to i when the
 *         option was not recognized
 */
@Override
public int setOptionFlag(String[] args, int i) {
  if (annotationPatterns.keySet().contains(args[i])) {
    if (!baselineFeatures.contains(args[i])) {
      additionalFeatures.add(args[i]);
    }
    Pair<TregexPattern,Function<TregexMatcher,String>> p = annotationPatterns.get(args[i]);
    activeAnnotations.add(p);
    optionsString.append("Option " + args[i] + " added annotation pattern " + p.first() + " with annotation " + p.second() + '\n');
    i++;

  } else if (args[i].equals("-collinizerRetainsPunctuation")) {
    optionsString.append("Collinizer retains punctuation.\n");
    collinizerRetainsPunctuation = true;
    i++;

  } else if (args[i].equalsIgnoreCase("-headFinder") && (i + 1 < args.length)) {
    try {
      HeadFinder hf = (HeadFinder) Class.forName(args[i + 1]).newInstance();
      setHeadFinder(hf);
      optionsString.append("HeadFinder: " + args[i + 1] + "\n");
    } catch (Exception e) {
      // A head finder that cannot be loaded is reported but not fatal;
      // both arguments are still consumed.
      System.err.println(e);
      System.err.println(this.getClass().getName() + ": Could not load head finder " + args[i + 1]);
    }
    i += 2;

  } else if (args[i].equals("-xmlFormat")) {
    optionsString.append("Reading trees in XML format.\n");
    readPennFormat = false;
    setInputEncoding(tlp.getEncoding());
    i++;

  } else if (args[i].equals("-frenchFactored")) {
    // Activate every baseline feature by feeding it back through this method
    for (String annotation : baselineFeatures) {
      String[] a = {annotation};
      setOptionFlag(a,0);
    }
    i++;

  } else if (args[i].equals("-frenchMWMap") && (i + 1 < args.length)) {
    // The bounds check mirrors -headFinder and -factlex, so a trailing
    // -frenchMWMap is left unrecognized instead of indexing past the array.
    loadMWMap(args[i+1]);
    i += 2;

  } else if (args[i].equals("-tsg")) {
    //wsg2011: These features should be removed for TSG extraction.
    //If they are retained, the resulting grammar seems to be too brittle....
    optionsString.append("Removing baseline features: ");
    removeBaselineFeature("-markVN");
    optionsString.append(" (removed -markVN)");
    removeBaselineFeature("-coord1");
    optionsString.append(" (removed -coord1)\n");
    i++;

  } else if (args[i].equals("-factlex") && (i + 1 < args.length)) {
    String activeFeats = setupMorphoFeatures(args[i+1]);
    optionsString.append("Factored Lexicon: active features: ").append(activeFeats);
    // WSGDEBUG Maybe add -mweTag in place of -tagPAFr?
    removeBaselineFeature("-tagPAFr");
    optionsString.append(" (removed -tagPAFr)\n");
    // Add -mweTag
    String[] option = {"-mweTag"};
    setOptionFlag(option, 0);
    i += 2;

  } else if (args[i].equals("-noFeatures")) {
    activeAnnotations.clear();
    optionsString.append("Removed all manual features.\n");
    i++;

  } else if (args[i].equals("-ccTagsetAnnotations")) {
    tagSpec = new FrenchMorphoFeatureSpecification();
    tagSpec.activate(MorphoFeatureType.OTHER);
    optionsString.append("Adding CC tagset as POS state splits.\n");
    ++i;
  }
  return i;
}
}