package de.berlin.hu.banner.featuresets;

import banner.tagging.FeatureSet;
import banner.tagging.TagFormat;
import banner.tagging.pipe.*;
import banner.types.Mention.MentionType;
import banner.types.Sentence.OverlapOption;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.TokenSequence2FeatureVectorSequence;
import cc.mallet.pipe.tsf.OffsetConjunctions;
import cc.mallet.pipe.tsf.RegexMatches;
import cc.mallet.pipe.tsf.TokenTextCharPrefix;
import cc.mallet.pipe.tsf.TokenTextCharSuffix;
import dragon.nlp.tool.Lemmatiser;
import dragon.nlp.tool.Tagger;

import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Feature set akin to Klinger et al. (2008).
 *
 * <p>The feature pipeline is assembled once, in the constructor, by
 * {@link #createPipe}. It consists of whitespace-context, lower-cased token,
 * regular-expression, 2-character prefix/suffix and offset-conjunction stages.
 *
 * @author trocktae
 */
public class KlingerLikeFeatureSet extends FeatureSet {

    private static final long serialVersionUID = 3850553083981024255L;

    /**
     * Hardcoded switch to turn the POS and LEMMA features on/off.
     *
     * <p>While this is {@code false}, {@link #setLemmatiser},
     * {@link #setPosTagger} and {@link #setPreTagger} are all no-ops. Note that
     * the pre-tagger setter is gated by this same switch.
     */
    private static final boolean USE_POS_AND_LEMMA = false;

    /** The pipeline built by {@link #createPipe}; returned by {@link #getPipe()}. */
    private SerialPipes pipe;

    /**
     * Creates the feature set and builds its pipeline.
     *
     * <p>All arguments are passed unchanged to the superclass constructor and
     * then to {@link #createPipe}.
     *
     * @param format        tag format, used by the sentence-to-token-sequence stage
     * @param lemmatiser    lemmatiser (not used by the pipeline built here)
     * @param posTagger     part-of-speech tagger (not used by the pipeline built here)
     * @param preTagger     pre-tagger (not used by the pipeline built here)
     * @param mentionTypes  mention types, used by the sentence-to-token-sequence stage
     * @param sameType      overlap option for mentions of the same type
     * @param differentType overlap option for mentions of different types
     */
    public KlingerLikeFeatureSet(TagFormat format, Lemmatiser lemmatiser, Tagger posTagger,
            banner.tagging.Tagger preTagger, Set<MentionType> mentionTypes,
            OverlapOption sameType, OverlapOption differentType) {
        super(format, lemmatiser, posTagger, preTagger, mentionTypes, sameType, differentType);
        this.pipe = createPipe(format, lemmatiser, posTagger, preTagger, mentionTypes,
                sameType, differentType);
    }

    /**
     * Hands the lemmatiser to the pipeline's {@code LemmaPOS} stage, if any.
     *
     * <p>Does nothing while {@link #USE_POS_AND_LEMMA} is {@code false} or when
     * the pipeline contains no {@code LemmaPOS} stage.
     *
     * @param lemmatiser the lemmatiser to install
     */
    @Override
    public void setLemmatiser(Lemmatiser lemmatiser) {
        if (!USE_POS_AND_LEMMA) {
            return;
        }
        // Look the stage up by type: a fixed index would hit LChar in this pipeline.
        LemmaPOS lemmaPos = findPipe(LemmaPOS.class);
        if (lemmaPos != null) {
            lemmaPos.setLemmatiser(lemmatiser);
        }
    }

    /**
     * Hands the POS tagger to the pipeline's {@code LemmaPOS} stage, if any.
     *
     * <p>Does nothing while {@link #USE_POS_AND_LEMMA} is {@code false} or when
     * the pipeline contains no {@code LemmaPOS} stage.
     *
     * @param posTagger the part-of-speech tagger to install
     */
    @Override
    public void setPosTagger(dragon.nlp.tool.Tagger posTagger) {
        if (!USE_POS_AND_LEMMA) {
            return;
        }
        LemmaPOS lemmaPos = findPipe(LemmaPOS.class);
        if (lemmaPos != null) {
            lemmaPos.setPosTagger(posTagger);
        }
    }

    /**
     * Hands the pre-tagger to the pipeline's {@code Pretagger} stage, if any.
     *
     * <p>Does nothing while {@link #USE_POS_AND_LEMMA} is {@code false} or when
     * the pipeline contains no {@code Pretagger} stage.
     *
     * @param preTagger the pre-tagger to install
     */
    @Override
    public void setPreTagger(banner.tagging.Tagger preTagger) {
        if (!USE_POS_AND_LEMMA) {
            return;
        }
        Pretagger pretagger = findPipe(Pretagger.class);
        if (pretagger != null) {
            pretagger.setPreTagger(preTagger);
        }
    }

    /**
     * @return the pipeline built by the constructor
     */
    @Override
    public Pipe getPipe() {
        return this.pipe;
    }

    /**
     * Finds the first stage of the pipeline that is an instance of {@code type}.
     *
     * @param type the stage class to look for
     * @return the first matching stage, or {@code null} if there is none
     */
    private <T extends Pipe> T findPipe(Class<T> type) {
        for (int i = 0; i < pipe.size(); i++) {
            Pipe candidate = pipe.getPipe(i);
            if (type.isInstance(candidate)) {
                return type.cast(candidate);
            }
        }
        return null;
    }

    /**
     * Builds the feature-extraction pipeline.
     *
     * <p>Only {@code format}, {@code mentionTypes}, {@code sameType} and
     * {@code differentType} are used; {@code lemmatiser}, {@code posTagger}
     * and {@code preTagger} are accepted but not referenced.
     *
     * @return the stages below, in order, wrapped in a {@link SerialPipes}
     */
    private SerialPipes createPipe(TagFormat format, Lemmatiser lemmatiser,
            dragon.nlp.tool.Tagger posTagger, banner.tagging.Tagger preTagger,
            Set<MentionType> mentionTypes, OverlapOption sameType,
            OverlapOption differentType) {
        ArrayList<Pipe> pipes = new ArrayList<Pipe>();
        pipes.add(new Sentence2TokenSequence(format, mentionTypes, sameType, differentType));

        // Whitespace
        pipes.add(new LChar("LCHAR="));
        pipes.add(new RChar("RCHAR="));

        // Bag of words
        pipes.add(new LowerCaseTokenText("W="));

        // All Caps
        pipes.add(new RegexMatches("ALLCAPS", Pattern.compile("[A-Z]+")));

        // Real Number
        pipes.add(new RegexMatches("REALNUMBER", Pattern.compile("[-0-9]+[.,]+[0-9.,]+")));

        // Is Dash
        pipes.add(new RegexMatches("ISDASH", Pattern.compile("[-–—−]")));

        // Is Quote
        pipes.add(new RegexMatches("ISQUOTE", Pattern.compile("[„“””‘’\"']")));

        // Is Slash
        pipes.add(new RegexMatches("ISSLASH", Pattern.compile("[/\\\\]")));

        // Prefixes and Suffixes
        pipes.add(new TokenTextCharPrefix("2PREFIX=", 2));
        pipes.add(new TokenTextCharSuffix("2SUFFIX=", 2));

        // Offset Conjunction of 2
        pipes.add(new OffsetConjunctions(new int[][] { { -2 }, { -1 }, { 1 }, { 2 } }));

        pipes.add(new TokenSequence2FeatureVectorSequence(true, true));
        return new SerialPipes(pipes);
    }
}