// NERFeatureFactory -- features for a probabilistic Named Entity Recognizer
// Copyright (c) 2002-2008 Leland Stanford Junior University
// Additional features (c) 2003 The University of Edinburgh
//
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
// Christopher Manning
// Dept of Computer Science, Gates 1A
// Stanford CA 94305-9010
// USA
// Support/Questions: java-nlp-user@lists.stanford.edu
// Licensing: java-nlp-support@lists.stanford.edu
// http://nlp.stanford.edu/downloads/crf-classifier.shtml
package edu.stanford.nlp.ie;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.AbbrAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.AbgeneAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.AbstrAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ChunkAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.DictAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.DistSimAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.DomainAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.EntityRuleAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.EntityTypeAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.FreqAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.GazAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.GeniaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.GovernorAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.IsDateRangeAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.IsURLAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ParaPositionAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PositionAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ProtoAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SectionAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ShapeAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.StackedNamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TopicAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.UnknownAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.WebAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.WordPositionAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.WordnetSynAnnotation;
import edu.stanford.nlp.ling.CoreLabel.GenericAnnotation;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.process.WordShapeClassifier;
import edu.stanford.nlp.sequences.Clique;
import edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter;
import edu.stanford.nlp.sequences.FeatureFactory;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.PaddedList;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;
/**
* Features for Named Entity Recognition. The code here creates the features
* by processing Lists of CoreLabels.
* Look at {@link SeqClassifierFlags} to see where the flags are set for
* what options to use for what flags.
* <p>
* To add a new feature extractor, you should do the following:
* <ol>
* <li>Add a variable (boolean, int, String, etc. as appropriate) to
* SeqClassifierFlags to mark if the new extractor is turned on or
* its value, etc. Add it at the <i>bottom</i> of the list of variables
* currently in the class (this avoids problems with older serialized
* files breaking). Make the default value of the variable false/null/0
* (this is again for backwards compatibility).</li>
* <li>Add a clause to the big if/then/else of setProperties(Properties) in
* SeqClassifierFlags. Unless it is a macro option, make the option name
* the same as the variable name used in step 1.</li>
* <li>Add code to NERFeatureFactory for this feature. First decide which
* classes (hidden states) are involved in the feature. If only the
* current class, you add the feature extractor to the
* <code>featuresC</code> code, if both the current and previous class,
* then <code>featuresCpC</code>, etc.</li>
* </ol>
* <p> Parameters can be defined using a Properties file
* (specified on the command-line with <code>-prop</code> <i>propFile</i>),
* or directly on the command line. The following properties are recognized:
* </p>
* <table border="1">
* <tr><td><b>Property Name</b></td><td><b>Type</b></td><td><b>Default Value</b></td><td><b>Description</b></td></tr>
* <tr><td> loadClassifier </td><td>String</td><td>n/a</td><td>Path to serialized classifier to load</td></tr>
* <tr><td> loadAuxClassifier </td><td>String</td><td>n/a</td><td>Path to auxiliary classifier to load.</td></tr>
* <tr><td> serializeTo</td><td>String</td><td>n/a</td><td>Path to serialize classifier to</td></tr>
* <tr><td> trainFile</td><td>String</td><td>n/a</td><td>Path of file to use as training data</td></tr>
* <tr><td> testFile</td><td>String</td><td>n/a</td><td>Path of file to use as test data</td></tr>
* <p/>
* <tr><td> useWord</td><td>boolean</td><td>true</td><td>Gives you feature for w</td></tr>
* <tr><td> useBinnedLength</td><td>String</td><td>null</td><td>If non-null, treat as a sequence of comma separated integer bounds, where items above the previous bound up to the next bound are binned Len-<i>range</i></td></tr>
* <tr><td> useNGrams</td><td>boolean</td><td>false</td><td>Make features from letter n-grams</td></tr>
* <tr><td> lowercaseNGrams</td><td>boolean</td><td>false</td><td>Make features from letter n-grams only lowercase</td></tr>
* <tr><td> dehyphenateNGrams</td><td>boolean</td><td>false</td><td>Remove hyphens before making features from letter n-grams</td></tr>
* <tr><td> conjoinShapeNGrams</td><td>boolean</td><td>false</td><td>Conjoin word shape and n-gram features</td></tr>
* <tr><td> usePrev</td><td>boolean</td><td>false</td><td>Gives you feature for (pw,c), and together with other options enables other previous features, such as (pt,c) [with useTags]</td></tr>
* <tr><td> useNext</td><td>boolean</td><td>false</td><td>Gives you feature for (nw,c), and together with other options enables other next features, such as (nt,c) [with useTags]</td></tr>
* <tr><td> useTags</td><td>boolean</td><td>false</td><td>Gives you features for (t,c), (pt,c) [if usePrev], (nt,c) [if useNext]</td></tr>
* <tr><td> useWordPairs</td><td>boolean</td><td>false</td><td>Gives you
* features for (pw, w, c) and (w, nw, c)</td></tr>
* <p>
* <tr><td> useGazettes</td><td>boolean</td><td>false</td><td>If true, use gazette features (defined by other flags)</td></tr>
* <tr><td> gazette</td><td>String</td><td>null</td><td>The value can be one or more filenames (names separated by a comma, semicolon or space).
* If provided gazettes are loaded from these files. Each line should be an entity class name, followed by whitespace followed by an entity (which might be a phrase of several tokens with a single space between words).
* Giving this property turns on useGazettes, so you normally don't need to specify it (but can use it to turn off gazettes specified in a properties file).</td></tr>
* <tr><td> sloppyGazette</td><td>boolean</td><td>false</td><td>If true, a gazette feature fires when any token of a gazette entry matches</td></tr>
* <tr><td> cleanGazette</td><td>boolean</td><td>false</td><td>If true, a gazette feature fires when all tokens of a gazette entry match</td></tr>
* <p>
* <tr><td> wordShape</td><td>String</td><td>none</td><td>Either "none" for no wordShape use, or the name of a word shape function recognized by {@link WordShapeClassifier#lookupShaper(String)}</td></tr>
* <tr><td> useSequences</td><td>boolean</td><td>true</td><td></td></tr>
* <tr><td> usePrevSequences</td><td>boolean</td><td>false</td><td></td></tr>
* <tr><td> useNextSequences</td><td>boolean</td><td>false</td><td></td></tr>
* <tr><td> useLongSequences</td><td>boolean</td><td>false</td><td>Use plain higher-order state sequences out to minimum of length or maxLeft</td></tr>
* <tr><td> useBoundarySequences</td><td>boolean</td><td>false</td><td>Use extra second order class sequence features when previous is CoNLL boundary, so entity knows it can span boundary.</td></tr>
* <tr><td> useTaggySequences</td><td>boolean</td><td>false</td><td>Use first, second, and third order class and tag sequence interaction features</td></tr>
* <tr><td> useExtraTaggySequences</td><td>boolean</td><td>false</td><td>Add in sequences of tags with just current class features</td></tr>
* <tr><td> useTaggySequencesShapeInteraction</td><td>boolean</td><td>false</td><td>Add in terms that join sequences of 2 or 3 tags with the current shape</td></tr>
* <tr><td> strictlyFirstOrder</td><td>boolean</td><td>false</td><td>As an override to whatever other options are in effect, deletes all features other than C and CpC clique features when building the classifier</td></tr>
* <tr><td> entitySubclassification</td><td>String</td><td>"IO"</td><td>If
* set, convert the labeling of classes (but not the background) into
* one of several alternate encodings (IO, IOB1, IOB2, IOE1, IOE2, SBIEO, with
* a S(ingle), B(eginning),
* E(nding), I(nside) 4-way classification for each class). By default, we
* either do no re-encoding, or the CoNLLDocumentIteratorFactory does a
* lossy encoding as IO. Note that this is all CoNLL-specific, and depends on
* their way of prefix encoding classes, and is only implemented by
* the CoNLLDocumentIteratorFactory. </td></tr>
* <p/>
* <tr><td> useSum</td><td>boolean</td><td>false</td><td></td></tr>
* <tr><td> tolerance</td><td>double</td><td>1e-4</td><td>Convergence tolerance in optimization</td></tr>
* <tr><td> printFeatures</td><td>String</td><td>null</td><td>print out the features of the classifier to a file based on this name (starting with feat-, suffixed "-1" and "-2")</td></tr>
* <tr><td> printFeaturesUpto</td><td>int</td><td>-1</td><td>Print out features for only the first this many datums, if the value is positive. </td></tr>
* <p/>
* <tr><td> useSymTags</td><td>boolean</td><td>false</td><td>Gives you
* features (pt, t, nt, c), (t, nt, c), (pt, t, c)</td></tr>
* <tr><td> useSymWordPairs</td><td>boolean</td><td>false</td><td>Gives you
* features (pw, nw, c)</td></tr>
* <p/>
* <tr><td> printClassifier</td><td>String</td><td>null</td><td>Style in which to print the classifier. One of: HighWeight, HighMagnitude, Collection, AllWeights, WeightHistogram</td></tr>
* <tr><td> printClassifierParam</td><td>int</td><td>100</td><td>A parameter
* to the printing style, which may give, for example the number of parameters
* to print</td></tr>
* <tr><td> intern</td><td>boolean</td><td>false</td><td>If true,
* (String) intern read in data and classes and feature (pre-)names such
* as substring features</td></tr>
* <tr><td> intern2</td><td>boolean</td><td>false</td><td>If true, intern all (final) feature names (if only current word and ngram features are used, these will already have been interned by intern, and this is an unnecessary no-op)</td></tr>
* <tr><td> cacheNGrams</td><td>boolean</td><td>false</td><td>If true,
* record the NGram features that correspond to a String (under the current
* option settings) and reuse rather than recalculating if the String is seen
* again.</td></tr>
* <tr><td> selfTest</td><td>boolean</td><td>false</td><td></td></tr>
* <p/>
* <tr><td> noMidNGrams</td><td>boolean</td><td>false</td><td>Do not include character n-gram features for n-grams that contain neither the beginning nor the end of the word</td></tr>
* <tr><td> maxNGramLeng</td><td>int</td><td>-1</td><td>If this number is
* positive, n-grams above this size will not be used in the model</td></tr>
* <tr><td> useReverse</td><td>boolean</td><td>false</td><td></td></tr>
* <tr><td> retainEntitySubclassification</td><td>boolean</td><td>false</td><td>If true, rather than undoing a recoding of entity tag subtypes (such as BIO variants), just leave them in the output.</td></tr>
* <tr><td> useLemmas</td><td>boolean</td><td>false</td><td>Include the lemma of a word as a feature.</td></tr>
* <tr><td> usePrevNextLemmas</td><td>boolean</td><td>false</td><td>Include the previous/next lemma of a word as a feature.</td></tr>
* <tr><td> useLemmaAsWord</td><td>boolean</td><td>false</td><td>Include the lemma of a word as a feature.</td></tr>
* <tr><td> normalizeTerms</td><td>boolean</td><td>false</td><td>If this is true, some words are normalized: day and month names are lowercased (as for normalizeTimex) and some British spellings are mapped to American English spellings (e.g., -our/-or, etc.).</td></tr>
* <tr><td> normalizeTimex</td><td>boolean</td><td>false</td><td>If this is true, capitalization of day and month names is normalized to lowercase</td></tr>
* <tr><td> useNB</td><td>boolean</td><td>false</td><td></td></tr>
* <tr><td> useTypeSeqs</td><td>boolean</td><td>false</td><td>Use basic zeroth order word shape features.</td></tr>
* <tr><td> useTypeSeqs2</td><td>boolean</td><td>false</td><td>Add additional first and second order word shape features</td></tr>
* <tr><td> useTypeSeqs3</td><td>boolean</td><td>false</td><td>Adds one more first order shape sequence</td></tr>
* <tr><td> useDisjunctive</td><td>boolean</td><td>false</td><td>Include in features giving disjunctions of words anywhere in the left or right disjunctionWidth words (preserving direction but not position)</td></tr>
* <tr><td> disjunctionWidth</td><td>int</td><td>4</td><td>The number of words on each side of the current word that are included in the disjunction features</td></tr>
* <tr><td> useDisjunctiveShapeInteraction</td><td>boolean</td><td>false</td><td>Include in features giving disjunctions of words anywhere in the left or right disjunctionWidth words (preserving direction but not position) interacting with the word shape of the current word</td></tr>
* <tr><td> useWideDisjunctive</td><td>boolean</td><td>false</td><td>Include in features giving disjunctions of words anywhere in the left or right wideDisjunctionWidth words (preserving direction but not position)</td></tr>
* <tr><td> wideDisjunctionWidth</td><td>int</td><td>4</td><td>The number of words on each side of the current word that are included in the disjunction features</td></tr>
* <tr><td> usePosition</td><td>boolean</td><td>false</td><td>Use combination of position in sentence and class as a feature</td></tr>
* <tr><td> useBeginSent</td><td>boolean</td><td>false</td><td>Use combination of initial position in sentence and class (and word shape) as a feature. (Doesn't seem to help.)</td></tr>
* <tr><td> useDisjShape</td><td>boolean</td><td>false</td><td>Include features giving disjunctions of word shapes anywhere in the left or right disjunctionWidth words (preserving direction but not position)</td></tr>
* <tr><td> useClassFeature</td><td>boolean</td><td>false</td><td>Include a feature for the class (as a class marginal)</td></tr>
* <tr><td> useShapeConjunctions</td><td>boolean</td><td>false</td><td>Conjoin shape with tag or position</td></tr>
* <tr><td> useWordTag</td><td>boolean</td><td>false</td><td>Include word and tag pair features</td></tr>
* <tr><td> useLastRealWord</td><td>boolean</td><td>false</td><td>Iff the prev word is of length 3 or less, add an extra feature that combines the word two back and the current word's shape. <i>Weird!</i></td></tr>
* <tr><td> useNextRealWord</td><td>boolean</td><td>false</td><td>Iff the next word is of length 3 or less, add an extra feature that combines the word after next and the current word's shape. <i>Weird!</i></td></tr>
* <tr><td> useTitle</td><td>boolean</td><td>false</td><td>Match a word against a list of name titles (Mr, Mrs, etc.)</td></tr>
* <tr><td> useOccurrencePatterns</td><td>boolean</td><td>false</td><td>This is a very engineered feature designed to capture multiple references to names. If the current word isn't capitalized, followed by a non-capitalized word, and preceded by a word with alphabetic characters, it returns NO-OCCURRENCE-PATTERN. Otherwise, if the previous word is a capitalized NNP, then if in the next 150 words you find this PW-W sequence, you get XY-NEXT-OCCURRENCE-XY, else if you find W you get XY-NEXT-OCCURRENCE-Y. Similarly for backwards and XY-PREV-OCCURRENCE-XY and XY-PREV-OCCURRENCE-Y. Else (if the previous word isn't a capitalized NNP), under analogous rules you get one or more of X-NEXT-OCCURRENCE-YX, X-NEXT-OCCURRENCE-XY, X-NEXT-OCCURRENCE-X, X-PREV-OCCURRENCE-YX, X-PREV-OCCURRENCE-XY, X-PREV-OCCURRENCE-X.</td></tr>
* <tr><td> useTypeySequences</td><td>boolean</td><td>false</td><td>Some first order word shape patterns.</td></tr>
* <tr><td> useGenericFeatures</td><td>boolean</td><td>false</td><td>If true, any features you include in the map will be incorporated into the model with values equal to those given in the file; values are treated as strings unless you use the "realValued" option (described below)</td></tr>
* <tr><td> justify</td><td>boolean</td><td>false</td><td>Print out all
* feature/class pairs and their weight, and then for each input data
* point, print justification (weights) for active features</td></tr>
* <tr><td> normalize</td><td>boolean</td><td>false</td><td>For the CMMClassifier (only) if this is true then the Scorer normalizes scores as probabilities.</td></tr>
* <tr><td> useHuber</td><td>boolean</td><td>false</td><td>Use a Huber loss prior rather than the default quadratic loss.</td></tr>
* <tr><td> useQuartic</td><td>boolean</td><td>false</td><td>Use a Quartic prior rather than the default quadratic loss.</td></tr>
* <tr><td> sigma</td><td>double</td><td>1.0</td><td></td></tr>
* <tr><td> epsilon</td><td>double</td><td>0.01</td><td>Used only as a parameter in the Huber loss: this is the distance from 0 at which the loss changes from quadratic to linear</td></tr>
* <tr><td> beamSize</td><td>int</td><td>30</td><td></td></tr>
* <tr><td> maxLeft</td><td>int</td><td>2</td><td>The number of things to the left that have to be cached to run the Viterbi algorithm: the maximum context of class features used.</td></tr>
* <tr><td> dontExtendTaggy</td><td>boolean</td><td>false</td><td>Don't extend the range of useTaggySequences when maxLeft is increased.</td></tr>
* <tr><td> numFolds </td><td>int</td><td>1</td><td>The number of folds to use for cross-validation.</td></tr>
* <tr><td> startFold </td><td>int</td><td>1</td><td>The starting fold to run.</td></tr>
* <tr><td> numFoldsToRun </td><td>int</td><td>1</td><td>The number of folds to run.</td></tr>
* <tr><td> mergeTags </td><td>boolean</td><td>false</td><td>Whether to merge B- and I- tags.</td></tr>
* <tr><td> splitDocuments</td><td>boolean</td><td>true</td><td>Whether or not to split the data into separate documents for training/testing</td></tr>
* <tr><td> maxDocSize</td><td>int</td><td>10000</td><td>If this number is greater than 0, attempt to split documents bigger than this value into multiple documents at sentence boundaries during testing; otherwise do nothing.</td></tr>
* </table>
* <p/>
* Note: flags/properties overwrite left to right. That is, the parameter
* setting specified <i>last</i> is the one used.
* <p/>
* <pre>
* DOCUMENTATION ON FEATURE TEMPLATES
* <p/>
* w = word
* t = tag
* p = position (word index in sentence)
* c = class
* p = paren
* g = gazette
* a = abbrev
* s = shape
* r = regent (dependency governor)
* h = head word of phrase
* n(w) = ngrams from w
* g(w) = gazette entries containing w
* l(w) = length of w
* o(...) = occurrence patterns of words
* <p/>
* useReverse reverses meaning of prev, next everywhere below (on in macro)
* <p/>
* "Prolog" booleans: , = AND and ; = OR
* <p/>
* Mac: Y = turned on in -macro,
* + = additional positive things relative to -macro for CoNLL NERFeatureFactory
* (perhaps none...)
* - = Known negative for CoNLL NERFeatureFactory relative to -macro
* <p/>
* Bio: + = additional things that are positive for BioCreative
* - = things negative relative to -macro
* <p/>
* HighMagnitude: There are no (0) to a few (+) to many (+++) high weight
* features of this template. (? = not used in goodCoNLL, but usually = 0)
* <p/>
* Feature Mac Bio CRFFlags HighMagnitude
* ---------------------------------------------------------------------
* w,c Y useWord 0 (useWord is almost useless with unlimited ngram features, but helps a fraction in goodCoNLL, if only because of prior fiddling
* p,c usePosition ?
* p=0,c useBeginSent ?
* p=0,s,c useBeginSent ?
* t,c Y useTags ++
* pw,c Y usePrev +
* pt,c Y usePrev,useTags 0
* nw,c Y useNext ++
* nt,c Y useNext,useTags 0
* pw,w,c Y useWordPairs +
* w,nw,c Y useWordPairs +
* pt,t,nt,c useSymTags ?
* t,nt,c useSymTags ?
* pt,t,c useSymTags ?
* pw,nw,c useSymWordPairs ?
* <p/>
* pc,c Y usePrev,useSequences,usePrevSequences +++
* pc,w,c Y usePrev,useSequences,usePrevSequences 0
* nc,c useNext,useSequences,useNextSequences ?
* w,nc,c useNext,useSequences,useNextSequences ?
* pc,nc,c useNext,usePrev,useSequences,usePrevSequences,useNextSequences ?
* w,pc,nc,c useNext,usePrev,useSequences,usePrevSequences,useNextSequences ?
* <p/>
* (pw;p2w;p3w;p4w),c + useDisjunctive (out to disjunctionWidth now) +++
* (nw;n2w;n3w;n4w),c + useDisjunctive (out to disjunctionWidth now) ++++
* (pw;p2w;p3w;p4w),s,c + useDisjunctiveShapeInteraction ?
* (nw;n2w;n3w;n4w),s,c + useDisjunctiveShapeInteraction ?
* (pw;p2w;p3w;p4w),c + useWideDisjunctive (to wideDisjunctionWidth) ?
* (nw;n2w;n3w;n4w),c + useWideDisjunctive (to wideDisjunctionWidth) ?
* (ps;p2s;p3s;p4s),c useDisjShape (out to disjunctionWidth now) ?
* (ns;n2s;n3s;n4s),c useDisjShape (out to disjunctionWidth now) ?
* <p/>
* pt,pc,t,c Y useTaggySequences +
* p2t,p2c,pt,pc,t,c Y useTaggySequences,maxLeft>=2 +
* p3t,p3c,p2t,p2c,pt,pc,t,c Y useTaggySequences,maxLeft>=3,!dontExtendTaggy ?
* p2c,pc,c Y useLongSequences ++
* p3c,p2c,pc,c Y useLongSequences,maxLeft>=3 ?
* p4c,p3c,p2c,pc,c Y useLongSequences,maxLeft>=4 ?
* p2c,pc,c,pw=BOUNDARY useBoundarySequences 0 (OK, but!)
* <p/>
* p2t,pt,t,c - useExtraTaggySequences ?
* p3t,p2t,pt,t,c - useExtraTaggySequences ?
* <p/>
* p2t,pt,t,s,p2c,pc,c - useTaggySequencesShapeInteraction ?
* p3t,p2t,pt,t,s,p3c,p2c,pc,c useTaggySequencesShapeInteraction ?
* <p/>
* s,pc,c Y useTypeySequences ++
* ns,pc,c Y useTypeySequences // error for ps? not? 0
* ps,pc,s,c Y useTypeySequences 0
* // p2s,p2c,ps,pc,s,c Y useTypeySequences,maxLeft>=2 // duplicated a useTypeSeqs2 feature
* <p/>
* n(w),c Y useNGrams (noMidNGrams, MaxNGramLeng, lowercaseNGrams, dehyphenateNGrams) +++
* n(w),s,c useNGrams,conjoinShapeNGrams ?
* <p/>
* g,c + useGazFeatures // test refining this? ?
* pg,pc,c + useGazFeatures ?
* ng,c + useGazFeatures ?
* // pg,g,c useGazFeatures ?
* // pg,g,ng,c useGazFeatures ?
* // p2g,p2c,pg,pc,g,c useGazFeatures ?
* g,w,c useMoreGazFeatures ?
* pg,pc,g,c useMoreGazFeatures ?
* g,ng,c useMoreGazFeatures ?
* <p/>
* g(w),c useGazette,sloppyGazette (contains same word) ?
* g(w),[pw,nw,...],c useGazette,cleanGazette (entire entry matches) ?
* <p/>
* s,c Y wordShape >= 0 +++
* ps,c Y wordShape >= 0,useTypeSeqs +
* ns,c Y wordShape >= 0,useTypeSeqs +
* pw,s,c Y wordShape >= 0,useTypeSeqs +
* s,nw,c Y wordShape >= 0,useTypeSeqs +
* ps,s,c Y wordShape >= 0,useTypeSeqs 0
* s,ns,c Y wordShape >= 0,useTypeSeqs ++
* ps,s,ns,c Y wordShape >= 0,useTypeSeqs ++
* pc,ps,s,c Y wordShape >= 0,useTypeSeqs,useTypeSeqs2 0
* p2c,p2s,pc,ps,s,c Y wordShape >= 0,useTypeSeqs,useTypeSeqs2,maxLeft>=2 +++
* pc,ps,s,ns,c wordShape >= 0,useTypeSeqs,useTypeSeqs3 ?
* <p/>
* p2w,s,c if l(pw) <= 3 Y useLastRealWord // weird features, but work 0
* n2w,s,c if l(nw) <= 3 Y useNextRealWord ++
* o(pw,w,nw),c Y useOccurrencePatterns // don't fully grok but has to do with capitalized name patterns ++
* <p/>
* a,c useAbbr;useMinimalAbbr
* pa,a,c useAbbr
* a,na,c useAbbr
* pa,a,na,c useAbbr
* pa,pc,a,c useAbbr;useMinimalAbbr
* p2a,p2c,pa,pc,a useAbbr
* w,a,c useMinimalAbbr
* p2a,p2c,a,c useMinimalAbbr
* <p/>
* RESTR. w,(pw,pc;p2w,p2c;p3w,p3c;p4w,p4c) + useParenMatching,maxLeft>=n
* <p/>
* c - useClassFeature
* <p/>
* p,s,c - useShapeConjunctions
* t,s,c - useShapeConjunctions
* <p/>
* w,t,c + useWordTag ?
* w,pt,c + useWordTag ?
* w,nt,c + useWordTag ?
* <p/>
* r,c useNPGovernor (only for baseNP words)
* r,t,c useNPGovernor (only for baseNP words)
* h,c useNPHead (only for baseNP words)
* h,t,c useNPHead (only for baseNP words)
* <p/>
* </pre>
*
* @author Dan Klein
* @author Jenny Finkel
* @author Christopher Manning
* @author Shipra Dingare
* @author Huy Nguyen
*/
public class NERFeatureFactory<IN extends CoreLabel> extends FeatureFactory<IN> {
private static final long serialVersionUID = -2329726064739185544L;
public NERFeatureFactory() {
super();
}
/**
 * Initializes this factory: runs the superclass initialization with the
 * given flags, then the gazette initialization, and finally loads the
 * distsim lexicon when {@code flags.useDistSim} is set.
 *
 * @param flags The classifier flags that configure feature extraction
 */
public void init(SeqClassifierFlags flags) {
  super.init(flags);
  initGazette();
  if (!flags.useDistSim) {
    return;
  }
  initLexicon(flags);
}
/**
 * Extracts the features of one clique at one position of the input.
 * <p>
 * Each feature produced by the clique's extractor is added with a suffix
 * naming the clique (e.g. "C", "CpC"). If the first token of the input
 * carries a {@code DomainAnnotation}, every feature is additionally added
 * with the suffix <i>domain</i>-<i>clique</i>.
 *
 * @param cInfo The token sequence (a padded list of CoreLabel subtypes)
 * @param loc The index at which to extract features
 * @param clique The clique whose features are wanted; it is compared by
 *               identity against the known clique constants
 * @return The set of feature strings for that clique at that position;
 *         empty if the clique is not one of the recognized ones
 */
@Override
public Collection<String> getCliqueFeatures(PaddedList<IN> cInfo, int loc, Clique clique) {
  Collection<String> features = new HashSet<String>();

  CoreLabel head = cInfo.get(0);
  boolean doFE = head.containsKey(DomainAnnotation.class);
  String domain = null;
  if (doFE) {
    domain = head.get(DomainAnnotation.class);
  }

  if (clique == cliqueC) {
    // 200710: tried making this clique null; didn't improve performance (rafferty)
    addSuffixedWithDomain(features, featuresC(cInfo, loc), "C", doFE, domain);
  } else if (clique == cliqueCpC) {
    addSuffixedWithDomain(features, featuresCpC(cInfo, loc), "CpC", doFE, domain);
    // the "next class" features are read off the position to the left
    addSuffixedWithDomain(features, featuresCnC(cInfo, loc - 1), "CnC", doFE, domain);
  } else if (clique == cliqueCp2C) {
    addSuffixedWithDomain(features, featuresCp2C(cInfo, loc), "Cp2C", doFE, domain);
  } else if (clique == cliqueCp3C) {
    addSuffixedWithDomain(features, featuresCp3C(cInfo, loc), "Cp3C", doFE, domain);
  } else if (clique == cliqueCp4C) {
    addSuffixedWithDomain(features, featuresCp4C(cInfo, loc), "Cp4C", doFE, domain);
  } else if (clique == cliqueCp5C) {
    addSuffixedWithDomain(features, featuresCp5C(cInfo, loc), "Cp5C", doFE, domain);
  } else if (clique == cliqueCpCp2C) {
    addSuffixedWithDomain(features, featuresCpCp2C(cInfo, loc), "CpCp2C", doFE, domain);
    addSuffixedWithDomain(features, featuresCpCnC(cInfo, loc - 1), "CpCnC", doFE, domain);
  } else if (clique == cliqueCpCp2Cp3C) {
    addSuffixedWithDomain(features, featuresCpCp2Cp3C(cInfo, loc), "CpCp2Cp3C", doFE, domain);
  } else if (clique == cliqueCpCp2Cp3Cp4C) {
    addSuffixedWithDomain(features, featuresCpCp2Cp3Cp4C(cInfo, loc), "CpCp2Cp3Cp4C", doFE, domain);
  }

  return features;
}

/**
 * Adds {@code extracted} to {@code features} under {@code suffix} and, when
 * {@code domainSpecific} is true, a second time under
 * {@code domain + "-" + suffix}.
 */
private void addSuffixedWithDomain(Collection<String> features, Collection<String> extracted,
                                   String suffix, boolean domainSpecific, String domain) {
  addAllInterningAndSuffixing(features, extracted, suffix);
  if (domainSpecific) {
    addAllInterningAndSuffixing(features, extracted, domain + "-" + suffix);
  }
}
// TODO: when breaking serialization, it seems like it would be better to
// move the lexicon into (Abstract)SequenceClassifier and to do this
// annotation as part of the ObjectBankWrapper. But note that it is
// serialized in this object currently and it would then need to be
// serialized elsewhere or loaded each time
/**
 * Distributional-similarity lexicon: maps a word to its distsim class string.
 * Filled by initLexicon, where keys are lowercased unless flags.casedDistSim
 * is set and are passed through WordShapeClassifier.wordShape(..., WORDSHAPEDIGITS)
 * when flags.numberEquivalenceDistSim is set. Read by distSimAnnotate and
 * reset to null by clearMemory. Null until initLexicon has loaded a lexicon.
 */
private Map<String,String> lexicon;
/**
 * Loads the distributional-similarity lexicon named by
 * {@code flags.distSimLexicon} into {@link #lexicon}.
 * <p>
 * Does nothing if no lexicon file is configured or a lexicon is already
 * loaded. Two line formats are understood, selected by
 * {@code flags.distSimFileFormat}:
 * <ul>
 * <li>"terryKoo": tab-separated, class first and word second; the class is
 *     truncated to {@code flags.distSimMaxBits} characters when that value
 *     is positive;</li>
 * <li>anything else ("alexClark"): whitespace-separated, word first and
 *     class second.</li>
 * </ul>
 * Words are lowercased unless {@code flags.casedDistSim} is set, and mapped
 * through the WORDSHAPEDIGITS word shape when
 * {@code flags.numberEquivalenceDistSim} is set. If a word occurs on several
 * lines, the last one wins.
 *
 * @param flags The flags naming the lexicon file, its encoding and format
 * @throws IllegalArgumentException if a line has fewer than two fields
 */
private void initLexicon(SeqClassifierFlags flags) {
  if (flags.distSimLexicon == null || lexicon != null) {
    return;
  }
  Timing.startDoing("Loading distsim lexicon from " + flags.distSimLexicon);
  // Build into a local map and publish it only once the whole file has been
  // read, so that a failed load cannot leave a partial lexicon behind that
  // later calls would mistake for a complete one.
  Map<String, String> loaded = new HashMap<String, String>();
  boolean terryKoo = "terryKoo".equals(flags.distSimFileFormat);
  for (String line : ObjectBank.getLineIterator(flags.distSimLexicon,
                                                flags.inputEncoding)) {
    String[] bits = line.split(terryKoo ? "\\t" : "\\s+");
    if (bits.length < 2) {
      // Previously surfaced as a bare ArrayIndexOutOfBoundsException.
      throw new IllegalArgumentException("Malformed line in distsim lexicon "
          + flags.distSimLexicon + " (expected at least 2 fields): \"" + line + "\"");
    }
    String word;
    String wordClass;
    if (terryKoo) {
      word = bits[1];
      wordClass = bits[0];
      if (flags.distSimMaxBits > 0 && wordClass.length() > flags.distSimMaxBits) {
        wordClass = wordClass.substring(0, flags.distSimMaxBits);
      }
    } else {
      // "alexClark"
      word = bits[0];
      wordClass = bits[1];
    }
    if ( ! flags.casedDistSim) {
      word = word.toLowerCase();
    }
    if (flags.numberEquivalenceDistSim) {
      word = WordShapeClassifier.wordShape(word, WordShapeClassifier.WORDSHAPEDIGITS);
    }
    loaded.put(word, wordClass);
  }
  lexicon = loaded;
  Timing.endDoing();
}
/**
 * Sets the {@code DistSimAnnotation} of each token in {@code info} to the
 * class its word has in {@link #lexicon}, or to
 * {@code flags.unknownWordDistSimClass} when the word is not in the lexicon.
 * <p>
 * The word is normalized before lookup: lowercased unless
 * {@code flags.casedDistSim}, and mapped through the WORDSHAPEDIGITS word
 * shape when {@code flags.numberEquivalenceDistSim}. The method returns as
 * soon as it meets a token that already has a {@code DistSimAnnotation},
 * leaving that token and all following ones untouched. The lexicon is
 * dereferenced without a null check.
 *
 * @param info The tokens to annotate
 */
private void distSimAnnotate(PaddedList<IN> info) {
  for (CoreLabel token : info) {
    if (token.has(DistSimAnnotation.class)) {
      return;
    }
    String key = token.getString(TextAnnotation.class);
    key = flags.casedDistSim ? key : key.toLowerCase();
    if (flags.numberEquivalenceDistSim) {
      key = WordShapeClassifier.wordShape(key, WordShapeClassifier.WORDSHAPEDIGITS);
    }
    String cluster = lexicon.get(key);
    token.set(DistSimAnnotation.class,
              cluster != null ? cluster : flags.unknownWordDistSimClass);
  }
}
// Map from a word to a collection of strings; replaced by a fresh empty map
// in clearMemory(). NOTE(review): presumably the n-gram feature cache behind
// the cacheNGrams option -- its readers and writers are not in this part of
// the file; confirm.
private Map<String,Collection<String>> wordToSubstrings = new HashMap<String,Collection<String>>();
/**
 * Releases the data cached by this factory: drops the reference to the
 * distsim lexicon and replaces the word-to-substrings map with a new,
 * empty one.
 */
public void clearMemory() {
  lexicon = null;
  wordToSubstrings = new HashMap<String,Collection<String>>();
}
/**
 * Removes word-internal hyphens from a word that is padded with one
 * boundary character on each side (e.g. {@code "<co-op>"}).
 * <p>
 * Hyphens that are the first or last character of the word proper (index 1
 * and index length-2 of the padded string) are kept, as is everything after
 * the first hyphen that is not internal.
 *
 * @param str The padded word
 * @return The padded word without its internal hyphens
 */
private static String dehyphenate(String str) {
  StringBuilder sb = new StringBuilder(str);
  // Start at 2: index 0 is the padding, index 1 would be a leading hyphen.
  int hyphen = sb.indexOf("-", 2);
  // Compare against the CURRENT length: measuring against the original
  // length made a trailing hyphen look internal once an earlier hyphen had
  // been deleted, so "<a-b->" lost its final hyphen as well.
  while (hyphen >= 0 && hyphen < sb.length() - 2) {
    sb.deleteCharAt(hyphen);
    hyphen = sb.indexOf("-", hyphen);
  }
  return sb.toString();
}
/**
 * Matches the lowercase spelled-out Greek letter names that greekify
 * collapses. Compiled once rather than on every call.
 */
private static final Pattern greekLetterPattern = Pattern.compile(
    "(alpha)|(beta)|(gamma)|(delta)|(epsilon)|(zeta)|(kappa)|(lambda)|(rho)|(sigma)|(tau)|(upsilon)|(omega)");

/**
 * Replaces every occurrence of a spelled-out Greek letter name in the
 * string with a single "~". Matching is case-sensitive (lowercase names
 * only) and applies anywhere in the string, including inside longer words.
 *
 * @param str The string to transform
 * @return The string with each matched Greek letter name replaced by "~"
 */
private static String greekify(String str) {
  return greekLetterPattern.matcher(str).replaceAll("~");
}
/* end methods that do transformations */
/*
* static booleans that check strings for certain qualities *
*/
// cdm: this could be improved to handle more name types, such as
// O'Reilly, DeGuzman, etc. (need a little classifier?!?)
/**
 * Tests whether a string is capitalised like a simple proper name: at least
 * two characters long, first character upper case or title case, and no
 * upper-case character anywhere after the first.
 *
 * @param str The string to test
 * @return true if the string has this capitalisation pattern
 */
private static boolean isNameCase(String str) {
  final int len = str.length();
  if (len < 2) {
    return false;
  }
  final char first = str.charAt(0);
  final boolean startsCapitalised = Character.isUpperCase(first) || Character.isTitleCase(first);
  if ( ! startsCapitalised) {
    return false;
  }
  // Advance past every non-upper-case character; reaching the end means none was upper case.
  int idx = 1;
  while (idx < len && ! Character.isUpperCase(str.charAt(idx))) {
    idx++;
  }
  return idx == len;
}
/**
 * Tests whether a non-empty string contains no upper-case character.
 *
 * @param str The string to test
 * @return false for the empty string or if any character is upper case; true otherwise
 */
private static boolean noUpperCase(String str) {
  if (str.length() == 0) {
    return false;
  }
  for (char ch : str.toCharArray()) {
    if (Character.isUpperCase(ch)) {
      return false;
    }
  }
  return true;
}
/**
 * Tests whether a string contains at least one letter, as judged by
 * Character.isLetter on each char.
 *
 * @param str The string to test
 * @return true if some character is a letter; false otherwise, including for the empty string
 */
private static boolean hasLetter(String str) {
  final int len = str.length();
  boolean found = false;
  int idx = 0;
  while ( ! found && idx < len) {
    found = Character.isLetter(str.charAt(idx));
    idx++;
  }
  return found;
}
// Matches (case-insensitively) either a run of one or more English number words,
// each optionally followed by a hyphen, or a digit string followed by st/nd/rd/th.
// Note that the word list contains cardinals ("one", "twenty", "hundred") as well as
// ordinals, so a token such as "twenty" matches on its own.
private static final Pattern ordinalPattern = Pattern.compile("(?:(?:first|second|third|fourth|fifth|"+
"sixth|seventh|eighth|ninth|tenth|"+
"eleventh|twelfth|thirteenth|"+
"fourteenth|fifteenth|sixteenth|"+
"seventeenth|eighteenth|nineteenth|"+
"twenty|twentieth|thirty|thirtieth|"+
"forty|fortieth|fifty|fiftieth|"+
"sixty|sixtieth|seventy|seventieth|"+
"eighty|eightieth|ninety|ninetieth|"+
"one|two|three|four|five|six|seven|"+
"eight|nine|hundred|hundredth)-?)+|[0-9]+(?:st|nd|rd|th)", Pattern.CASE_INSENSITIVE);
// Matches a token made up only of ASCII digits.
private static final Pattern numberPattern = Pattern.compile("[0-9]+");
// Matches (case-insensitively) a token that is exactly an ordinal suffix: st, nd, rd or th.
private static final Pattern ordinalEndPattern = Pattern.compile("(?:st|nd|rd|th)", Pattern.CASE_INSENSITIVE);
/**
 * Decides whether the token at a position is (part of) an ordinal expression.
 * The token qualifies when:
 * its text fully matches ordinalPattern; or
 * it is a digit string and the next token is an ordinal suffix (st/nd/rd/th); or
 * it is an ordinal suffix and the previous token is a digit string; or
 * it is "-" and both neighbouring tokens fully match ordinalPattern.
 * Neighbours are only consulted when pos + 1 is below the list size (next)
 * or pos is greater than zero (previous).
 *
 * @param wordInfos The token sequence
 * @param pos Index of the token to test
 * @return true if the token is judged to be ordinal by the rules above
 */
private static boolean isOrdinal(List<? extends CoreLabel> wordInfos, int pos) {
  final String text = wordInfos.get(pos).getString(TextAnnotation.class);
  if (ordinalPattern.matcher(text).matches()) {
    return true;
  }
  final boolean hasNext = pos + 1 < wordInfos.size();
  final boolean hasPrev = pos > 0;
  if (numberPattern.matcher(text).matches()) {
    // A bare number is ordinal only when its suffix follows as a separate token.
    if ( ! hasNext) {
      return false;
    }
    String nextText = wordInfos.get(pos + 1).getString(TextAnnotation.class);
    return ordinalEndPattern.matcher(nextText).matches();
  }
  if (hasPrev && ordinalEndPattern.matcher(text).matches()) {
    String prevText = wordInfos.get(pos - 1).getString(TextAnnotation.class);
    if (numberPattern.matcher(prevText).matches()) {
      return true;
    }
  }
  if (hasPrev && hasNext && text.equals("-")) {
    // A hyphen joining two ordinal words, as in "twenty - first".
    CoreLabel before = wordInfos.get(pos - 1);
    CoreLabel after = wordInfos.get(pos + 1);
    if (ordinalPattern.matcher(before.getString(TextAnnotation.class)).matches()) {
      return ordinalPattern.matcher(after.getString(TextAnnotation.class)).matches();
    }
  }
  return false;
}
/* end static booleans that check strings for certain qualities */
/**
 * Gazette Stuff.
 * <p>
 * A GazetteInfo records one word position inside one gazette phrase. It is
 * filled in by readGazette and checked in featuresC, which emits
 * {@code feature} only when every entry of {@code words} equals the text of
 * the corresponding token around the current position.
 * The class does not override equals or hashCode, so collections of
 * GazetteInfo compare instances by identity.
 */
private static class GazetteInfo implements Serializable {
// Feature string to emit on a full phrase match; readGazette sets it to type + "-GAZ" + number of words.
String feature = "";
// Zero-based index, within words, of the word this info is filed under.
int loc = 0;
// All tokens of the gazette phrase, in order.
String[] words = StringUtils.EMPTY_STRING_ARRAY;
private static final long serialVersionUID = -5903728481621584810L;
} // end class GazetteInfo
// "Sloppy" gazette index: word -> feature strings (type + "-GAZ" + phrase length) of every
// gazette phrase containing that word. Filled by readGazette when flags.sloppyGazette is set.
private Map<String,Collection<String>> wordToGazetteEntries = new HashMap<String,Collection<String>>();
// "Clean" gazette index: word -> one GazetteInfo per (phrase, position) at which the word occurs.
// Filled by readGazette when flags.cleanGazette is set.
private Map<String,Collection<GazetteInfo>> wordToGazetteInfos = new HashMap<String,Collection<GazetteInfo>>();
/**
 * Loads a gazette into the word-indexed gazette maps. Each line is expected
 * to hold a class name (a String without whitespace), then whitespace, then a
 * phrase of one or more tokens separated by a single space. Lines that do
 * not have this form are ignored.
 * <p>
 * For every token of a phrase: if flags.sloppyGazette is set, the feature
 * "CLASS-GAZn" (n being the number of tokens in the phrase) is recorded under
 * that token in wordToGazetteEntries; if flags.cleanGazette is set, a
 * GazetteInfo holding the same feature, the token's index and the whole
 * phrase is recorded under that token in wordToGazetteInfos.
 * The reader is not closed by this method.
 *
 * @param in Where to read the gazette from
 * @throws IOException If IO errors
 */
private void readGazette(BufferedReader in) throws IOException {
  final Pattern linePattern = Pattern.compile("^(\\S+)\\s+(.+)$");
  String line = in.readLine();
  while (line != null) {
    Matcher lineMatcher = linePattern.matcher(line);
    if (lineMatcher.matches()) {
      String type = intern(lineMatcher.group(1));
      String[] tokens = lineMatcher.group(2).split(" ");
      int position = 0;
      for (String rawToken : tokens) {
        String token = intern(rawToken);
        if (flags.sloppyGazette) {
          Collection<String> features = wordToGazetteEntries.get(token);
          if (features == null) {
            features = new HashSet<String>();
            wordToGazetteEntries.put(token, features);
          }
          features.add(intern(type + "-GAZ" + tokens.length));
        }
        if (flags.cleanGazette) {
          Collection<GazetteInfo> records = wordToGazetteInfos.get(token);
          if (records == null) {
            records = new HashSet<GazetteInfo>();
            wordToGazetteInfos.put(token, records);
          }
          GazetteInfo record = new GazetteInfo();
          record.loc = position;
          record.words = tokens;
          record.feature = intern(type + "-GAZ" + tokens.length);
          records.add(record);
        }
        position++;
      }
    }
    line = in.readLine();
  }
}
// Cache of which annotation keys are generic annotations, so we don't have to do too many
// instanceof checks. Starts out null; featuresC calls makeGenericKeyCache to build it the
// first time flags.useGenericFeatures is acted on.
private HashSet<Class<? extends GenericAnnotation<?>>> genericAnnotationKeys; // = null;
/**
 * Rebuilds genericAnnotationKeys from the keys present on one token: every
 * key of the token that is also a key of CoreLabel.genericValues is stored.
 *
 * @param c The token whose key set is inspected
 */
@SuppressWarnings({"unchecked", "SuspiciousMethodCalls"})
private void makeGenericKeyCache(CoreLabel c) {
  genericAnnotationKeys = new HashSet<Class<? extends GenericAnnotation<?>>>();
  for (Class<?> candidate : c.keySet()) {
    if ( ! CoreLabel.genericValues.containsKey(candidate)) {
      continue;
    }
    // Membership in genericValues is what justifies the unchecked cast.
    genericAnnotationKeys.add((Class<? extends GenericAnnotation<?>>) candidate);
  }
}
// Name lists used by the checkNameList features. Each starts out null and is loaded by
// featuresC on first use from flags.lastNameList / flags.maleNameList / flags.femaleNameList,
// keeping the first whitespace-separated column of every line. Lookups use the upper-cased token.
private HashSet<String> lastNames; // = null;
private HashSet<String> maleNames; // = null;
private HashSet<String> femaleNames; // = null;
// Personal titles, optionally followed by a period; featuresC requires the whole token to match.
private final Pattern titlePattern = Pattern.compile("(Mr|Ms|Mrs|Dr|Miss|Sen|Judge|Sir)\\.?"); // todo: should make static final and add more titles
/**
 * Builds the feature strings for the token at position loc, looking at the
 * token itself and at its neighbours (loc-3 .. loc+2, and further for the
 * disjunctive, verb and gazette features). Which features are produced is
 * controlled entirely by flags.
 * <p>
 * The distributional-similarity and title features come first. The bulk of
 * the features then comes from exactly one of three branches: the full set
 * when both flags.useInternal and flags.useExternal are set, a word-internal
 * subset when only flags.useInternal is set, and a context-only subset when
 * only flags.useExternal is set. The two-stage, integer, generic-annotation,
 * topic, stacked-NER, Wordnet and proto features after the branches do not
 * depend on that choice.
 * <p>
 * Side effects visible in this method: distSimAnnotate may annotate the whole
 * list; the name lists and genericAnnotationKeys are built lazily; the n-gram
 * cache is filled when flags.cacheNGrams is set; and one line per generic key
 * is written to System.err when flags.useGenericFeatures is set.
 * <p>
 * NOTE(review): indices outside the sequence are passed to cInfo.get without
 * a check; presumably PaddedList returns its pad element for them (the verb
 * loops below compare against cInfo.getPad()) - confirm.
 *
 * @param cInfo The padded token sequence
 * @param loc Index of the token to produce features for
 * @return The feature strings, in the order they were generated
 * @throws RuntimeException Wrapping any exception raised while loading the name lists
 */
protected Collection<String> featuresC(PaddedList<IN> cInfo, int loc) {
CoreLabel c = cInfo.get(loc);
CoreLabel n = cInfo.get(loc + 1);
CoreLabel n2 = cInfo.get(loc + 2);
CoreLabel p = cInfo.get(loc - 1);
CoreLabel p2 = cInfo.get(loc - 2);
CoreLabel p3 = cInfo.get(loc - 3);
String cWord = c.getString(TextAnnotation.class);
String pWord = p.getString(TextAnnotation.class);
String nWord = n.getString(TextAnnotation.class);
String cShape = c.getString(ShapeAnnotation.class);
Collection<String> featuresC = new ArrayList<String>();
// Distributional-similarity features; the annotation pass must run before any DistSimAnnotation is read.
if (flags.useDistSim) {
distSimAnnotate(cInfo);
}
if (flags.useDistSim && flags.useMoreTags) {
featuresC.add(p.get(DistSimAnnotation.class) + '-' + cWord + "-PDISTSIM-CWORD");
}
if (flags.useDistSim) {
featuresC.add(c.get(DistSimAnnotation.class) + "-DISTSIM");
}
if (flags.useTitle) {
Matcher m = titlePattern.matcher(cWord);
if (m.matches()) {
featuresC.add("IS_TITLE");
}
}
// Branch 1 of 3: both word-internal and context features are enabled.
if (flags.useInternal && flags.useExternal ) {
if (flags.useWord) {
featuresC.add(cWord + "-WORD");
}
if (flags.use2W) {
featuresC.add(p2.getString(TextAnnotation.class) + "-P2W");
featuresC.add(n2.getString(TextAnnotation.class) + "-N2W");
}
if (flags.useLC) {
featuresC.add(cWord.toLowerCase() + "-CL");
featuresC.add(pWord.toLowerCase() + "-PL");
featuresC.add(nWord.toLowerCase() + "-NL");
}
if (flags.useUnknown) { // for true casing
featuresC.add(c.get(UnknownAnnotation.class)+"-UNKNOWN");
featuresC.add(p.get(UnknownAnnotation.class)+"-PUNKNOWN");
featuresC.add(n.get(UnknownAnnotation.class)+"-NUNKNOWN");
}
if (flags.useLemmas) {
String lem = c.getString(LemmaAnnotation.class);
if (! "".equals(lem)) {
featuresC.add(lem + "-LEM");
}
}
if (flags.usePrevNextLemmas) {
String plem = p.getString(LemmaAnnotation.class);
String nlem = n.getString(LemmaAnnotation.class);
if (! "".equals(plem)) {
featuresC.add(plem + "-PLEM");
}
if (! "".equals(nlem)) {
featuresC.add(nlem + "-NLEM");
}
}
// Name-list features: each list is loaded once, on first use, keeping the first column of every line.
if (flags.checkNameList) {
try {
if (lastNames == null) {
lastNames = new HashSet<String>();
for (String line : ObjectBank.getLineIterator(flags.lastNameList)) {
String[] cols = line.split("\\s+");
lastNames.add(cols[0]);
}
}
if (maleNames == null) {
maleNames = new HashSet<String>();
for (String line : ObjectBank.getLineIterator(flags.maleNameList)) {
String[] cols = line.split("\\s+");
maleNames.add(cols[0]);
}
}
if (femaleNames == null) {
femaleNames = new HashSet<String>();
for (String line : ObjectBank.getLineIterator(flags.femaleNameList)) {
String[] cols = line.split("\\s+");
femaleNames.add(cols[0]);
}
}
// The lookup is done on the upper-cased token, so the lists are presumably stored in upper case - TODO confirm.
String name = cWord.toUpperCase();
if (lastNames.contains(name)) {
featuresC.add("LAST_NAME");
}
if (maleNames.contains(name)) {
featuresC.add("MALE_NAME");
}
if (femaleNames.contains(name)) {
featuresC.add("FEMALE_NAME");
}
} catch (Exception e) {
throw new RuntimeException(e);
}
}
// Word-length bin: the first bin whose upper bound is >= the length, or an open-ended last bin.
// NOTE(review): an empty binnedLengths array reaches the first branch at i == 0 and indexes [-1].
if (flags.binnedLengths != null) {
int len = cWord.length();
String featureName = null;
for (int i = 0; i <= flags.binnedLengths.length; i++) {
if (i == flags.binnedLengths.length) {
featureName = "Len-" + flags.binnedLengths[flags.binnedLengths.length - 1] + "-Inf";
} else if (len <= flags.binnedLengths[i]) {
featureName = "Len-" + ((i == 0) ? 1 : flags.binnedLengths[i - 1]) + '-' + flags.binnedLengths[i];
break;
}
}
featuresC.add(featureName);
}
// Features that copy pre-computed annotations of the current, previous and next token.
if (flags.useABGENE) {
featuresC.add(c.get(AbgeneAnnotation.class) + "-ABGENE");
featuresC.add(p.get(AbgeneAnnotation.class) + "-PABGENE");
featuresC.add(n.get(AbgeneAnnotation.class) + "-NABGENE");
}
if (flags.useABSTRFreqDict) {
featuresC.add(c.get(AbstrAnnotation.class) + "-ABSTRACT" + c.get(FreqAnnotation.class) + "-FREQ" + c.getString(PartOfSpeechAnnotation.class) + "-TAG");
featuresC.add(c.get(AbstrAnnotation.class) + "-ABSTRACT" + c.get(DictAnnotation.class) + "-DICT" + c.getString(PartOfSpeechAnnotation.class) + "-TAG");
featuresC.add(c.get(AbstrAnnotation.class) + "-ABSTRACT" + c.get(DictAnnotation.class) + "-DICT" + c.get(FreqAnnotation.class) + "-FREQ" + c.getString(PartOfSpeechAnnotation.class) + "-TAG");
}
if (flags.useABSTR) {
featuresC.add(c.get(AbstrAnnotation.class) + "-ABSTRACT");
featuresC.add(p.get(AbstrAnnotation.class) + "-PABSTRACT");
featuresC.add(n.get(AbstrAnnotation.class) + "-NABSTRACT");
}
if (flags.useGENIA) {
featuresC.add(c.get(GeniaAnnotation.class) + "-GENIA");
featuresC.add(p.get(GeniaAnnotation.class) + "-PGENIA");
featuresC.add(n.get(GeniaAnnotation.class) + "-NGENIA");
}
if (flags.useWEBFreqDict) {
featuresC.add(c.get(WebAnnotation.class) + "-WEB" + c.get(FreqAnnotation.class) + "-FREQ" + c.getString(PartOfSpeechAnnotation.class) + "-TAG");
featuresC.add(c.get(WebAnnotation.class) + "-WEB" + c.get(DictAnnotation.class) + "-DICT" + c.getString(PartOfSpeechAnnotation.class) + "-TAG");
featuresC.add(c.get(WebAnnotation.class) + "-WEB" + c.get(DictAnnotation.class) + "-DICT" + c.get(FreqAnnotation.class) + "-FREQ" + c.getString(PartOfSpeechAnnotation.class) + "-TAG");
}
if (flags.useWEB) {
featuresC.add(c.get(WebAnnotation.class) + "-WEB");
featuresC.add(p.get(WebAnnotation.class) + "-PWEB");
featuresC.add(n.get(WebAnnotation.class) + "-NWEB");
}
if (flags.useIsURL) {
featuresC.add(c.get(IsURLAnnotation.class) + "-ISURL");
}
if (flags.useEntityRule) {
featuresC.add(c.get(EntityRuleAnnotation.class)+"-ENTITYRULE");
}
if (flags.useEntityTypes) {
featuresC.add(c.get(EntityTypeAnnotation.class) + "-ENTITYTYPE");
}
if (flags.useIsDateRange) {
featuresC.add(c.get(IsDateRangeAnnotation.class) + "-ISDATERANGE");
}
if (flags.useABSTRFreq) {
featuresC.add(c.get(AbstrAnnotation.class) + "-ABSTRACT" + c.get(FreqAnnotation.class) + "-FREQ");
}
if (flags.useFREQ) {
featuresC.add(c.get(FreqAnnotation.class) + "-FREQ");
}
if (flags.useMoreTags) {
featuresC.add(p.getString(PartOfSpeechAnnotation.class) + '-' + cWord + "-PTAG-CWORD");
}
if (flags.usePosition) {
featuresC.add(c.get(PositionAnnotation.class) + "-POSITION");
}
// A PositionAnnotation of "0" is treated as the sentence-initial token.
if (flags.useBeginSent) {
if ("0".equals(c.get(PositionAnnotation.class))) {
featuresC.add("BEGIN-SENT");
featuresC.add(cShape + "-BEGIN-SENT");
} else {
featuresC.add("IN-SENT");
featuresC.add(cShape + "-IN-SENT");
}
}
if (flags.useTags) {
featuresC.add(c.getString(PartOfSpeechAnnotation.class) + "-TAG");
}
if (flags.useOrdinal) {
if (isOrdinal(cInfo, loc)) {
featuresC.add("C_ORDINAL");
if (isOrdinal(cInfo, loc-1)) {
//System.err.print(p.getString(TextAnnotation.class)+" ");
featuresC.add("PC_ORDINAL");
}
//System.err.println(c.getString(TextAnnotation.class));
}
if (isOrdinal(cInfo, loc-1)) {
featuresC.add("P_ORDINAL");
}
}
if (flags.usePrev) {
featuresC.add(p.getString(TextAnnotation.class) + "-PW");
if (flags.useTags) {
featuresC.add(p.getString(PartOfSpeechAnnotation.class) + "-PTAG");
}
if (flags.useDistSim) {
featuresC.add(p.get(DistSimAnnotation.class) + "-PDISTSIM");
}
if (flags.useIsURL) {
featuresC.add(p.get(IsURLAnnotation.class) + "-PISURL");
}
if (flags.useEntityTypes) {
featuresC.add(p.get(EntityTypeAnnotation.class) + "-PENTITYTYPE");
}
}
if (flags.useNext) {
featuresC.add(n.getString(TextAnnotation.class) + "-NW");
if (flags.useTags) {
featuresC.add(n.getString(PartOfSpeechAnnotation.class) + "-NTAG");
}
if (flags.useDistSim) {
featuresC.add(n.get(DistSimAnnotation.class) + "-NDISTSIM");
}
if (flags.useIsURL) {
featuresC.add(n.get(IsURLAnnotation.class) + "-NISURL");
}
if (flags.useEntityTypes) {
featuresC.add(n.get(EntityTypeAnnotation.class) + "-NENTITYTYPE");
}
}
/*here, entityTypes refers to the type in the PASCAL IE challenge:
* i.e. certain words are tagged "Date" or "Location" */
// Both neighbours share the "-EW" suffix, so the feature does not say which side the word was on.
if (flags.useEitherSideWord) {
featuresC.add(pWord + "-EW");
featuresC.add(nWord + "-EW");
}
if (flags.useWordPairs) {
featuresC.add(cWord + '-' + pWord + "-W-PW");
featuresC.add(cWord + '-' + nWord + "-W-NW");
}
if (flags.useSymTags) {
if (flags.useTags) {
featuresC.add(p.getString(PartOfSpeechAnnotation.class) + '-' + c.getString(PartOfSpeechAnnotation.class) + '-' + n.getString(PartOfSpeechAnnotation.class) + "-PCNTAGS");
featuresC.add(c.getString(PartOfSpeechAnnotation.class) + '-' + n.getString(PartOfSpeechAnnotation.class) + "-CNTAGS");
featuresC.add(p.getString(PartOfSpeechAnnotation.class) + '-' + c.getString(PartOfSpeechAnnotation.class) + "-PCTAGS");
}
if (flags.useDistSim) {
featuresC.add(p.get(DistSimAnnotation.class) + '-' + c.get(DistSimAnnotation.class) + '-' + n.get(DistSimAnnotation.class) + "-PCNDISTSIM");
featuresC.add(c.get(DistSimAnnotation.class) + '-' + n.get(DistSimAnnotation.class) + "-CNDISTSIM");
featuresC.add(p.get(DistSimAnnotation.class) + '-' + c.get(DistSimAnnotation.class) + "-PCDISTSIM");
}
}
if (flags.useSymWordPairs) {
featuresC.add(pWord + '-' + nWord + "-SWORDS");
}
// Gazetteer-annotation features; a value equal to flags.dropGaz is suppressed.
// NOTE(review): .equals is invoked on the annotation value, so a token without a GazAnnotation would presumably throw a NullPointerException - confirm the annotation is always set.
if (flags.useGazFeatures) {
if (!c.get(GazAnnotation.class).equals(flags.dropGaz)) {
featuresC.add(c.get(GazAnnotation.class) + "-GAZ");
}
if (!n.get(GazAnnotation.class).equals(flags.dropGaz)) {
featuresC.add(n.get(GazAnnotation.class) + "-NGAZ");
}
if (!p.get(GazAnnotation.class).equals(flags.dropGaz)) {
featuresC.add(p.get(GazAnnotation.class) + "-PGAZ");
}
}
if (flags.useMoreGazFeatures) {
if (!c.get(GazAnnotation.class).equals(flags.dropGaz)) {
featuresC.add(c.get(GazAnnotation.class) + '-' + cWord + "-CG-CW-GAZ");
if (!n.get(GazAnnotation.class).equals(flags.dropGaz)) {
featuresC.add(c.get(GazAnnotation.class) + '-' + n.get(GazAnnotation.class) + "-CNGAZ");
}
if (!p.get(GazAnnotation.class).equals(flags.dropGaz)) {
featuresC.add(p.get(GazAnnotation.class) + '-' + c.get(GazAnnotation.class) + "-PCGAZ");
}
}
}
// Abbreviation features. The "1" variants emit the same feature strings but skip tokens whose AbbrAnnotation is "XX".
if (flags.useAbbr || flags.useMinimalAbbr) {
featuresC.add(c.get(AbbrAnnotation.class) + "-ABBR");
}
if (flags.useAbbr1 || flags.useMinimalAbbr1) {
if (!c.get(AbbrAnnotation.class).equals("XX")) {
featuresC.add(c.get(AbbrAnnotation.class) + "-ABBR");
}
}
if (flags.useAbbr) {
featuresC.add(p.get(AbbrAnnotation.class) + '-' + c.get(AbbrAnnotation.class) + "-PCABBR");
featuresC.add(c.get(AbbrAnnotation.class) + '-' + n.get(AbbrAnnotation.class) + "-CNABBR");
featuresC.add(p.get(AbbrAnnotation.class) + '-' + c.get(AbbrAnnotation.class) + '-' + n.get(AbbrAnnotation.class) + "-PCNABBR");
}
if (flags.useAbbr1) {
if (!c.get(AbbrAnnotation.class).equals("XX")) {
featuresC.add(p.get(AbbrAnnotation.class) + '-' + c.get(AbbrAnnotation.class) + "-PCABBR");
featuresC.add(c.get(AbbrAnnotation.class) + '-' + n.get(AbbrAnnotation.class) + "-CNABBR");
featuresC.add(p.get(AbbrAnnotation.class) + '-' + c.get(AbbrAnnotation.class) + '-' + n.get(AbbrAnnotation.class) + "-PCNABBR");
}
}
if (flags.useChunks) {
featuresC.add(p.get(ChunkAnnotation.class) + '-' + c.get(ChunkAnnotation.class) + "-PCCHUNK");
featuresC.add(c.get(ChunkAnnotation.class) + '-' + n.get(ChunkAnnotation.class) + "-CNCHUNK");
featuresC.add(p.get(ChunkAnnotation.class) + '-' + c.get(ChunkAnnotation.class) + '-' + n.get(ChunkAnnotation.class) + "-PCNCHUNK");
}
if (flags.useMinimalAbbr) {
featuresC.add(cWord + '-' + c.get(AbbrAnnotation.class) + "-CWABB");
}
if (flags.useMinimalAbbr1) {
if (!c.get(AbbrAnnotation.class).equals("XX")) {
featuresC.add(cWord + '-' + c.get(AbbrAnnotation.class) + "-CWABB");
}
}
// Nearest token on each side whose POS tag starts with "VB"; "X" is used when the pad element is reached first.
// prevVB / nextVB stay "" unless the corresponding flag is set, which is what the combined -PNVB feature then sees.
String prevVB = "", nextVB = "";
if (flags.usePrevVB) {
for (int j = loc - 1; ; j--) {
CoreLabel wi = cInfo.get(j);
if (wi == cInfo.getPad()) {
prevVB = "X";
featuresC.add("X-PVB");
break;
} else if (wi.getString(PartOfSpeechAnnotation.class).startsWith("VB")) {
featuresC.add(wi.getString(TextAnnotation.class) + "-PVB");
prevVB = wi.getString(TextAnnotation.class);
break;
}
}
}
if (flags.useNextVB) {
for (int j = loc + 1; ; j++) {
CoreLabel wi = cInfo.get(j);
if (wi == cInfo.getPad()) {
featuresC.add("X-NVB");
nextVB = "X";
break;
} else if (wi.getString(PartOfSpeechAnnotation.class).startsWith("VB")) {
featuresC.add(wi.getString(TextAnnotation.class) + "-NVB");
nextVB = wi.getString(TextAnnotation.class);
break;
}
}
}
if (flags.useVB) {
featuresC.add(prevVB + '-' + nextVB + "-PNVB");
}
if (flags.useShapeConjunctions) {
featuresC.add(c.get(PositionAnnotation.class) + cShape + "-POS-SH");
if (flags.useTags) {
featuresC.add(c.tag() + cShape + "-TAG-SH");
}
if (flags.useDistSim) {
featuresC.add(c.get(DistSimAnnotation.class) + cShape + "-DISTSIM-SH");
}
}
if (flags.useWordTag) {
featuresC.add(cWord + '-' + c.getString(PartOfSpeechAnnotation.class) + "-W-T");
featuresC.add(cWord + '-' + p.getString(PartOfSpeechAnnotation.class) + "-W-PT");
featuresC.add(cWord + '-' + n.getString(PartOfSpeechAnnotation.class) + "-W-NT");
}
if (flags.useNPHead) {
featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-HW");
if (flags.useTags) {
featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.getString(PartOfSpeechAnnotation.class) + "-HW-T");
}
if (flags.useDistSim) {
featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.get(DistSimAnnotation.class) + "-HW-DISTSIM");
}
}
if (flags.useNPGovernor) {
featuresC.add(c.get(GovernorAnnotation.class) + "-GW");
if (flags.useTags) {
featuresC.add(c.get(GovernorAnnotation.class) + '-' + c.getString(PartOfSpeechAnnotation.class) + "-GW-T");
}
if (flags.useDistSim) {
featuresC.add(c.get(GovernorAnnotation.class) + '-' + c.get(DistSimAnnotation.class) + "-DISTSIM-T1");
}
}
if (flags.useHeadGov) {
featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.get(GovernorAnnotation.class) + "-HW_GW");
}
// A constant feature that is the same for every token.
if (flags.useClassFeature) {
featuresC.add("###");
}
// The text of the token at index 0 of the sequence, added as-is with no suffix.
if (flags.useFirstWord) {
String firstWord = cInfo.get(0).getString(TextAnnotation.class);
featuresC.add(firstWord);
}
// Character n-grams of the word padded as <word>: every substring of length >= 2, optionally
// limited to prefixes/suffixes (noMidNGrams) and to a maximum length (maxNGramLeng), each wrapped in '#'.
// The cache is keyed by the raw word, so cached entries reflect the n-gram flags in force when they were built.
if (flags.useNGrams) {
Collection<String> subs = wordToSubstrings.get(cWord);
if (subs == null) {
subs = new ArrayList<String>();
String word = '<' + cWord + '>';
if (flags.lowercaseNGrams) {
word = word.toLowerCase();
}
if (flags.dehyphenateNGrams) {
word = dehyphenate(word);
}
if (flags.greekifyNGrams) {
word = greekify(word);
}
for (int i = 0; i < word.length(); i++) {
for (int j = i + 2; j <= word.length(); j++) {
if (flags.noMidNGrams && i != 0 && j != word.length()) {
continue;
}
if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
continue;
}
subs.add(intern('#' + word.substring(i, j) + '#'));
}
}
if (flags.cacheNGrams) {
wordToSubstrings.put(cWord, subs);
}
}
featuresC.addAll(subs);
if (flags.conjoinShapeNGrams) {
for (String str : subs) {
String feat = str + '-' + cShape + "-CNGram-CS";
featuresC.add(feat);
}
}
}
// Gazette features from the maps filled by readGazette.
if (flags.useGazettes) {
// Sloppy: any gazette phrase containing the current word fires, whatever the surrounding tokens are.
if (flags.sloppyGazette) {
Collection<String> entries = wordToGazetteEntries.get(cWord);
if (entries != null) {
featuresC.addAll(entries);
}
}
// Clean: the feature fires only if every word of the phrase equals the token at the aligned position.
if (flags.cleanGazette) {
Collection<GazetteInfo> infos = wordToGazetteInfos.get(cWord);
if (infos != null) {
for (GazetteInfo gInfo : infos) {
boolean ok = true;
for (int gLoc = 0; gLoc < gInfo.words.length; gLoc++) {
ok &= gInfo.words[gLoc].equals(cInfo.get(loc + gLoc - gInfo.loc).getString(TextAnnotation.class));
}
if (ok) {
featuresC.add(gInfo.feature);
}
}
}
}
}
// Word-shape features of the current token and, with useTypeSeqs, of its neighbours.
if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
featuresC.add(cShape + "-TYPE");
if (flags.useTypeSeqs) {
String pShape = p.get(ShapeAnnotation.class);
String nShape = n.get(ShapeAnnotation.class);
featuresC.add(pShape + "-PTYPE");
featuresC.add(nShape + "-NTYPE");
featuresC.add(pWord + "..." + cShape + "-PW_CTYPE");
featuresC.add(cShape + "..." + nWord + "-NW_CTYPE");
featuresC.add(pShape + "..." + cShape + "-PCTYPE");
featuresC.add(cShape + "..." + nShape + "-CNTYPE");
featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE");
}
}
// When the adjacent word is short (at most 3 characters), also look one token further out.
if (flags.useLastRealWord) {
if (pWord.length() <= 3) {
// extending this to check for 2 short words doesn't seem to help....
featuresC.add(p2.getString(TextAnnotation.class) + "..." + cShape + "-PPW_CTYPE");
}
}
if (flags.useNextRealWord) {
if (nWord.length() <= 3) {
// extending this to check for 2 short words doesn't seem to help....
featuresC.add(n2.getString(TextAnnotation.class) + "..." + cShape + "-NNW_CTYPE");
}
}
if (flags.useOccurrencePatterns) {
featuresC.addAll(occurrencePatterns(cInfo, loc));
}
// Disjunctive features: the words within a window on either side, without recording their distance.
if (flags.useDisjunctive) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
CoreLabel dn = cInfo.get(loc + i);
CoreLabel dp = cInfo.get(loc - i);
featuresC.add(dn.getString(TextAnnotation.class) + "-DISJN");
if (flags.useDisjunctiveShapeInteraction) {
featuresC.add(dn.getString(TextAnnotation.class) + '-' + cShape + "-DISJN-CS");
}
featuresC.add(dp.getString(TextAnnotation.class) + "-DISJP");
if (flags.useDisjunctiveShapeInteraction) {
featuresC.add(dp.getString(TextAnnotation.class) + '-' + cShape + "-DISJP-CS");
}
}
}
if (flags.useWideDisjunctive) {
for (int i = 1; i <= flags.wideDisjunctionWidth; i++) {
featuresC.add(cInfo.get(loc + i).getString(TextAnnotation.class) + "-DISJWN");
featuresC.add(cInfo.get(loc - i).getString(TextAnnotation.class) + "-DISJWP");
}
}
if (flags.useEitherSideDisjunctive) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
featuresC.add(cInfo.get(loc + i).getString(TextAnnotation.class) + "-DISJWE");
featuresC.add(cInfo.get(loc - i).getString(TextAnnotation.class) + "-DISJWE");
}
}
// Only the following-context shape features are active; the preceding-context ones are commented out.
if (flags.useDisjShape) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
featuresC.add(cInfo.get(loc + i).get(ShapeAnnotation.class) + "-NDISJSHAPE");
// featuresC.add(cInfo.get(loc - i).get(ShapeAnnotation.class) + "-PDISJSHAPE");
featuresC.add(cShape + '-' + cInfo.get(loc + i).get(ShapeAnnotation.class) + "-CNDISJSHAPE");
// featuresC.add(c.get(ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(ShapeAnnotation.class) + "-CPDISJSHAPE");
}
}
if (flags.useExtraTaggySequences) {
if (flags.useTags) {
featuresC.add(p2.getString(PartOfSpeechAnnotation.class) + '-' + p.getString(PartOfSpeechAnnotation.class) + '-' + c.getString(PartOfSpeechAnnotation.class) + "-TTS");
featuresC.add(p3.getString(PartOfSpeechAnnotation.class) + '-' + p2.getString(PartOfSpeechAnnotation.class) + '-' + p.getString(PartOfSpeechAnnotation.class) + '-' + c.getString(PartOfSpeechAnnotation.class) + "-TTTS");
}
if (flags.useDistSim) {
featuresC.add(p2.get(DistSimAnnotation.class) + '-' + p.get(DistSimAnnotation.class) + '-' + c.get(DistSimAnnotation.class) + "-DISTSIM_TTS1");
featuresC.add(p3.get(DistSimAnnotation.class) + '-' + p2.get(DistSimAnnotation.class) + '-' + p.get(DistSimAnnotation.class) + '-' + c.get(DistSimAnnotation.class) + "-DISTSIM_TTTS1");
}
}
if (flags.useMUCFeatures) {
featuresC.add(c.get(SectionAnnotation.class)+"-SECTION");
featuresC.add(c.get(WordPositionAnnotation.class)+"-WORD_POSITION");
featuresC.add(c.get(CoreAnnotations.SentencePositionAnnotation.class)+"-SENT_POSITION");
featuresC.add(c.get(ParaPositionAnnotation.class)+"-PARA_POSITION");
featuresC.add(c.get(WordPositionAnnotation.class)+ '-' +c.get(ShapeAnnotation.class)+"-WORD_POSITION_SHAPE");
}
// Branch 2 of 3: word-internal features only (word identity, n-grams, shape, occurrence patterns).
} else if (flags.useInternal) {
if (flags.useWord) {
featuresC.add(cWord + "-WORD");
}
// Same n-gram construction as in the first branch.
if (flags.useNGrams) {
Collection<String> subs = wordToSubstrings.get(cWord);
if (subs == null) {
subs = new ArrayList<String>();
String word = '<' + cWord + '>';
if (flags.lowercaseNGrams) {
word = word.toLowerCase();
}
if (flags.dehyphenateNGrams) {
word = dehyphenate(word);
}
if (flags.greekifyNGrams) {
word = greekify(word);
}
for (int i = 0; i < word.length(); i++) {
for (int j = i + 2; j <= word.length(); j++) {
if (flags.noMidNGrams && i != 0 && j != word.length()) {
continue;
}
if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
continue;
}
//subs.add(intern("#" + word.substring(i, j) + "#"));
subs.add(intern('#' + word.substring(i, j) + '#'));
}
}
if (flags.cacheNGrams) {
wordToSubstrings.put(cWord, subs);
}
}
featuresC.addAll(subs);
if (flags.conjoinShapeNGrams) {
String shape = c.get(ShapeAnnotation.class);
for (String str : subs) {
String feat = str + '-' + shape + "-CNGram-CS";
featuresC.add(feat);
}
}
}
if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
featuresC.add(cShape + "-TYPE");
}
if (flags.useOccurrencePatterns) {
featuresC.addAll(occurrencePatterns(cInfo, loc));
}
// Branch 3 of 3: context features only (neighbouring words, shape sequences, disjunctive windows).
} else if (flags.useExternal) {
if (flags.usePrev) {
featuresC.add(pWord + "-PW");
}
if (flags.useNext) {
featuresC.add(nWord + "-NW");
}
if (flags.useWordPairs) {
featuresC.add(cWord + '-' + pWord + "-W-PW");
featuresC.add(cWord + '-' + nWord + "-W-NW");
}
if (flags.useSymWordPairs) {
featuresC.add(pWord + '-' + nWord + "-SWORDS");
}
if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
if (flags.useTypeSeqs) {
String pShape = p.get(ShapeAnnotation.class);
String nShape = n.get(ShapeAnnotation.class);
featuresC.add(pShape + "-PTYPE");
featuresC.add(nShape + "-NTYPE");
featuresC.add(pWord + "..." + cShape + "-PW_CTYPE");
featuresC.add(cShape + "..." + nWord + "-NW_CTYPE");
if (flags.maxLeft > 0) featuresC.add(pShape + "..." + cShape + "-PCTYPE"); // this one just isn't useful, at least given c,pc,s,ps. Might be useful 0th-order
featuresC.add(cShape + "..." + nShape + "-CNTYPE");
featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE");
}
}
if (flags.useLastRealWord) {
if (pWord.length() <= 3) {
featuresC.add(p2.getString(TextAnnotation.class) + "..." + cShape + "-PPW_CTYPE");
}
}
if (flags.useNextRealWord) {
if (nWord.length() <= 3) {
featuresC.add(n2.getString(TextAnnotation.class) + "..." + cShape + "-NNW_CTYPE");
}
}
if (flags.useDisjunctive) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
CoreLabel dn = cInfo.get(loc + i);
CoreLabel dp = cInfo.get(loc - i);
featuresC.add(dn.getString(TextAnnotation.class) + "-DISJN");
if (flags.useDisjunctiveShapeInteraction) {
featuresC.add(dn.getString(TextAnnotation.class) + '-' + c.get(ShapeAnnotation.class) + "-DISJN-CS");
}
featuresC.add(dp.getString(TextAnnotation.class) + "-DISJP");
if (flags.useDisjunctiveShapeInteraction) {
featuresC.add(dp.getString(TextAnnotation.class) + '-' + c.get(ShapeAnnotation.class) + "-DISJP-CS");
}
}
}
if (flags.useWideDisjunctive) {
for (int i = 1; i <= flags.wideDisjunctionWidth; i++) {
featuresC.add(cInfo.get(loc + i).getString(TextAnnotation.class) + "-DISJWN");
featuresC.add(cInfo.get(loc - i).getString(TextAnnotation.class) + "-DISJWP");
}
}
if (flags.useDisjShape) {
for (int i = 1; i <= flags.disjunctionWidth; i++) {
featuresC.add(cInfo.get(loc + i).get(ShapeAnnotation.class) + "-NDISJSHAPE");
// featuresC.add(cInfo.get(loc - i).get(ShapeAnnotation.class) + "-PDISJSHAPE");
featuresC.add(c.get(ShapeAnnotation.class) + '-' + cInfo.get(loc + i).get(ShapeAnnotation.class) + "-CNDISJSHAPE");
// featuresC.add(c.get(ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(ShapeAnnotation.class) + "-CPDISJSHAPE");
}
}
}
// Everything below runs regardless of which of the three branches above was taken.
// Stuff to add binary features from the additional columns
if (flags.twoStage) {
featuresC.add(c.get(Bin1Annotation.class) + "-BIN1");
featuresC.add(c.get(Bin2Annotation.class) + "-BIN2");
featuresC.add(c.get(Bin3Annotation.class) + "-BIN3");
featuresC.add(c.get(Bin4Annotation.class) + "-BIN4");
featuresC.add(c.get(Bin5Annotation.class) + "-BIN5");
featuresC.add(c.get(Bin6Annotation.class) + "-BIN6");
}
// Sign of the token when it parses as an int; zero and non-integers add nothing.
if(flags.useIfInteger){
try {
int val = Integer.parseInt(cWord);
if(val > 0) featuresC.add("POSITIVE_INTEGER");
else if(val < 0) featuresC.add("NEGATIVE_INTEGER");
// System.err.println("FOUND INTEGER");
} catch(NumberFormatException e){
// not an integer value, nothing to do
}
}
//Stuff to add arbitrary features
if (flags.useGenericFeatures) {
//see if we need to cache the keys
// The cache is built from the keys of whichever token is seen first and is not rebuilt for later tokens.
if (genericAnnotationKeys == null) {
makeGenericKeyCache(c);
}
//now look through the cached keys
for (Class<? extends GenericAnnotation> key : genericAnnotationKeys) {
// NOTE(review): this writes one line to System.err per key per token; it looks like leftover debug output - confirm before removing.
System.err.println("Adding feature: " + CoreLabel.genericValues.get(key) + " with value " + c.get(key));
featuresC.add(c.get(key) + "-" + CoreLabel.genericValues.get(key));
}
}
if(flags.useTopics){
//featuresC.add(p.get(TopicAnnotation.class) + '-' + cWord + "--CWORD");
featuresC.add(c.get(TopicAnnotation.class)+ "-TopicID");
featuresC.add(p.get(TopicAnnotation.class) + "-PTopicID");
featuresC.add(n.get(TopicAnnotation.class) + "-NTopicID");
//featuresC.add(p.get(TopicAnnotation.class) + '-' + c.get(TopicAnnotation.class) + '-' + n.get(TopicAnnotation.class) + "-PCNTopicID");
//featuresC.add(c.get(TopicAnnotation.class) + '-' + n.get(TopicAnnotation.class) + "-CNTopicID");
//featuresC.add(p.get(TopicAnnotation.class) + '-' + c.get(TopicAnnotation.class) + "-PCTopicID");
//featuresC.add(c.get(TopicAnnotation.class) + cShape + "-TopicID-SH");
}
// NER tag annotations from a previous NER system
// These features are triggered by the presence of the annotation on the current token, not by a flag.
if (c.get(StackedNamedEntityTagAnnotation.class) != null) {
featuresC.add(c.get(StackedNamedEntityTagAnnotation.class)+ "-CStackedNERTag");
featuresC.add(cWord + "-" + c.get(StackedNamedEntityTagAnnotation.class)+ "-WCStackedNERTag");
if (flags.useNext) {
featuresC.add(c.get(StackedNamedEntityTagAnnotation.class) + '-' + n.get(StackedNamedEntityTagAnnotation.class) + "-CNStackedNERTag");
featuresC.add(cWord + "-" + c.get(StackedNamedEntityTagAnnotation.class) + '-' + n.get(StackedNamedEntityTagAnnotation.class) + "-WCNStackedNERTag");
if (flags.usePrev) {
featuresC.add(p.get(StackedNamedEntityTagAnnotation.class) + '-' + c.get(StackedNamedEntityTagAnnotation.class) + '-' + n.get(StackedNamedEntityTagAnnotation.class) + "-PCNStackedNERTag");
// NOTE(review): the separator " -" below has a leading space, unlike the sibling features; possibly unintentional, but changing it would change the feature string.
featuresC.add(p.get(StackedNamedEntityTagAnnotation.class) + '-' + cWord + " -" + c.get(StackedNamedEntityTagAnnotation.class)
+ '-' + n.get(StackedNamedEntityTagAnnotation.class) + "-PWCNStackedNERTag");
}
}
if (flags.usePrev) {
featuresC.add(p.get(StackedNamedEntityTagAnnotation.class) + '-' + c.get(StackedNamedEntityTagAnnotation.class) + "-PCStackedNERTag");
}
}
if(flags.useWordnetFeatures)
featuresC.add(c.get(WordnetSynAnnotation.class)+"-WordnetSyn");
if(flags.useProtoFeatures)
featuresC.add(c.get(ProtoAnnotation.class)+"-Proto");
return featuresC;
}
/**
 * Binary feature annotations.
 * Six String-valued annotation keys, Bin1 through Bin6. featuresC reads them
 * from the current token and turns them into the -BIN1 .. -BIN6 features
 * when flags.twoStage is set.
 */
private static class Bin1Annotation implements CoreAnnotation<String> {
  public Class<String> getType() {
    return String.class;
  }
}

private static class Bin2Annotation implements CoreAnnotation<String> {
  public Class<String> getType() {
    return String.class;
  }
}

private static class Bin3Annotation implements CoreAnnotation<String> {
  public Class<String> getType() {
    return String.class;
  }
}

private static class Bin4Annotation implements CoreAnnotation<String> {
  public Class<String> getType() {
    return String.class;
  }
}

private static class Bin5Annotation implements CoreAnnotation<String> {
  public Class<String> getType() {
    return String.class;
  }
}

private static class Bin6Annotation implements CoreAnnotation<String> {
  public Class<String> getType() {
    return String.class;
  }
}
/**
 * Collects the feature strings built from the token at {@code loc} together
 * with its left neighbour ({@code loc - 1}) and, for a few features, its right
 * neighbour ({@code loc + 1}) or the token at {@code loc - 2}.
 * <p>
 * The inventory depends on {@code flags.useInternal} and
 * {@code flags.useExternal}: with both set the full set of flag-controlled
 * features is produced, with exactly one set a reduced set is produced, and
 * with neither set the result is empty.
 *
 * @param cInfo padded token list the features are read from
 * @param loc index of the current token in {@code cInfo}
 * @return a new list of feature strings in generation order (duplicates are
 *         possible, e.g. {@code -PABBRANS} can be added twice)
 */
protected Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) {
  final CoreLabel cur = cInfo.get(loc);
  final CoreLabel next = cInfo.get(loc + 1);
  final CoreLabel prev = cInfo.get(loc - 1);

  final String curWord = cur.getString(TextAnnotation.class);
  final String prevWord = prev.getString(TextAnnotation.class);
  final String curDistSim = cur.getString(DistSimAnnotation.class);
  final String prevDistSim = prev.getString(DistSimAnnotation.class);
  final String curShape = cur.getString(ShapeAnnotation.class);
  final String prevShape = prev.getString(ShapeAnnotation.class);

  final Collection<String> feats = new ArrayList<String>();

  // Shape-based features need some form of word shape to be switched on.
  final boolean shapesOn =
      (flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings;
  // Shape-sequence features additionally need useTypeSeqs plus one of its variants.
  final boolean shapeSeqsOn =
      shapesOn && flags.useTypeSeqs && (flags.useTypeSeqs2 || flags.useTypeSeqs3);

  if (flags.useInternal && flags.useExternal) {
    if (flags.useOrdinal) {
      if (isOrdinal(cInfo, loc)) {
        feats.add("C_ORDINAL");
        if (isOrdinal(cInfo, loc - 1)) {
          feats.add("PC_ORDINAL");
        }
      }
      if (isOrdinal(cInfo, loc - 1)) {
        feats.add("P_ORDINAL");
      }
    }

    if (flags.useAbbr || flags.useMinimalAbbr) {
      feats.add(prev.get(AbbrAnnotation.class) + '-' + cur.get(AbbrAnnotation.class) + "-PABBRANS");
    }
    // The "1" variants skip tokens whose abbreviation annotation is "XX".
    if ((flags.useAbbr1 || flags.useMinimalAbbr1) && !cur.get(AbbrAnnotation.class).equals("XX")) {
      feats.add(prev.get(AbbrAnnotation.class) + '-' + cur.get(AbbrAnnotation.class) + "-PABBRANS");
    }

    if (flags.useChunkySequences) {
      feats.add(prev.get(ChunkAnnotation.class) + '-' + cur.get(ChunkAnnotation.class) + '-' + next.get(ChunkAnnotation.class) + "-PCNCHUNK");
    }

    if (flags.usePrev && flags.useSequences && flags.usePrevSequences) {
      feats.add("PSEQ");
      feats.add(curWord + "-PSEQW");
      feats.add(prevWord + '-' + curWord + "-PSEQW2");
      feats.add(prevWord + "-PSEQpW");
      feats.add(prevDistSim + "-PSEQpDS");
      feats.add(curDistSim + "-PSEQcDS");
      feats.add(prevDistSim + '-' + curDistSim + "-PSEQpcDS");
      if (shapesOn) {
        feats.add(prevShape + "-PSEQpS");
        feats.add(curShape + "-PSEQcS");
        feats.add(prevShape + '-' + curShape + "-PSEQpcS");
      }
    }

    if (shapeSeqsOn) {
      if (flags.useTypeSeqs3) {
        feats.add(prevShape + '-' + curShape + '-' + next.get(ShapeAnnotation.class) + "-PCNSHAPES");
      }
      if (flags.useTypeSeqs2) {
        feats.add(prevShape + '-' + curShape + "-TYPES");
      }
      if (flags.useYetMoreCpCShapes) {
        final String prev2Shape = cInfo.get(loc - 2).getString(ShapeAnnotation.class);
        feats.add(prev2Shape + '-' + prevShape + '-' + curShape + "-YMS");
        feats.add(prevShape + '-' + curShape + "-" + next.getString(ShapeAnnotation.class) + "-YMSPCN");
      }
    }

    if (flags.useTypeySequences) {
      feats.add(cur.get(ShapeAnnotation.class) + "-TPS2");
      feats.add(next.get(ShapeAnnotation.class) + "-TNS1");
      // A previous/current shape pair ("-TPS") is deliberately not emitted in
      // this configuration: it would duplicate "-TYPES". Reproducing results
      // obtained with the duplicate may need a slightly larger sigma.
    }

    if (flags.useTaggySequences) {
      if (flags.useTags) {
        feats.add(prev.getString(PartOfSpeechAnnotation.class) + '-' + cur.getString(PartOfSpeechAnnotation.class) + "-TS");
      }
      if (flags.useDistSim) {
        feats.add(prev.get(DistSimAnnotation.class) + '-' + cur.get(DistSimAnnotation.class) + "-DISTSIM_TS1");
      }
    }

    if (flags.useParenMatching) {
      // With useReverse the roles of the bracket characters are swapped: the
      // current token must then be an opening bracket and the previous token
      // a closing one.
      final boolean reversed = flags.useReverse;
      final String endRound = reversed ? "(" : ")";
      final String endSquare = reversed ? "[" : "]";
      final String endPtb = reversed ? "-LRB-" : "-RRB-";
      final String startRound = reversed ? ")" : "(";
      final String startSquare = reversed ? "]" : "[";
      final String startPtb = reversed ? "-RRB-" : "-LRB-";

      final boolean curEndsSpan =
          curWord.equals(endRound) || curWord.equals(endSquare) || curWord.equals(endPtb);
      if (curEndsSpan
          && (prevWord.equals(startRound) || prevWord.equals(startSquare) || prevWord.equals(startPtb))) {
        feats.add("PAREN-MATCH");
      }
    }

    if (flags.useEntityTypeSequences) {
      feats.add(prev.get(EntityTypeAnnotation.class) + '-' + cur.get(EntityTypeAnnotation.class) + "-ETSEQ");
    }
    if (flags.useURLSequences) {
      feats.add(prev.get(IsURLAnnotation.class) + '-' + cur.get(IsURLAnnotation.class) + "-URLSEQ");
    }
  } else if (flags.useInternal) {
    if (flags.useSequences && flags.usePrevSequences) {
      feats.add("PSEQ");
      feats.add(curWord + "-PSEQW");
    }
    if (flags.useTypeySequences) {
      feats.add(cur.get(ShapeAnnotation.class) + "-TPS2");
    }
  } else if (flags.useExternal) {
    if (shapeSeqsOn) {
      if (flags.useTypeSeqs3) {
        feats.add(prevShape + '-' + curShape + '-' + next.get(ShapeAnnotation.class) + "-PCNSHAPES");
      }
      if (flags.useTypeSeqs2) {
        feats.add(prevShape + '-' + curShape + "-TYPES");
      }
    }
    if (flags.useTypeySequences) {
      feats.add(next.get(ShapeAnnotation.class) + "-TNS1");
      feats.add(prev.get(ShapeAnnotation.class) + '-' + cur.get(ShapeAnnotation.class) + "-TPS");
    }
  }

  return feats;
}
/**
 * Collects feature strings built from the token at {@code loc} and the tokens
 * one and two positions to its left: abbreviation pairs of the token at
 * {@code loc - 2} and the current token, and a {@code PAREN-MATCH} marker when
 * the current token and the token at {@code loc - 2} form a bracket pair with
 * no bracket of the same direction at {@code loc - 1}.
 *
 * @param cInfo padded token list the features are read from
 * @param loc index of the current token in {@code cInfo}
 * @return a new list of feature strings in generation order
 */
protected Collection<String> featuresCp2C(PaddedList<IN> cInfo, int loc) {
  final CoreLabel cur = cInfo.get(loc);
  final CoreLabel prev = cInfo.get(loc - 1);
  final CoreLabel prev2 = cInfo.get(loc - 2);
  final String curWord = cur.getString(TextAnnotation.class);
  final Collection<String> feats = new ArrayList<String>();

  if (flags.useMoreAbbr) {
    feats.add(prev2.get(AbbrAnnotation.class) + '-' + cur.get(AbbrAnnotation.class) + "-P2ABBRANS");
  }
  if (flags.useMinimalAbbr) {
    feats.add(prev2.get(AbbrAnnotation.class) + '-' + cur.get(AbbrAnnotation.class) + "-P2AP2CABB");
  }
  // The "1" variant skips tokens whose abbreviation annotation is "XX".
  if (flags.useMinimalAbbr1 && !cur.get(AbbrAnnotation.class).equals("XX")) {
    feats.add(prev2.get(AbbrAnnotation.class) + '-' + cur.get(AbbrAnnotation.class) + "-P2AP2CABB");
  }

  if (!flags.useParenMatching) {
    return feats;
  }

  // With useReverse the bracket roles swap: the current token must be an
  // opening bracket and the tokens to its left are tested for closing ones.
  final boolean reversed = flags.useReverse;
  final String endRound = reversed ? "(" : ")";
  final String endSquare = reversed ? "[" : "]";
  final String endPtb = reversed ? "-LRB-" : "-RRB-";
  final String startRound = reversed ? ")" : "(";
  final String startSquare = reversed ? "]" : "[";
  final String startPtb = reversed ? "-RRB-" : "-LRB-";

  if (curWord.equals(endRound) || curWord.equals(endSquare) || curWord.equals(endPtb)) {
    final String prev2Word = prev2.getString(TextAnnotation.class);
    final String prevWord = prev.getString(TextAnnotation.class);
    final boolean startsTwoBack =
        prev2Word.equals(startRound) || prev2Word.equals(startSquare) || prev2Word.equals(startPtb);
    if (startsTwoBack
        && !(prevWord.equals(startRound) || prevWord.equals(startSquare) || prevWord.equals(startPtb))) {
      feats.add("PAREN-MATCH");
    }
  }
  return feats;
}
/**
 * Emits a {@code PAREN-MATCH} feature when the current token and the token
 * three positions to its left form a round or square bracket pair and neither
 * of the two tokens in between is a bracket of the same direction as the one
 * at {@code loc - 3}. Nothing is emitted unless {@code flags.useParenMatching}
 * is set and {@code flags.maxLeft >= 3}.
 * <p>
 * NOTE(review): unlike {@code featuresCp2C}, the {@code -LRB-}/{@code -RRB-}
 * spellings are not tested here; confirm whether that is intentional.
 *
 * @param cInfo padded token list the features are read from
 * @param loc index of the current token in {@code cInfo}
 * @return a new list that is empty or holds the single {@code PAREN-MATCH} feature
 */
protected Collection<String> featuresCp3C(PaddedList<IN> cInfo, int loc) {
  final CoreLabel cur = cInfo.get(loc);
  final CoreLabel prev = cInfo.get(loc - 1);
  final CoreLabel prev2 = cInfo.get(loc - 2);
  final CoreLabel prev3 = cInfo.get(loc - 3);
  final String curWord = cur.getString(TextAnnotation.class);
  final Collection<String> feats = new ArrayList<String>();

  if (!flags.useParenMatching) {
    return feats;
  }

  // With useReverse the bracket roles swap (current token opens, left context closes).
  final boolean reversed = flags.useReverse;
  final String endRound = reversed ? "(" : ")";
  final String endSquare = reversed ? "[" : "]";
  final String startRound = reversed ? ")" : "(";
  final String startSquare = reversed ? "]" : "[";

  if (!(curWord.equals(endRound) || curWord.equals(endSquare))) {
    return feats;
  }
  if (flags.maxLeft < 3) {
    return feats;
  }

  final String w3 = prev3.getString(TextAnnotation.class);
  final String w2 = prev2.getString(TextAnnotation.class);
  final String w1 = prev.getString(TextAnnotation.class);
  if ((w3.equals(startRound) || w3.equals(startSquare))
      && !(w2.equals(startRound) || w2.equals(startSquare)
           || w1.equals(startRound) || w1.equals(startSquare))) {
    feats.add("PAREN-MATCH");
  }
  return feats;
}
/**
 * Emits a {@code PAREN-MATCH} feature when the current token and the token
 * four positions to its left form a round or square bracket pair and none of
 * the three tokens in between is a bracket of the same direction as the one
 * at {@code loc - 4}. Nothing is emitted unless {@code flags.useParenMatching}
 * is set and {@code flags.maxLeft >= 4}.
 * <p>
 * NOTE(review): unlike {@code featuresCp2C}, the {@code -LRB-}/{@code -RRB-}
 * spellings are not tested here; confirm whether that is intentional.
 *
 * @param cInfo padded token list the features are read from
 * @param loc index of the current token in {@code cInfo}
 * @return a new list that is empty or holds the single {@code PAREN-MATCH} feature
 */
protected Collection<String> featuresCp4C(PaddedList<IN> cInfo, int loc) {
  final CoreLabel cur = cInfo.get(loc);
  final CoreLabel prev = cInfo.get(loc - 1);
  final CoreLabel prev2 = cInfo.get(loc - 2);
  final CoreLabel prev3 = cInfo.get(loc - 3);
  final CoreLabel prev4 = cInfo.get(loc - 4);
  final String curWord = cur.getString(TextAnnotation.class);
  final Collection<String> feats = new ArrayList<String>();

  if (!flags.useParenMatching) {
    return feats;
  }

  // With useReverse the bracket roles swap (current token opens, left context closes).
  final boolean reversed = flags.useReverse;
  final String endRound = reversed ? "(" : ")";
  final String endSquare = reversed ? "[" : "]";
  final String startRound = reversed ? ")" : "(";
  final String startSquare = reversed ? "]" : "[";

  if (!(curWord.equals(endRound) || curWord.equals(endSquare))) {
    return feats;
  }
  if (flags.maxLeft < 4) {
    return feats;
  }

  final String farWord = prev4.getString(TextAnnotation.class);
  if (!(farWord.equals(startRound) || farWord.equals(startSquare))) {
    return feats;
  }

  // Any same-direction bracket between the pair cancels the match.
  final CoreLabel[] between = { prev3, prev2, prev };
  for (CoreLabel token : between) {
    final String w = token.getString(TextAnnotation.class);
    if (w.equals(startRound) || w.equals(startSquare)) {
      return feats;
    }
  }
  feats.add("PAREN-MATCH");
  return feats;
}
/**
 * Emits a {@code PAREN-MATCH} feature when the current token and the token
 * five positions to its left form a round or square bracket pair and none of
 * the four tokens in between is a bracket of the same direction as the one
 * at {@code loc - 5}. Nothing is emitted unless {@code flags.useParenMatching}
 * is set and {@code flags.maxLeft >= 5}.
 * <p>
 * NOTE(review): unlike {@code featuresCp2C}, the {@code -LRB-}/{@code -RRB-}
 * spellings are not tested here; confirm whether that is intentional.
 *
 * @param cInfo padded token list the features are read from
 * @param loc index of the current token in {@code cInfo}
 * @return a new list that is empty or holds the single {@code PAREN-MATCH} feature
 */
protected Collection<String> featuresCp5C(PaddedList<IN> cInfo, int loc) {
  final CoreLabel cur = cInfo.get(loc);
  final CoreLabel prev = cInfo.get(loc - 1);
  final CoreLabel prev2 = cInfo.get(loc - 2);
  final CoreLabel prev3 = cInfo.get(loc - 3);
  final CoreLabel prev4 = cInfo.get(loc - 4);
  final CoreLabel prev5 = cInfo.get(loc - 5);
  final String curWord = cur.getString(TextAnnotation.class);
  final Collection<String> feats = new ArrayList<String>();

  if (!flags.useParenMatching) {
    return feats;
  }

  // With useReverse the bracket roles swap (current token opens, left context closes).
  final boolean reversed = flags.useReverse;
  final String endRound = reversed ? "(" : ")";
  final String endSquare = reversed ? "[" : "]";
  final String startRound = reversed ? ")" : "(";
  final String startSquare = reversed ? "]" : "[";

  if (!(curWord.equals(endRound) || curWord.equals(endSquare))) {
    return feats;
  }
  if (flags.maxLeft < 5) {
    return feats;
  }

  final String farWord = prev5.getString(TextAnnotation.class);
  if (!(farWord.equals(startRound) || farWord.equals(startSquare))) {
    return feats;
  }

  // Any same-direction bracket between the pair cancels the match.
  final CoreLabel[] between = { prev4, prev3, prev2, prev };
  for (CoreLabel token : between) {
    final String w = token.getString(TextAnnotation.class);
    if (w.equals(startRound) || w.equals(startSquare)) {
      return feats;
    }
  }
  feats.add("PAREN-MATCH");
  return feats;
}
/**
 * Collects feature strings built from the token at {@code loc} and the two
 * tokens to its left (abbreviation, chunk, part-of-speech, distributional
 * similarity and shape triples, plus the constant markers {@code PPSEQ} and
 * {@code BNDRY-SPAN-PPSEQ}).
 * <p>
 * With both {@code flags.useInternal} and {@code flags.useExternal} set, every
 * flag-controlled feature below can be produced. With only
 * {@code useInternal}, only {@code PPSEQ} can be produced; with only
 * {@code useExternal}, only {@code PPSEQ} and the {@code -TYPETYPES} shape
 * triple; with neither, the result is empty.
 *
 * @param cInfo padded token list the features are read from
 * @param loc index of the current token in {@code cInfo}
 * @return a new list of feature strings in generation order
 */
protected Collection<String> featuresCpCp2C(PaddedList<IN> cInfo, int loc) {
  final CoreLabel cur = cInfo.get(loc);
  final CoreLabel prev = cInfo.get(loc - 1);
  final CoreLabel prev2 = cInfo.get(loc - 2);
  final Collection<String> feats = new ArrayList<String>();

  if (!flags.useInternal && !flags.useExternal) {
    return feats;
  }
  final boolean fullInventory = flags.useInternal && flags.useExternal;

  // A "-TTPS" shape triple used to be generated at this point; it is left out
  // on purpose because it duplicates the "-TYPETYPES" feature added at the
  // end of this method, and the duplication hurt.

  if (fullInventory) {
    if (flags.useAbbr) {
      feats.add(prev2.get(AbbrAnnotation.class) + '-' + prev.get(AbbrAnnotation.class) + '-' + cur.get(AbbrAnnotation.class) + "-2PABBRANS");
    }
    if (flags.useChunks) {
      feats.add(prev2.get(ChunkAnnotation.class) + '-' + prev.get(ChunkAnnotation.class) + '-' + cur.get(ChunkAnnotation.class) + "-2PCHUNKS");
    }
  }

  // Produced in all three configurations that reach this point.
  if (flags.useLongSequences) {
    feats.add("PPSEQ");
  }

  if (fullInventory) {
    if (flags.useBoundarySequences
        && prev.getString(TextAnnotation.class).equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) {
      feats.add("BNDRY-SPAN-PPSEQ");
    }
    // A more elaborate consistency check that also treated "and", "or" and
    // "," like the boundary token was tried here and did not help.

    if (flags.useTaggySequences) {
      if (flags.useTags) {
        final String posTriple = prev2.getString(PartOfSpeechAnnotation.class) + '-' + prev.getString(PartOfSpeechAnnotation.class) + '-' + cur.getString(PartOfSpeechAnnotation.class);
        feats.add(posTriple + "-TTS");
        if (flags.useTaggySequencesShapeInteraction) {
          feats.add(posTriple + '-' + cur.get(ShapeAnnotation.class) + "-TTS-CS");
        }
      }
      if (flags.useDistSim) {
        final String distSimTriple = prev2.get(DistSimAnnotation.class) + '-' + prev.get(DistSimAnnotation.class) + '-' + cur.get(DistSimAnnotation.class);
        feats.add(distSimTriple + "-DISTSIM_TTS1");
        if (flags.useTaggySequencesShapeInteraction) {
          feats.add(distSimTriple + '-' + cur.get(ShapeAnnotation.class) + "-DISTSIM_TTS1-CS");
        }
      }
    }
  }

  // Shape triple: produced whenever useExternal is set (alone or together
  // with useInternal), never for useInternal alone.
  if (flags.useExternal) {
    final boolean shapesOn =
        (flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings;
    if (shapesOn && flags.useTypeSeqs && flags.useTypeSeqs2 && flags.maxLeft >= 2) {
      final String curShape = cur.get(ShapeAnnotation.class);
      final String prevShape = prev.get(ShapeAnnotation.class);
      final String prev2Shape = prev2.get(ShapeAnnotation.class);
      feats.add(prev2Shape + '-' + prevShape + '-' + curShape + "-TYPETYPES");
    }
  }

  return feats;
}
/**
 * Collects feature strings built from the token at {@code loc} and the three
 * tokens to its left: part-of-speech and distributional-similarity 4-grams
 * (optionally combined with the current token's shape) and the constant
 * markers {@code PPPSEQ} and {@code BNDRY-SPAN-PPPSEQ}. All of them require
 * {@code flags.maxLeft >= 3}; the 4-grams additionally require
 * {@code flags.useTaggySequences} and that {@code flags.dontExtendTaggy} is
 * not set.
 *
 * @param cInfo padded token list the features are read from
 * @param loc index of the current token in {@code cInfo}
 * @return a new list of feature strings in generation order
 */
protected Collection<String> featuresCpCp2Cp3C(PaddedList<IN> cInfo, int loc) {
  final CoreLabel cur = cInfo.get(loc);
  final CoreLabel prev = cInfo.get(loc - 1);
  final CoreLabel prev2 = cInfo.get(loc - 2);
  final CoreLabel prev3 = cInfo.get(loc - 3);
  final Collection<String> feats = new ArrayList<String>();

  final boolean windowOfThree = flags.maxLeft >= 3;

  if (flags.useTaggySequences && windowOfThree && !flags.dontExtendTaggy) {
    if (flags.useTags) {
      final String posSeq = prev3.getString(PartOfSpeechAnnotation.class) + '-' + prev2.getString(PartOfSpeechAnnotation.class) + '-' + prev.getString(PartOfSpeechAnnotation.class) + '-' + cur.getString(PartOfSpeechAnnotation.class);
      feats.add(posSeq + "-TTTS");
      if (flags.useTaggySequencesShapeInteraction) {
        feats.add(posSeq + '-' + cur.get(ShapeAnnotation.class) + "-TTTS-CS");
      }
    }
    if (flags.useDistSim) {
      final String distSimSeq = prev3.get(DistSimAnnotation.class) + '-' + prev2.get(DistSimAnnotation.class) + '-' + prev.get(DistSimAnnotation.class) + '-' + cur.get(DistSimAnnotation.class);
      feats.add(distSimSeq + "-DISTSIM_TTTS1");
      if (flags.useTaggySequencesShapeInteraction) {
        feats.add(distSimSeq + '-' + cur.get(ShapeAnnotation.class) + "-DISTSIM_TTTS1-CS");
      }
    }
  }

  if (!windowOfThree) {
    return feats;
  }
  if (flags.useLongSequences) {
    feats.add("PPPSEQ");
  }
  if (flags.useBoundarySequences
      && prev.getString(TextAnnotation.class).equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) {
    feats.add("BNDRY-SPAN-PPPSEQ");
  }
  return feats;
}
/**
 * Produces the constant markers {@code PPPPSEQ} (when
 * {@code flags.useLongSequences} is set) and {@code BNDRY-SPAN-PPPPSEQ} (when
 * {@code flags.useBoundarySequences} is set and the text of the token at
 * {@code loc - 1} equals {@code CoNLLDocumentReaderAndWriter.BOUNDARY}).
 * Nothing is produced unless {@code flags.maxLeft >= 4}.
 *
 * @param cInfo padded token list the features are read from
 * @param loc index of the current token in {@code cInfo}
 * @return a new list of feature strings in generation order
 */
protected Collection<String> featuresCpCp2Cp3Cp4C(PaddedList<IN> cInfo, int loc) {
  final Collection<String> feats = new ArrayList<String>();
  final CoreLabel prev = cInfo.get(loc - 1);

  if (flags.maxLeft < 4) {
    return feats;
  }
  if (flags.useLongSequences) {
    feats.add("PPPPSEQ");
  }
  if (flags.useBoundarySequences) {
    final String prevWord = prev.getString(TextAnnotation.class);
    if (prevWord.equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) {
      feats.add("BNDRY-SPAN-PPPPSEQ");
    }
  }
  return feats;
}
/**
 * Produces {@code NSEQ} and {@code <current word>-NSEQW} when
 * {@code flags.useNext}, {@code flags.useSequences} and
 * {@code flags.useNextSequences} are all set; otherwise produces nothing.
 *
 * @param cInfo padded token list the features are read from
 * @param loc index of the current token in {@code cInfo}
 * @return a new list holding zero or two feature strings
 */
protected Collection<String> featuresCnC(PaddedList<IN> cInfo, int loc) {
  final CoreLabel cur = cInfo.get(loc);
  final Collection<String> feats = new ArrayList<String>();

  final boolean nextSequencesOn =
      flags.useNext && flags.useSequences && flags.useNextSequences;
  if (!nextSequencesOn) {
    return feats;
  }
  feats.add("NSEQ");
  feats.add(cur.getString(TextAnnotation.class) + "-NSEQW");
  return feats;
}
/**
 * Produces {@code PNSEQ} and {@code <current word>-PNSEQW} when
 * {@code flags.useNext}, {@code flags.usePrev}, {@code flags.useSequences},
 * {@code flags.usePrevSequences} and {@code flags.useNextSequences} are all
 * set; otherwise produces nothing.
 *
 * @param cInfo padded token list the features are read from
 * @param loc index of the current token in {@code cInfo}
 * @return a new list holding zero or two feature strings
 */
protected Collection<String> featuresCpCnC(PaddedList<IN> cInfo, int loc) {
  final CoreLabel cur = cInfo.get(loc);
  final Collection<String> feats = new ArrayList<String>();

  final boolean bothNeighboursOn = flags.useNext && flags.usePrev;
  final boolean bothSequencesOn =
      flags.useSequences && flags.usePrevSequences && flags.useNextSequences;
  if (!(bothNeighboursOn && bothSequencesOn)) {
    return feats;
  }
  feats.add("PNSEQ");
  feats.add(cur.getString(TextAnnotation.class) + "-PNSEQW");
  return feats;
}
/**
 * Flips the sign of an offset when {@code flags.useReverse} is set.
 *
 * @param i the offset
 * @return {@code -i} if {@code flags.useReverse} is set, otherwise {@code i}
 */
int reverse(int i) {
  if (flags.useReverse) {
    return -i;
  }
  return i;
}
/**
 * Looks for other occurrences of the word at {@code loc} within a window of
 * 3 to 149 tokens on either side and reports how they are embedded.
 * <p>
 * The search only runs when the current word passes {@code isNameCase}, the
 * following word passes {@code noUpperCase} and {@code hasLetter}, the
 * preceding word passes {@code hasLetter} and the preceding token is not the
 * list's pad element; otherwise the single feature
 * {@code NO-OCCURRENCE-PATTERN} is returned. "Following" and "preceding" are
 * direction-adjusted through {@link #reverse(int)}, as are all offsets used
 * in the search.
 * <p>
 * If the preceding word is name-cased and tagged {@code NNP}, the
 * {@code XY-...} features record whether another occurrence is again adjacent
 * to that same word; otherwise the {@code X-...} features record whether
 * another occurrence has a name-cased {@code NNP} neighbour.
 * <p>
 * NOTE(review): in the {@code X-PREV} scan the neighbour at {@code jump + 1}
 * is labelled {@code YX} and the one at {@code jump - 1} {@code XY}, the
 * opposite of the {@code X-NEXT} scan. This is kept as-is because changing it
 * would rename features; confirm whether it is intended.
 *
 * @param cInfo padded token list the features are read from
 * @param loc index of the current token in {@code cInfo}
 * @return the set of occurrence features found (possibly empty), or a
 *         singleton list holding {@code NO-OCCURRENCE-PATTERN}
 */
private Collection<String> occurrencePatterns(PaddedList<IN> cInfo, int loc) {
  // features on last Cap
  final String word = cInfo.get(loc).getString(TextAnnotation.class);
  final String nWord = cInfo.get(loc + reverse(1)).getString(TextAnnotation.class);
  final CoreLabel p = cInfo.get(loc - reverse(1));
  final String pWord = p.getString(TextAnnotation.class);
  if (!(isNameCase(word) && noUpperCase(nWord) && hasLetter(nWord) && hasLetter(pWord) && p != cInfo.getPad())) {
    return Collections.singletonList("NO-OCCURRENCE-PATTERN");
  }

  final Set<String> l = new HashSet<String>();
  final String pTag = p.getString(PartOfSpeechAnnotation.class);
  if (pTag != null && isNameCase(pWord) && pTag.equals("NNP")) {
    // The current word is the second half of a two-word name candidate
    // "pWord word": check whether repeats are again preceded by pWord.
    for (int jump = 3; jump < 150; jump++) {
      if (cInfo.get(loc + reverse(jump)).getString(TextAnnotation.class).equals(word)) {
        if (cInfo.get(loc + reverse(jump - 1)).getString(TextAnnotation.class).equals(pWord)) {
          l.add("XY-NEXT-OCCURRENCE-XY");
        } else {
          l.add("XY-NEXT-OCCURRENCE-Y");
        }
      }
    }
    for (int jump = -3; jump > -150; jump--) {
      if (cInfo.get(loc + reverse(jump)).getString(TextAnnotation.class).equals(word)) {
        if (cInfo.get(loc + reverse(jump - 1)).getString(TextAnnotation.class).equals(pWord)) {
          l.add("XY-PREV-OCCURRENCE-XY");
        } else {
          l.add("XY-PREV-OCCURRENCE-Y");
        }
      }
    }
  } else {
    for (int jump = 3; jump < 150; jump++) {
      if (cInfo.get(loc + reverse(jump)).getString(TextAnnotation.class).equals(word)) {
        if (isProperNounAt(cInfo, loc + reverse(jump - 1))) {
          l.add("X-NEXT-OCCURRENCE-YX");
        } else if (isProperNounAt(cInfo, loc + reverse(jump + 1))) {
          l.add("X-NEXT-OCCURRENCE-XY");
        } else {
          l.add("X-NEXT-OCCURRENCE-X");
        }
      }
    }
    for (int jump = -3; jump > -150; jump--) {
      // Fixed: this position used to be computed as loc + jump, ignoring
      // reverse(). With flags.useReverse set, the matched word and the
      // neighbours inspected below were then on opposite sides of loc.
      final String candidate = cInfo.get(loc + reverse(jump)).getString(TextAnnotation.class);
      if (candidate != null && candidate.equals(word)) {
        if (isProperNounAt(cInfo, loc + reverse(jump + 1))) {
          l.add("X-PREV-OCCURRENCE-YX");
        } else if (isProperNounAt(cInfo, loc + reverse(jump - 1))) {
          l.add("X-PREV-OCCURRENCE-XY");
        } else {
          l.add("X-PREV-OCCURRENCE-X");
        }
      }
    }
  }
  return l;
}

/**
 * Tells whether the token at {@code index} has name-cased text (per
 * {@code isNameCase}) and the part-of-speech tag {@code NNP}. The tag
 * comparison is null-safe.
 */
private boolean isProperNounAt(PaddedList<IN> cInfo, int index) {
  final CoreLabel token = cInfo.get(index);
  return isNameCase(token.getString(TextAnnotation.class))
      && "NNP".equals(token.getString(PartOfSpeechAnnotation.class));
}
/**
 * Optionally interns a string.
 *
 * @param s the string to process
 * @return {@code s.intern()} if {@code flags.intern} is set, otherwise
 *         {@code s} itself
 */
String intern(String s) {
  return flags.intern ? s.intern() : s;
}
/**
 * Reads every gazette file named in {@code flags.gazettes} by passing a
 * reader over it to {@code readGazette}. If {@code flags.gazettes} is
 * {@code null} it is first replaced by an empty list, so nothing is read.
 * <p>
 * Each reader is closed even when reading fails part-way through.
 *
 * @throws RuntimeException wrapping any {@link IOException} raised while
 *         opening, reading or closing a gazette file
 */
public void initGazette() {
  try {
    // read in gazettes
    if (flags.gazettes == null) {
      flags.gazettes = new ArrayList<String>();
    }
    List<String> gazettes = flags.gazettes;
    for (String gazetteFile : gazettes) {
      BufferedReader r = new BufferedReader(new FileReader(gazetteFile));
      try {
        readGazette(r);
      } finally {
        // Release the file handle even if readGazette throws.
        r.close();
      }
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
} // end class NERFeatureFactory