package edu.stanford.nlp.ling; import java.util.Arrays; import java.util.Comparator; import java.util.Map; import java.util.TreeMap; import java.util.function.Consumer; import edu.stanford.nlp.util.ArrayCoreMap; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.Generics; /** * A CoreLabel represents a single word with ancillary information * attached using CoreAnnotations. * A CoreLabel also provides convenient methods to access tags, * lemmas, etc. (if the proper annotations are set). * <p> * A CoreLabel is a Map from keys (which are Class objects) to values, * whose type is determined by the key. That is, it is a heterogeneous * typesafe Map (see Josh Bloch, Effective Java, 2nd edition). * <p> * The CoreLabel class in particular bridges the gap between old-style JavaNLP * Labels and the new CoreMap infrastructure. Instances of this class can be * used (almost) anywhere that the now-defunct FeatureLabel family could be * used. This data structure is backed by an {@link ArrayCoreMap}. * * @author dramage * @author rafferty */ public class CoreLabel extends ArrayCoreMap implements AbstractCoreLabel, HasCategory, HasContext { private static final long serialVersionUID = 2L; // /** // * Should warnings be printed when converting from MapLabel family. // */ // private static final boolean VERBOSE = false; /** Default constructor, calls super() */ public CoreLabel() { super(); } /** * Initializes this CoreLabel, pre-allocating arrays to hold * up to capacity key,value pairs. This array will grow if necessary. * * @param capacity Initial capacity of object in key,value pairs */ public CoreLabel(int capacity) { super(capacity); } /** * Returns a new CoreLabel instance based on the contents of the given * CoreLabel. It copies the contents of the other CoreLabel. * <i>Implementation note:</i> this is a the same as the constructor * that takes a CoreMap, but is needed to ensure unique most specific * type inference for selecting a constructor at compile-time. * * @param label The CoreLabel to copy */ public CoreLabel(CoreLabel label) { this((CoreMap) label); } /** * Returns a new CoreLabel instance based on the contents of the given * CoreMap. It copies the contents of the other CoreMap. * * @param label The CoreMap to copy */ @SuppressWarnings({"unchecked"}) public CoreLabel(CoreMap label) { super(label.size()); Consumer<Class<? extends Key<?>>> savedListener = ArrayCoreMap.listener; // don't listen to the clone operation ArrayCoreMap.listener = null; for (Class key : label.keySet()) { set(key, label.get(key)); } ArrayCoreMap.listener = savedListener; } /** * Returns a new CoreLabel instance based on the contents of the given * label. Warning: The behavior of this method is a bit disjunctive! * If label is a CoreMap (including CoreLabel), then its entire * contents is copied into this label. * If label is an IndexedWord, then the backing label is copied over * entirely. * But, otherwise, just the * value() and word iff it implements {@link HasWord} is copied. * * @param label Basis for this label */ @SuppressWarnings("unchecked") public CoreLabel(Label label) { super(0); if (label instanceof CoreMap) { CoreMap cl = (CoreMap) label; setCapacity(cl.size()); for (Class key : cl.keySet()) { set(key, cl.get(key)); } } else if (label instanceof IndexedWord) { CoreMap cl = ((IndexedWord) label).backingLabel(); setCapacity(cl.size()); for (Class key : cl.keySet()) { set(key, cl.get(key)); } } else { if (label instanceof HasWord) { setWord(((HasWord)label).word()); } setValue(label.value()); } } /** * This constructor attempts to parse the String keys * into Class keys. It's mainly useful for reading from * a file. A best effort attempt is made to correctly * parse the keys according to the String lookup function * in {@link CoreAnnotations}. * * @param keys Array of Strings that are class names * @param values Array of values (as String) */ public CoreLabel(String[] keys, String[] values) { super(keys.length); //this.map = new ArrayCoreMap(); initFromStrings(keys, values); } /** This is provided as a simple way to make a CoreLabel for a word from a String. * It's often useful in fixup or test code. It sets all three of the Text, OriginalText, * and Value annotations to the given value. * * @param word The word string to make a CoreLabel for * @return A CoreLabel for this word string */ public static CoreLabel wordFromString(String word) { CoreLabel cl = new CoreLabel(); cl.setWord(word); cl.setOriginalText(word); cl.setValue(word); return cl; } /** * Class that all "generic" annotations extend. * This allows you to read in arbitrary values from a file as features, for example. */ public interface GenericAnnotation<T> extends CoreAnnotation<T> { } //Unchecked is below because eclipse can't handle the level of type inference if we correctly parametrize GenericAnnotation with String @SuppressWarnings("unchecked") public static final Map<String, Class<? extends GenericAnnotation>> genericKeys = Generics.newHashMap(); @SuppressWarnings("unchecked") public static final Map<Class<? extends GenericAnnotation>, String> genericValues = Generics.newHashMap(); @SuppressWarnings("unchecked") private void initFromStrings(String[] keys, String[] values) { if (keys.length != values.length) { throw new UnsupportedOperationException("Argument array lengths differ: " + Arrays.toString(keys) + " vs. " + Arrays.toString(values)); } for (int i = 0; i < keys.length; i++) { String key = keys[i]; String value = values[i]; Class coreKeyClass = AnnotationLookup.toCoreKey(key); //now work with the key we got above if (coreKeyClass == null) { if (key != null) { throw new UnsupportedOperationException("Unknown key " + key); } // It used to be that the following code let you put unknown keys // in the CoreLabel. However, you can't create classes dynamically // at run time, which meant only one of these classes could ever // exist, which meant multiple unknown keys would clobber each // other and be very annoying. It's easier just to not allow // it at all. // If it becomes possible to create classes dynamically, // we could add this code back. //if(genericKeys.containsKey(key)) { // this.set(genericKeys.get(key), value); //} else { // GenericAnnotation<String> newKey = new GenericAnnotation<String>() { // public Class<String> getType() { return String.class;} }; // this.set(newKey.getClass(), values[i]); // genericKeys.put(keys[i], newKey.getClass()); // genericValues.put(newKey.getClass(), keys[i]); //} // unknown key; ignore //if (VERBOSE) { // log.info("CORE: CoreLabel.fromAbstractMapLabel: " + // "Unknown key "+key); //} } else { try { Class<?> valueClass = AnnotationLookup.getValueType(coreKeyClass); if(valueClass.equals(String.class)) { this.set(coreKeyClass, values[i]); } else if(valueClass == Integer.class) { this.set(coreKeyClass, Integer.parseInt(values[i])); } else if(valueClass == Double.class) { this.set(coreKeyClass, Double.parseDouble(values[i])); } else if(valueClass == Long.class) { this.set(coreKeyClass, Long.parseLong(values[i])); } else { throw new RuntimeException("Can't handle " + valueClass); } } catch (Exception e) { // unexpected value type throw new UnsupportedOperationException("CORE: CoreLabel.initFromStrings: " + "Bad type for " + key + ". Value was: " + value + "; expected "+AnnotationLookup.getValueType(coreKeyClass), e); } } } } private static class CoreLabelFactory implements LabelFactory { @Override public Label newLabel(String labelStr) { CoreLabel label = new CoreLabel(); label.setValue(labelStr); return label; } @Override public Label newLabel(String labelStr, int options) { return newLabel(labelStr); } @Override public Label newLabel(Label oldLabel) { if (oldLabel instanceof CoreLabel) { return new CoreLabel((CoreLabel)oldLabel); } else { //Map the old interfaces to the correct key/value pairs //Don't need to worry about HasIndex, which doesn't appear in any legacy code CoreLabel label = new CoreLabel(); if (oldLabel instanceof HasWord) label.setWord(((HasWord) oldLabel).word()); if (oldLabel instanceof HasTag) label.setTag(((HasTag) oldLabel).tag()); if (oldLabel instanceof HasOffset) { label.setBeginPosition(((HasOffset) oldLabel).beginPosition()); label.setEndPosition(((HasOffset) oldLabel).endPosition()); } if (oldLabel instanceof HasCategory) label.setCategory(((HasCategory) oldLabel).category()); if (oldLabel instanceof HasIndex) label.setIndex(((HasIndex) oldLabel).index()); label.setValue(oldLabel.value()); return label; } } @Override public Label newLabelFromString(String encodedLabelStr) { throw new UnsupportedOperationException("This code branch left blank" + " because we do not understand what this method should do."); } } /** * Return a factory for this kind of label * * @return The label factory */ public static LabelFactory factory() { return new CoreLabelFactory(); } /** * {@inheritDoc} */ @Override public LabelFactory labelFactory() { return CoreLabel.factory(); } /** * {@inheritDoc} */ @Override public <KEY extends Key<String>> String getString(Class<KEY> key) { return this.getString(key, ""); } @Override public <KEY extends Key<String>> String getString(Class<KEY> key, String def) { String value = get(key); if (value == null) { return def; } return value; } /** * {@inheritDoc} */ @Override public void setFromString(String labelStr) { throw new UnsupportedOperationException("Cannot set from string"); } /** * {@inheritDoc} */ @Override public final void setValue(String value) { set(CoreAnnotations.ValueAnnotation.class, value); } /** * {@inheritDoc} */ @Override public final String value() { return get(CoreAnnotations.ValueAnnotation.class); } /** * Set the word value for the label. Also, clears the lemma, since * that may have changed if the word changed. */ @Override public void setWord(String word) { String originalWord = get(CoreAnnotations.TextAnnotation.class); set(CoreAnnotations.TextAnnotation.class, word); // Pado feb 09: if you change the word, delete the lemma. // Gabor dec 2012: check if there was a real change -- this remove is actually rather expensive if it gets called a lot // todo [cdm 2015]: probably no one now knows why this was even needed, but maybe it should just be removed. It's kind of weird. if (word != null && !word.equals(originalWord) && containsKey(CoreAnnotations.LemmaAnnotation.class)) { remove(CoreAnnotations.LemmaAnnotation.class); } } /** * {@inheritDoc} */ @Override public String word() { return get(CoreAnnotations.TextAnnotation.class); } /** * {@inheritDoc} */ @Override public void setTag(String tag) { set(CoreAnnotations.PartOfSpeechAnnotation.class, tag); } /** * {@inheritDoc} */ @Override public String tag() { return get(CoreAnnotations.PartOfSpeechAnnotation.class); } /** * {@inheritDoc} */ @Override public void setCategory(String category) { set(CoreAnnotations.CategoryAnnotation.class, category); } /** * {@inheritDoc} */ @Override public String category() { return get(CoreAnnotations.CategoryAnnotation.class); } /** * {@inheritDoc} */ @Override public void setAfter(String after) { set(CoreAnnotations.AfterAnnotation.class, after); } /** * {@inheritDoc} */ @Override public String after() { return getString(CoreAnnotations.AfterAnnotation.class); } /** * {@inheritDoc} */ @Override public void setBefore(String before) { set(CoreAnnotations.BeforeAnnotation.class, before); } /** * {@inheritDoc} */ @Override public String before() { return getString(CoreAnnotations.BeforeAnnotation.class); } /** * {@inheritDoc} */ @Override public void setOriginalText(String originalText) { set(CoreAnnotations.OriginalTextAnnotation.class, originalText); } /** * {@inheritDoc} */ @Override public String originalText() { return getString(CoreAnnotations.OriginalTextAnnotation.class); } /** * {@inheritDoc} */ @Override public String docID() { return get(CoreAnnotations.DocIDAnnotation.class); } /** * {@inheritDoc} */ @Override public void setDocID(String docID) { set(CoreAnnotations.DocIDAnnotation.class, docID); } /** * {@inheritDoc} */ @Override public String ner() { return get(CoreAnnotations.NamedEntityTagAnnotation.class); } /** * {@inheritDoc} */ @Override public void setNER(String ner) { set(CoreAnnotations.NamedEntityTagAnnotation.class, ner); } /** * {@inheritDoc} */ @Override public String lemma() { return get(CoreAnnotations.LemmaAnnotation.class); } /** * {@inheritDoc} */ @Override public void setLemma(String lemma) { set(CoreAnnotations.LemmaAnnotation.class, lemma); } /** * {@inheritDoc} */ @Override public int index() { Integer n = get(CoreAnnotations.IndexAnnotation.class); if(n == null) return -1; return n; } /** * {@inheritDoc} */ @Override public void setIndex(int index) { set(CoreAnnotations.IndexAnnotation.class, index); } /** * {@inheritDoc} */ @Override public int sentIndex() { Integer n = get(CoreAnnotations.SentenceIndexAnnotation.class); if(n == null) return -1; return n; } /** * {@inheritDoc} */ @Override public void setSentIndex(int sentIndex) { set(CoreAnnotations.SentenceIndexAnnotation.class, sentIndex); } /** * {@inheritDoc} */ @Override public int beginPosition() { Integer i = get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); if(i != null) return i; return -1; } /** * {@inheritDoc} */ @Override public int endPosition() { Integer i = get(CoreAnnotations.CharacterOffsetEndAnnotation.class); if(i != null) return i; return -1; } /** * {@inheritDoc} */ @Override public void setBeginPosition(int beginPos) { set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, beginPos); } /** * {@inheritDoc} */ @Override public void setEndPosition(int endPos) { set(CoreAnnotations.CharacterOffsetEndAnnotation.class, endPos); } /** * Tag separator to use by default */ public static final String TAG_SEPARATOR = "/"; public enum OutputFormat { VALUE_INDEX, VALUE, VALUE_TAG, VALUE_TAG_INDEX, MAP, VALUE_MAP, VALUE_INDEX_MAP, WORD, WORD_INDEX, VALUE_TAG_NER, LEMMA_INDEX, ALL } public static final OutputFormat DEFAULT_FORMAT = OutputFormat.VALUE_INDEX; @Override public String toString() { return toString(DEFAULT_FORMAT); } /** * Returns a formatted string representing this label. The * desired format is passed in as a {@code String}. * Currently supported formats include: * <ul> * <li>"value": just prints the value</li> * <li>"{map}": prints the complete map</li> * <li>"value{map}": prints the value followed by the contained * map (less the map entry containing key {@code CATEGORY_KEY})</li> * <li>"value-index": extracts a value and an integer index from * the contained map using keys {@code INDEX_KEY}, * respectively, and prints them with a hyphen in between</li> * <li>"value-tag" * <li>"value-tag-index" * <li>"value-index{map}": a combination of the above; the index is * displayed first and then not shown in the map that is displayed</li> * <li>"word": Just the value of HEAD_WORD_KEY in the map</li> * </ul> * <p/> * Map is printed in alphabetical order of keys. */ @SuppressWarnings("unchecked") public String toString(OutputFormat format) { StringBuilder buf = new StringBuilder(); switch(format) { case VALUE: buf.append(value()); break; case MAP: { Map map2 = new TreeMap(); for(Class key : this.keySet()) { map2.put(key.getName(), get(key)); } buf.append(map2); break; } case VALUE_MAP: { buf.append(value()); Map map2 = new TreeMap(asClassComparator); for(Class key : this.keySet()) { map2.put(key, get(key)); } map2.remove(CoreAnnotations.ValueAnnotation.class); buf.append(map2); break; } case VALUE_INDEX: { buf.append(value()); Integer index = this.get(CoreAnnotations.IndexAnnotation.class); if (index != null) { buf.append('-').append((index).intValue()); } break; } case VALUE_TAG: { buf.append(value()); String tag = tag(); if (tag != null) { buf.append(TAG_SEPARATOR).append(tag); } break; } case VALUE_TAG_INDEX: { buf.append(value()); String tag = tag(); if (tag != null) { buf.append(TAG_SEPARATOR).append(tag); } Integer index = this.get(CoreAnnotations.IndexAnnotation.class); if (index != null) { buf.append('-').append((index).intValue()); } break; } case VALUE_INDEX_MAP: { buf.append(value()); Integer index = this.get(CoreAnnotations.IndexAnnotation.class); if (index != null) { buf.append('-').append((index).intValue()); } Map<String,Object> map2 = new TreeMap<>(); for(Class key : this.keySet()) { String cls = key.getName(); // special shortening of all the Annotation classes int idx = cls.indexOf('$'); if (idx >= 0) { cls = cls.substring(idx + 1); } map2.put(cls, this.get(key)); } map2.remove("IndexAnnotation"); map2.remove("ValueAnnotation"); if (!map2.isEmpty()) { buf.append(map2); } break; } case WORD: // TODO: maybe we should unify word() and value(). [cdm 2015] I think not, rather maybe remove value and redefine category. buf.append(word()); break; case WORD_INDEX: { buf.append(this.get(CoreAnnotations.TextAnnotation.class)); Integer index = this.get(CoreAnnotations.IndexAnnotation.class); if (index != null) { buf.append('-').append((index).intValue()); } break; } case VALUE_TAG_NER:{ buf.append(value()); String tag = tag(); if (tag != null) { buf.append(TAG_SEPARATOR).append(tag); } if(ner() != null){ buf.append(TAG_SEPARATOR).append(ner()); } break; } case LEMMA_INDEX: buf.append(lemma()); Integer index = this.get(CoreAnnotations.IndexAnnotation.class); if (index != null) { buf.append('-').append((index).intValue()); } break; case ALL:{ for(Class en: this.keySet()){ buf.append(';').append(en).append(':').append(this.get(en)); } break; } default: throw new IllegalArgumentException("Unknown format " + format); } return buf.toString(); } private static final Comparator<Class<?>> asClassComparator = (o1, o2) -> o1.getName().compareTo(o2.getName()); }