package edu.stanford.nlp.ling;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;
import java.util.TreeMap;
import java.util.function.Consumer;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
/**
* A CoreLabel represents a single word with ancillary information
* attached using CoreAnnotations.
* A CoreLabel also provides convenient methods to access tags,
* lemmas, etc. (if the proper annotations are set).
* <p>
* A CoreLabel is a Map from keys (which are Class objects) to values,
* whose type is determined by the key. That is, it is a heterogeneous
* typesafe Map (see Josh Bloch, Effective Java, 2nd edition).
* <p>
* The CoreLabel class in particular bridges the gap between old-style JavaNLP
* Labels and the new CoreMap infrastructure. Instances of this class can be
* used (almost) anywhere that the now-defunct FeatureLabel family could be
* used. This data structure is backed by an {@link ArrayCoreMap}.
*
* @author dramage
* @author rafferty
*/
public class CoreLabel extends ArrayCoreMap implements AbstractCoreLabel, HasCategory, HasContext {
/** Serialization version identifier of this class. */
private static final long serialVersionUID = 2L;
// /**
// * Should warnings be printed when converting from MapLabel family.
// */
// private static final boolean VERBOSE = false;
/** Default constructor, calls super() */
public CoreLabel() {
super();
}
/**
* Initializes this CoreLabel, pre-allocating arrays to hold
* up to capacity key,value pairs. This array will grow if necessary.
* The capacity is handed unchanged to the superclass constructor.
*
* @param capacity Initial capacity of object in key,value pairs
*/
public CoreLabel(int capacity) {
super(capacity);
}
/**
* Returns a new CoreLabel instance based on the contents of the given
* CoreLabel. It copies the contents of the other CoreLabel.
* <i>Implementation note:</i> this is the same as the constructor
* that takes a CoreMap (it simply delegates to it), but it is needed to
* ensure a unique most specific type for selecting a constructor at
* compile-time.
*
* @param label The CoreLabel to copy
*/
public CoreLabel(CoreLabel label) {
this((CoreMap) label);
}
/**
 * Returns a new CoreLabel instance based on the contents of the given
 * CoreMap. Every key of the other map is copied together with its value;
 * the value objects themselves are shared, not cloned.
 * <p>
 * While the entries are copied, the static {@code ArrayCoreMap.listener} is
 * set to {@code null} so that the copy is not reported to it. The previous
 * listener is put back afterwards, also when copying fails with an
 * exception.
 *
 * @param label The CoreMap to copy
 */
@SuppressWarnings({"unchecked"})
public CoreLabel(CoreMap label) {
  super(label.size());
  // don't listen to the clone operation
  final Consumer<Class<? extends Key<?>>> savedListener = ArrayCoreMap.listener;
  ArrayCoreMap.listener = null;
  try {
    for (Class key : label.keySet()) {
      set(key, label.get(key));
    }
  } finally {
    // Restore unconditionally: the listener is static state, so leaving it
    // null after a failed copy would silently disable it for everyone.
    ArrayCoreMap.listener = savedListener;
  }
}
/**
* Returns a new CoreLabel instance based on the contents of the given
* label. Warning: The behavior of this method is a bit disjunctive!
* If label is a CoreMap (including CoreLabel), then its entire
* contents is copied into this label.
* If label is an IndexedWord, then the backing label is copied over
* entirely.
* But, otherwise, just the
* value() and word iff it implements {@link HasWord} is copied.
*
* @param label Basis for this label
*/
@SuppressWarnings("unchecked")
public CoreLabel(Label label) {
super(0);
if (label instanceof CoreMap) {
CoreMap cl = (CoreMap) label;
setCapacity(cl.size());
for (Class key : cl.keySet()) {
set(key, cl.get(key));
}
} else if (label instanceof IndexedWord) {
CoreMap cl = ((IndexedWord) label).backingLabel();
setCapacity(cl.size());
for (Class key : cl.keySet()) {
set(key, cl.get(key));
}
} else {
if (label instanceof HasWord) {
setWord(((HasWord)label).word());
}
setValue(label.value());
}
}
/**
* This constructor attempts to parse the String keys
* into Class keys. It's mainly useful for reading from
* a file. A best effort attempt is made to correctly
* parse the keys according to the String lookup function
* in {@link CoreAnnotations}.
* <p>
* The backing map is created with a capacity of {@code keys.length}, and the
* actual work is done by {@code initFromStrings(keys, values)}.
*
* @param keys Array of Strings that are class names
* @param values Array of values (as String)
* @throws UnsupportedOperationException if the two arrays differ in length,
* if a non-null key is unknown, or if a value cannot be converted to the
* type expected for its key
*/
public CoreLabel(String[] keys, String[] values) {
super(keys.length);
initFromStrings(keys, values);
}
/**
 * Convenience factory that builds a CoreLabel for a word given as a String.
 * It is often useful in fixup or test code. The Text, OriginalText and
 * Value annotations are all set to the given string.
 *
 * @param word The word string to make a CoreLabel for
 * @return A CoreLabel for this word string
 */
public static CoreLabel wordFromString(String word) {
  final CoreLabel label = new CoreLabel();
  label.setWord(word);
  label.setOriginalText(word);
  label.setValue(word);
  return label;
}
/**
* Class that all "generic" annotations extend.
* This allows you to read in arbitrary values from a file as features, for example.
* It is a marker interface: it declares no members of its own beyond those
* of {@link CoreAnnotation}.
*
* @param <T> Type argument passed on to {@link CoreAnnotation}
*/
public interface GenericAnnotation<T> extends CoreAnnotation<T> { }
/**
* Publicly accessible map from Strings (presumably key names - confirm
* against callers) to {@link GenericAnnotation} key classes.
* Nothing in this class reads or writes it.
*/
//Unchecked is below because eclipse can't handle the level of type inference if we correctly parametrize GenericAnnotation with String
@SuppressWarnings("unchecked")
public static final Map<String, Class<? extends GenericAnnotation>> genericKeys = Generics.newHashMap();
/**
* Publicly accessible map from {@link GenericAnnotation} key classes to
* Strings (presumably the corresponding key names - confirm against callers).
* Nothing in this class reads or writes it.
*/
@SuppressWarnings("unchecked")
public static final Map<Class<? extends GenericAnnotation>, String> genericValues = Generics.newHashMap();
/**
 * Fills this label from two parallel arrays of key names and values.
 * Each key name is resolved with {@code AnnotationLookup.toCoreKey}. The
 * value is stored unchanged when the key's value type is String, and is
 * parsed first when the value type is Integer, Double or Long.
 * A {@code null} key name that cannot be resolved is skipped silently.
 *
 * @param keys Names of the annotation keys
 * @param values Values (as String), in the same order as {@code keys}
 * @throws UnsupportedOperationException if the arrays differ in length, if a
 *     non-null key name is unknown, or if a value cannot be stored because
 *     its key has an unsupported value type or the String cannot be parsed
 */
@SuppressWarnings("unchecked")
private void initFromStrings(String[] keys, String[] values) {
  if (keys.length != values.length) {
    throw new UnsupportedOperationException("Argument array lengths differ: " +
        Arrays.toString(keys) + " vs. " + Arrays.toString(values));
  }

  for (int i = 0; i < keys.length; i++) {
    final String key = keys[i];
    final String value = values[i];
    final Class coreKeyClass = AnnotationLookup.toCoreKey(key);

    if (coreKeyClass == null) {
      // Unknown keys used to be stored under a dynamically created
      // "generic" annotation class. However, classes cannot be created
      // dynamically at run time, so only one such class could ever exist
      // and multiple unknown keys would clobber each other. It is easier
      // not to allow unknown keys at all; if dynamic class creation ever
      // becomes possible, that support could be added back.
      if (key != null) {
        throw new UnsupportedOperationException("Unknown key " + key);
      }
      continue;
    }

    try {
      final Class<?> valueClass = AnnotationLookup.getValueType(coreKeyClass);
      final Object converted;
      if (valueClass.equals(String.class)) {
        converted = value;
      } else if (valueClass == Integer.class) {
        converted = Integer.parseInt(value);
      } else if (valueClass == Double.class) {
        converted = Double.parseDouble(value);
      } else if (valueClass == Long.class) {
        converted = Long.parseLong(value);
      } else {
        // Caught just below and reported as a bad type for this key.
        throw new RuntimeException("Can't handle " + valueClass);
      }
      this.set(coreKeyClass, converted);
    } catch (Exception e) {
      // unexpected value type
      throw new UnsupportedOperationException("CORE: CoreLabel.initFromStrings: "
          + "Bad type for " + key
          + ". Value was: " + value
          + "; expected "+AnnotationLookup.getValueType(coreKeyClass), e);
    }
  }
}
/**
 * {@link LabelFactory} whose labels are {@link CoreLabel} instances.
 */
private static class CoreLabelFactory implements LabelFactory {

  /** Creates a new CoreLabel whose value is the given string. */
  @Override
  public Label newLabel(String labelStr) {
    final CoreLabel created = new CoreLabel();
    created.setValue(labelStr);
    return created;
  }

  /** Same as {@link #newLabel(String)}; the options argument is ignored. */
  @Override
  public Label newLabel(String labelStr, int options) {
    return newLabel(labelStr);
  }

  /**
   * Creates a new CoreLabel from an existing label. A CoreLabel is copied
   * with the copy constructor. For any other label, the information exposed
   * through the HasWord, HasTag, HasOffset, HasCategory and HasIndex
   * interfaces is transferred, followed by the label's value.
   */
  @Override
  public Label newLabel(Label oldLabel) {
    if (oldLabel instanceof CoreLabel) {
      return new CoreLabel((CoreLabel) oldLabel);
    }

    // Map the old interfaces to the correct key/value pairs
    final CoreLabel converted = new CoreLabel();
    if (oldLabel instanceof HasWord) {
      converted.setWord(((HasWord) oldLabel).word());
    }
    if (oldLabel instanceof HasTag) {
      converted.setTag(((HasTag) oldLabel).tag());
    }
    if (oldLabel instanceof HasOffset) {
      final HasOffset span = (HasOffset) oldLabel;
      converted.setBeginPosition(span.beginPosition());
      converted.setEndPosition(span.endPosition());
    }
    if (oldLabel instanceof HasCategory) {
      converted.setCategory(((HasCategory) oldLabel).category());
    }
    if (oldLabel instanceof HasIndex) {
      converted.setIndex(((HasIndex) oldLabel).index());
    }
    converted.setValue(oldLabel.value());
    return converted;
  }

  /**
   * Not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public Label newLabelFromString(String encodedLabelStr) {
    throw new UnsupportedOperationException("This code branch left blank" +
        " because we do not understand what this method should do.");
  }
}
/**
* Return a factory for this kind of label.
* A new factory object is created on every call.
*
* @return The label factory
*/
public static LabelFactory factory() {
return new CoreLabelFactory();
}
/**
 * {@inheritDoc}
 * <p>
 * Returns the same kind of factory as the static {@link #factory()} method.
 */
@Override
public LabelFactory labelFactory() {
  return factory();
}
/**
 * {@inheritDoc}
 * <p>
 * Equivalent to {@link #getString(Class, String)} with the empty string as
 * the default.
 */
@Override
public <KEY extends Key<String>> String getString(Class<KEY> key) {
  return getString(key, "");
}
/**
 * Looks up the String stored under the given key and falls back to a
 * default when the lookup yields {@code null}.
 *
 * @param key The key to look up
 * @param def The value to return when nothing is found
 * @return The stored String, or {@code def} if the lookup yields null
 */
@Override
public <KEY extends Key<String>> String getString(Class<KEY> key, String def) {
  final String stored = get(key);
  return (stored != null) ? stored : def;
}
/**
* {@inheritDoc}
* <p>
* Not supported by CoreLabel.
*
* @throws UnsupportedOperationException always
*/
@Override
public void setFromString(String labelStr) {
throw new UnsupportedOperationException("Cannot set from string");
}
/**
 * {@inheritDoc}
 * <p>
 * The value is stored under {@link CoreAnnotations.ValueAnnotation}.
 */
@Override
public final void setValue(String value) {
  this.set(CoreAnnotations.ValueAnnotation.class, value);
}
/**
 * {@inheritDoc}
 * <p>
 * The value is read from {@link CoreAnnotations.ValueAnnotation}.
 */
@Override
public final String value() {
  return this.get(CoreAnnotations.ValueAnnotation.class);
}
/**
 * Set the word value for the label, stored under
 * {@link CoreAnnotations.TextAnnotation}. Also removes the lemma when the
 * new word is non-null and differs from the previously stored word, since
 * the lemma may no longer match.
 *
 * @param word The new word
 */
@Override
public void setWord(String word) {
  final String previousWord = get(CoreAnnotations.TextAnnotation.class);
  set(CoreAnnotations.TextAnnotation.class, word);

  // Pado feb 09: if you change the word, delete the lemma.
  // Gabor dec 2012: only do so after a real change -- the remove is actually
  // rather expensive if it gets called a lot.
  // todo [cdm 2015]: probably no one now knows why this was even needed;
  // maybe it should just be removed. It's kind of weird.
  final boolean wordChanged = word != null && !word.equals(previousWord);
  if (wordChanged && containsKey(CoreAnnotations.LemmaAnnotation.class)) {
    remove(CoreAnnotations.LemmaAnnotation.class);
  }
}
/**
 * {@inheritDoc}
 * <p>
 * The word is read from {@link CoreAnnotations.TextAnnotation}.
 */
@Override
public String word() {
  return this.get(CoreAnnotations.TextAnnotation.class);
}
/**
 * {@inheritDoc}
 * <p>
 * The tag is stored under {@link CoreAnnotations.PartOfSpeechAnnotation}.
 */
@Override
public void setTag(String tag) {
  this.set(CoreAnnotations.PartOfSpeechAnnotation.class, tag);
}
/**
 * {@inheritDoc}
 * <p>
 * The tag is read from {@link CoreAnnotations.PartOfSpeechAnnotation}.
 */
@Override
public String tag() {
  return this.get(CoreAnnotations.PartOfSpeechAnnotation.class);
}
/**
 * {@inheritDoc}
 * <p>
 * The category is stored under {@link CoreAnnotations.CategoryAnnotation}.
 */
@Override
public void setCategory(String category) {
  this.set(CoreAnnotations.CategoryAnnotation.class, category);
}
/**
 * {@inheritDoc}
 * <p>
 * The category is read from {@link CoreAnnotations.CategoryAnnotation}.
 */
@Override
public String category() {
  return this.get(CoreAnnotations.CategoryAnnotation.class);
}
/**
 * {@inheritDoc}
 * <p>
 * The string is stored under {@link CoreAnnotations.AfterAnnotation}.
 */
@Override
public void setAfter(String after) {
  this.set(CoreAnnotations.AfterAnnotation.class, after);
}
/**
 * {@inheritDoc}
 * <p>
 * Reads {@link CoreAnnotations.AfterAnnotation} through
 * {@link #getString(Class)}, so the empty string is returned when the
 * lookup yields null.
 */
@Override
public String after() {
  return this.getString(CoreAnnotations.AfterAnnotation.class);
}
/**
 * {@inheritDoc}
 * <p>
 * The string is stored under {@link CoreAnnotations.BeforeAnnotation}.
 */
@Override
public void setBefore(String before) {
  this.set(CoreAnnotations.BeforeAnnotation.class, before);
}
/**
 * {@inheritDoc}
 * <p>
 * Reads {@link CoreAnnotations.BeforeAnnotation} through
 * {@link #getString(Class)}, so the empty string is returned when the
 * lookup yields null.
 */
@Override
public String before() {
  return this.getString(CoreAnnotations.BeforeAnnotation.class);
}
/**
 * {@inheritDoc}
 * <p>
 * The text is stored under {@link CoreAnnotations.OriginalTextAnnotation}.
 */
@Override
public void setOriginalText(String originalText) {
  this.set(CoreAnnotations.OriginalTextAnnotation.class, originalText);
}
/**
 * {@inheritDoc}
 * <p>
 * Reads {@link CoreAnnotations.OriginalTextAnnotation} through
 * {@link #getString(Class)}, so the empty string is returned when the
 * lookup yields null.
 */
@Override
public String originalText() {
  return this.getString(CoreAnnotations.OriginalTextAnnotation.class);
}
/**
 * {@inheritDoc}
 * <p>
 * The document ID is read from {@link CoreAnnotations.DocIDAnnotation}.
 */
@Override
public String docID() {
  return this.get(CoreAnnotations.DocIDAnnotation.class);
}
/**
 * {@inheritDoc}
 * <p>
 * The document ID is stored under {@link CoreAnnotations.DocIDAnnotation}.
 */
@Override
public void setDocID(String docID) {
  this.set(CoreAnnotations.DocIDAnnotation.class, docID);
}
/**
 * {@inheritDoc}
 * <p>
 * The tag is read from {@link CoreAnnotations.NamedEntityTagAnnotation}.
 */
@Override
public String ner() {
  return this.get(CoreAnnotations.NamedEntityTagAnnotation.class);
}
/**
 * {@inheritDoc}
 * <p>
 * The tag is stored under {@link CoreAnnotations.NamedEntityTagAnnotation}.
 */
@Override
public void setNER(String ner) {
  this.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
}
/**
 * {@inheritDoc}
 * <p>
 * The lemma is read from {@link CoreAnnotations.LemmaAnnotation}.
 */
@Override
public String lemma() {
  return this.get(CoreAnnotations.LemmaAnnotation.class);
}
/**
 * {@inheritDoc}
 * <p>
 * The lemma is stored under {@link CoreAnnotations.LemmaAnnotation}.
 */
@Override
public void setLemma(String lemma) {
  this.set(CoreAnnotations.LemmaAnnotation.class, lemma);
}
/**
 * {@inheritDoc}
 * <p>
 * The index is read from {@link CoreAnnotations.IndexAnnotation};
 * {@code -1} is returned when that lookup yields null.
 */
@Override
public int index() {
  final Integer stored = get(CoreAnnotations.IndexAnnotation.class);
  return (stored == null) ? -1 : stored;
}
/**
 * {@inheritDoc}
 * <p>
 * The index is stored under {@link CoreAnnotations.IndexAnnotation}.
 */
@Override
public void setIndex(int index) {
  this.set(CoreAnnotations.IndexAnnotation.class, index);
}
/**
 * {@inheritDoc}
 * <p>
 * The sentence index is read from
 * {@link CoreAnnotations.SentenceIndexAnnotation}; {@code -1} is returned
 * when that lookup yields null.
 */
@Override
public int sentIndex() {
  final Integer stored = get(CoreAnnotations.SentenceIndexAnnotation.class);
  return (stored == null) ? -1 : stored;
}
/**
 * {@inheritDoc}
 * <p>
 * The sentence index is stored under
 * {@link CoreAnnotations.SentenceIndexAnnotation}.
 */
@Override
public void setSentIndex(int sentIndex) {
  this.set(CoreAnnotations.SentenceIndexAnnotation.class, sentIndex);
}
/**
 * {@inheritDoc}
 * <p>
 * The offset is read from
 * {@link CoreAnnotations.CharacterOffsetBeginAnnotation}; {@code -1} is
 * returned when that lookup yields null.
 */
@Override
public int beginPosition() {
  final Integer offset = get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
  return (offset == null) ? -1 : offset;
}
/**
 * {@inheritDoc}
 * <p>
 * The offset is read from
 * {@link CoreAnnotations.CharacterOffsetEndAnnotation}; {@code -1} is
 * returned when that lookup yields null.
 */
@Override
public int endPosition() {
  final Integer offset = get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
  return (offset == null) ? -1 : offset;
}
/**
 * {@inheritDoc}
 * <p>
 * The offset is stored under
 * {@link CoreAnnotations.CharacterOffsetBeginAnnotation}.
 */
@Override
public void setBeginPosition(int beginPos) {
  this.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, beginPos);
}
/**
 * {@inheritDoc}
 * <p>
 * The offset is stored under
 * {@link CoreAnnotations.CharacterOffsetEndAnnotation}.
 */
@Override
public void setEndPosition(int endPos) {
  this.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, endPos);
}
/**
* Tag separator to use by default. {@code toString(OutputFormat)} places it
* between the value and the tag, and between the tag and the NER tag.
*/
public static final String TAG_SEPARATOR = "/";
/**
 * The output formats understood by {@code toString(OutputFormat)}.
 * The constants are kept in their established declaration order.
 */
public enum OutputFormat {
  /** The value, then a hyphen and the index if an index is set. */
  VALUE_INDEX,
  /** Just the value. */
  VALUE,
  /** The value, then the tag separator and the tag if a tag is set. */
  VALUE_TAG,
  /** The value, then the tag (if set), then the index (if set). */
  VALUE_TAG_INDEX,
  /** The complete map, keyed by the full names of the key classes. */
  MAP,
  /** The value followed by the map without the value entry. */
  VALUE_MAP,
  /** The value and index followed by the map without those two entries. */
  VALUE_INDEX_MAP,
  /** Just the word. */
  WORD,
  /** The word, then a hyphen and the index if an index is set. */
  WORD_INDEX,
  /** The value, then the tag (if set), then the NER tag (if set). */
  VALUE_TAG_NER,
  /** The lemma, then a hyphen and the index if an index is set. */
  LEMMA_INDEX,
  /** Every key and value of the label. */
  ALL
}
/** The format used by the no-argument {@code toString()}. */
public static final OutputFormat DEFAULT_FORMAT = OutputFormat.VALUE_INDEX;
/**
 * Renders this label in the {@link #DEFAULT_FORMAT}.
 *
 * @return The result of {@code toString(DEFAULT_FORMAT)}
 */
@Override
public String toString() {
  return this.toString(DEFAULT_FORMAT);
}
/**
 * Returns a formatted string representing this label. The desired format
 * is passed in as an {@link OutputFormat}. The supported formats are:
 * <ul>
 * <li>{@code VALUE}: just the value</li>
 * <li>{@code MAP}: the complete map, keyed by the full names of the key
 *     classes and printed in alphabetical order of those names</li>
 * <li>{@code VALUE_MAP}: the value followed by the map without the
 *     {@code ValueAnnotation} entry; keys are the key classes, ordered by
 *     class name</li>
 * <li>{@code VALUE_INDEX}: the value, then a hyphen and the index if an
 *     {@code IndexAnnotation} is present</li>
 * <li>{@code VALUE_TAG}: the value, then {@link #TAG_SEPARATOR} and the tag
 *     if the tag is not null</li>
 * <li>{@code VALUE_TAG_INDEX}: a combination of the two above, tag first</li>
 * <li>{@code VALUE_INDEX_MAP}: the value and index as for
 *     {@code VALUE_INDEX}, followed by a map keyed by shortened class names
 *     (the part after the first {@code '$'}) from which the entries named
 *     {@code IndexAnnotation} and {@code ValueAnnotation} are removed; the
 *     map is omitted when it is empty</li>
 * <li>{@code WORD}: just the word</li>
 * <li>{@code WORD_INDEX}: the {@code TextAnnotation}, then a hyphen and the
 *     index if present</li>
 * <li>{@code VALUE_TAG_NER}: the value, then the tag and the NER tag, each
 *     preceded by {@link #TAG_SEPARATOR} and each only if not null</li>
 * <li>{@code LEMMA_INDEX}: the lemma, then a hyphen and the index if
 *     present</li>
 * <li>{@code ALL}: for every key, a semicolon, the key, a colon and the
 *     value</li>
 * </ul>
 * A leading value, word or lemma that is null is rendered as the string
 * {@code "null"}.
 *
 * @param format The output format to use
 * @return This label rendered in the given format
 * @throws IllegalArgumentException if the format is not one handled here
 */
@SuppressWarnings("unchecked")
public String toString(OutputFormat format) {
  StringBuilder buf = new StringBuilder();
  switch (format) {
    case VALUE:
      buf.append(value());
      break;
    case MAP: {
      Map<String, Object> byName = new TreeMap<>();
      for (Class key : this.keySet()) {
        byName.put(key.getName(), get(key));
      }
      buf.append(byName);
      break;
    }
    case VALUE_MAP: {
      buf.append(value());
      Map<Class<?>, Object> byKey = new TreeMap<>(asClassComparator);
      for (Class key : this.keySet()) {
        byKey.put(key, get(key));
      }
      byKey.remove(CoreAnnotations.ValueAnnotation.class);
      buf.append(byKey);
      break;
    }
    case VALUE_INDEX:
      buf.append(value());
      appendIndexSuffix(buf);
      break;
    case VALUE_TAG:
      buf.append(value());
      appendTagSuffix(buf);
      break;
    case VALUE_TAG_INDEX:
      buf.append(value());
      appendTagSuffix(buf);
      appendIndexSuffix(buf);
      break;
    case VALUE_INDEX_MAP: {
      buf.append(value());
      appendIndexSuffix(buf);
      Map<String, Object> byShortName = new TreeMap<>();
      for (Class key : this.keySet()) {
        String cls = key.getName();
        // special shortening of all the Annotation classes
        int idx = cls.indexOf('$');
        if (idx >= 0) {
          cls = cls.substring(idx + 1);
        }
        byShortName.put(cls, this.get(key));
      }
      // Value and index were already printed in front of the map.
      byShortName.remove("IndexAnnotation");
      byShortName.remove("ValueAnnotation");
      if (!byShortName.isEmpty()) {
        buf.append(byShortName);
      }
      break;
    }
    case WORD:
      // TODO: maybe we should unify word() and value(). [cdm 2015] I think not, rather maybe remove value and redefine category.
      buf.append(word());
      break;
    case WORD_INDEX:
      buf.append(this.get(CoreAnnotations.TextAnnotation.class));
      appendIndexSuffix(buf);
      break;
    case VALUE_TAG_NER:
      buf.append(value());
      appendTagSuffix(buf);
      if (ner() != null) {
        buf.append(TAG_SEPARATOR).append(ner());
      }
      break;
    case LEMMA_INDEX:
      buf.append(lemma());
      appendIndexSuffix(buf);
      break;
    case ALL:
      for (Class key : this.keySet()) {
        buf.append(';').append(key).append(':').append(this.get(key));
      }
      break;
    default:
      throw new IllegalArgumentException("Unknown format " + format);
  }
  return buf.toString();
}

/** Appends a hyphen and the IndexAnnotation value to buf, if an index is present. */
private void appendIndexSuffix(StringBuilder buf) {
  Integer index = this.get(CoreAnnotations.IndexAnnotation.class);
  if (index != null) {
    buf.append('-').append(index.intValue());
  }
}

/** Appends TAG_SEPARATOR and the tag to buf, if the tag is not null. */
private void appendTagSuffix(StringBuilder buf) {
  String tag = tag();
  if (tag != null) {
    buf.append(TAG_SEPARATOR).append(tag);
  }
}
/** Orders classes alphabetically by the name returned from {@code getName()}. */
private static final Comparator<Class<?>> asClassComparator =
    (Class<?> left, Class<?> right) -> left.getName().compareTo(right.getName());
}