package edu.stanford.nlp.sequences;

import edu.stanford.nlp.util.logging.Redwood;

import java.util.List;
import java.util.Locale;
import java.util.Objects;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PaddedList;
import edu.stanford.nlp.util.TypesafeMap;

/** A static class with functions to convert lists of tokens between
 *  different IOB-style representations.
 *
 *  @author Christopher Manning
 */
public class IOBUtils {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(IOBUtils.class);

  private IOBUtils() {} // static methods

  /** The output encodings understood by {@link #entitySubclassify}. */
  private enum OutputStyle { IOB1, IOB2, IOE1, IOE2, IO, SBIEO, NOPREFIX, BILOU }

  /** Maps a (case-insensitive) style name onto an {@link OutputStyle}; throws on unknown names. */
  private static OutputStyle parseStyle(String style) {
    switch (style.toLowerCase(Locale.ENGLISH)) {
      case "iob1":
        return OutputStyle.IOB1;
      case "iob2":
      case "bio":
        return OutputStyle.IOB2;
      case "ioe1":
        return OutputStyle.IOE1;
      case "ioe2":
        return OutputStyle.IOE2;
      case "io":
        return OutputStyle.IO;
      case "sbieo":
      case "iobes":
        return OutputStyle.SBIEO;
      case "noprefix":
        return OutputStyle.NOPREFIX;
      case "bilou":
        return OutputStyle.BILOU;
      default:
        throw new IllegalArgumentException("entitySubclassify: unknown style: " + style);
    }
  }

  /** Whether a label has the form "C-Y+": one prefix character, a hyphen, then a non-empty category. */
  private static boolean hasPrefix(String label) {
    return label.length() > 2 && label.charAt(1) == '-';
  }

  /** The category part of a label: the text after "C-" if the label is prefixed, else the whole label. */
  private static String entityOf(String label) {
    return hasPrefix(label) ? label.substring(2) : label;
  }

  /** The upper-cased prefix character of a label, or a space if the label carries no prefix. */
  private static char prefixOf(String label) {
    return hasPrefix(label) ? Character.toUpperCase(label.charAt(0)) : ' ';
  }

  /** Reads the label stored under key, substituting backgroundLabel when the token has none. */
  private static String labelOrBackground(CoreMap token, Class<? extends TypesafeMap.Key<String>> key,
                                          String backgroundLabel) {
    String label = token.get(key);
    return label == null ? backgroundLabel : label;
  }

  /** Picks the prefix for a four-way (single / begin / inside / last) encoding. */
  private static String positionalPrefix(boolean isFirst, boolean isLast, String single, String last) {
    if (isFirst) {
      return isLast ? single : "B-";
    }
    return isLast ? last : "I-";
  }

  /** Builds the output label for one non-background token in the requested style. */
  private static String relabel(OutputStyle how, String base, boolean isFirst, boolean isLast,
                                boolean isStartAdjacentSame, boolean isEndAdjacentSame) {
    switch (how) {
      case IOB1: // only B if adjacent to a preceding entity of the same category
        return (isStartAdjacentSame ? "B-" : "I-") + base;
      case IOB2: // always B at start
        return (isFirst ? "B-" : "I-") + base;
      case IOE1: // only E if adjacent to a following entity of the same category
        return (isEndAdjacentSame ? "E-" : "I-") + base;
      case IOE2: // always E at end
        return (isLast ? "E-" : "I-") + base;
      case IO:
        return "I-" + base;
      case SBIEO:
        return positionalPrefix(isFirst, isLast, "S-", "E-") + base;
      case BILOU:
        return positionalPrefix(isFirst, isLast, "U-", "L-") + base;
      case NOPREFIX:
      default:
        // noprefix: the bare category is the label
        return base;
    }
  }

  /**
   * This can be used to map from any IOB-style (i.e., "I-PERS" style labels)
   * or just categories representation to any other.
   * It can read and change any representation to other representations:
   * a 4 way representation of all entities, like S-PERS, B-PERS,
   * I-PERS, E-PERS for single word, beginning, internal, and end of entity
   * (IOBES or SBIEO); always marking the first word of an entity (IOB2 or BIO);
   * only marking specially the beginning of non-first
   * items of an entity sequence with B-PERS (IOB1);
   * the reverse IOE1 and IOE2; IO where everything is I-tagged; and
   * NOPREFIX, where no prefixes are written on category labels.
   * The last two representations are deficient in not allowing adjacent
   * entities of the same class to be represented, but nevertheless
   * convenient.  Note that the background label is never given a prefix.
   * This code is very specific to the particular CoNLL way of labeling
   * classes for IOB-style encoding, but this notation is quite widespread.
   * It will work on any of these styles of input.
   * This will also recognize BILOU format (B=B, I=I, L=E, O=O, U=S).
   * It also works with lowercased names like i-org.
   * If the labels are not of the form "C-Y+", where C is a single character,
   * then they will be regarded as NOPREFIX labels.
   * This method updates the List tokens in place: the new label of each token
   * is written back under the same {@code key} it was read from.
   * A token that has no value under {@code key} is treated as carrying the
   * background label.
   *
   * @param tokens List of tokens (each a CoreLabel) in some style
   * @param key The key in the CoreLabel to change, commonly CoreAnnotations.AnswerAnnotation.class
   * @param backgroundLabel The background label, which gets special treatment; must not be null
   * @param style Output style (case-insensitive); one of iob1, iob2/bio, ioe1, ioe2, io,
   *              sbieo/iobes, bilou, noprefix
   * @param intern Whether to String-intern the new labels (may as well, small number!)
   * @throws IllegalArgumentException If style is not one of the recognized style names
   */
  public static <TOK extends CoreMap> void entitySubclassify(List<TOK> tokens,
                                                             Class<? extends TypesafeMap.Key<String>> key,
                                                             String backgroundLabel,
                                                             String style,
                                                             boolean intern) {
    OutputStyle how = parseStyle(style);

    // The padding element has no label, so positions before the start and after the end
    // of the list read as background.
    @SuppressWarnings("unchecked")
    TOK padding = (TOK) new CoreLabel();
    List<TOK> paddedTokens = new PaddedList<>(tokens, padding);
    int size = paddedTokens.size();

    // All new labels are computed before any is written, so that a token's neighbors
    // are always seen in the input representation.
    String[] newAnswers = new String[size];
    for (int i = 0; i < size; i++) {
      String cAns = labelOrBackground(paddedTokens.get(i), key, backgroundLabel);
      String pAns = labelOrBackground(paddedTokens.get(i - 1), key, backgroundLabel);
      String nAns = labelOrBackground(paddedTokens.get(i + 1), key, backgroundLabel);

      String base = entityOf(cAns);
      char prefix = prefixOf(cAns);
      String pBase = entityOf(pAns);
      char pPrefix = prefixOf(pAns);
      String nBase = entityOf(nAns);
      char nPrefix = prefixOf(nAns);

      boolean isStartAdjacentSame = isSameEntityBoundary(pBase, pPrefix, base, prefix);
      boolean isEndAdjacentSame = isSameEntityBoundary(base, prefix, nBase, nPrefix);
      boolean isFirst = isDifferentEntityBoundary(pBase, base) || isStartAdjacentSame;
      boolean isLast = isDifferentEntityBoundary(base, nBase) || isEndAdjacentSame;

      String newAnswer = base; // the background label never gets a prefix
      if ( ! base.equals(backgroundLabel)) {
        newAnswer = relabel(how, base, isFirst, isLast, isStartAdjacentSame, isEndAdjacentSame);
      }
      if (intern) {
        newAnswer = newAnswer.intern();
      }
      newAnswers[i] = newAnswer;
    }

    for (int i = 0; i < size; i++) {
      TOK c = tokens.get(i);
      c.set(key, newAnswers[i]);
    }
  }

  /**
   * Whether there is an entity boundary between two adjacent labels: either the
   * categories differ, or the prefixes say that the second label starts an entity
   * (B, S, U) or that the first label ends one (E, L, S, U).
   *
   * @param beforeEntity Category of the earlier label (without any prefix)
   * @param beforePrefix Upper-cased prefix character of the earlier label (space if none)
   * @param afterEntity Category of the later label (without any prefix)
   * @param afterPrefix Upper-cased prefix character of the later label (space if none)
   * @return true if an entity ends and/or begins between the two labels
   */
  public static boolean isEntityBoundary(String beforeEntity, char beforePrefix, String afterEntity, char afterPrefix) {
    return ! beforeEntity.equals(afterEntity) ||
            afterPrefix == 'B' || afterPrefix == 'S' || afterPrefix == 'U' ||
            beforePrefix == 'E' || beforePrefix == 'L' || beforePrefix == 'S' || beforePrefix == 'U';
  }

  /**
   * Whether two adjacent labels of the <i>same</i> category are nevertheless separated
   * by an entity boundary, as signaled by their prefixes (the second is B, S or U, or
   * the first is E, L, S or U).
   *
   * @param beforeEntity Category of the earlier label (without any prefix)
   * @param beforePrefix Upper-cased prefix character of the earlier label (space if none)
   * @param afterEntity Category of the later label (without any prefix)
   * @param afterPrefix Upper-cased prefix character of the later label (space if none)
   * @return true if the categories are equal and the prefixes mark a boundary
   */
  public static boolean isSameEntityBoundary(String beforeEntity, char beforePrefix, String afterEntity, char afterPrefix) {
    return beforeEntity.equals(afterEntity) &&
            (afterPrefix == 'B' || afterPrefix == 'S' || afterPrefix == 'U' ||
             beforePrefix == 'E' || beforePrefix == 'L' || beforePrefix == 'S' || beforePrefix == 'U');
  }

  /**
   * Whether two adjacent labels are separated by a boundary because their categories differ.
   *
   * @param beforeEntity Category of the earlier label (without any prefix)
   * @param afterEntity Category of the later label (without any prefix)
   * @return true if the two categories are not equal
   */
  public static boolean isDifferentEntityBoundary(String beforeEntity, String afterEntity) {
    return ! beforeEntity.equals(afterEntity);
  }

  /** For a sequence labeling task with multi-token entities, like NER,
   *  this works out TP, FN, FP counts that can be used for entity-level
   *  F1 results. This works with any kind of prefixed IOB labeling, or
   *  just with simply entity names (also treated as IO labeling).
   *  Note that if scoring fails part way through the document, the counters
   *  may already have been incremented for the entities seen before the failure.
   *
   *  @param doc The document (with Answer and GoldAnswer annotations) to score
   *  @param entityTP Counter from entity type to count of true positives
   *  @param entityFP Counter from entity type to count of false positives
   *  @param entityFN Counter from entity type to count of false negatives
   *  @param background The background symbol. Normally it isn't counted in entity-level
   *                    F1 scores. If you want it counted, pass in null for this.
   *  @return Whether scoring was successful (it'll only be unsuccessful if information
   *          is missing or ill-formed in the doc).
   */
  public static boolean countEntityResults(List<? extends CoreMap> doc,
                                           Counter<String> entityTP,
                                           Counter<String> entityFP,
                                           Counter<String> entityFN,
                                           String background) {
    boolean entityCorrect = true;

    // the annotations
    String previousGold = background;
    String previousGuess = background;
    // the part after the I- or B- in the annotation
    String previousGoldEntity = "";
    String previousGuessEntity = "";
    char previousGoldPrefix = ' ';
    char previousGuessPrefix = ' ';

    for (CoreMap word : doc) {
      String gold = word.get(CoreAnnotations.GoldAnswerAnnotation.class);
      String guess = word.get(CoreAnnotations.AnswerAnnotation.class);

      if (gold == null || gold.isEmpty()) {
        log.info("Missing gold entity");
        return false;
      }
      if (guess == null || guess.isEmpty()) {
        log.info("Missing guess entity");
        return false;
      }

      String goldEntity = entityOf(gold);
      char goldPrefix = prefixOf(gold);
      String guessEntity = entityOf(guess);
      char guessPrefix = prefixOf(guess);

      boolean goldBoundary = isEntityBoundary(previousGoldEntity, previousGoldPrefix, goldEntity, goldPrefix);
      boolean guessBoundary = isEntityBoundary(previousGuessEntity, previousGuessPrefix, guessEntity, guessPrefix);

      boolean newGold = ! gold.equals(background) && goldBoundary;
      boolean newGuess = ! guess.equals(background) && guessBoundary;
      // Objects.equals, because with a null background (background is to be counted)
      // the previous labels are themselves null before the first token.
      boolean goldEnded = ! Objects.equals(previousGold, background) && goldBoundary;
      boolean guessEnded = ! Objects.equals(previousGuess, background) && guessBoundary;

      if (goldEnded) {
        if (guessEnded) {
          if (entityCorrect) {
            entityTP.incrementCount(previousGoldEntity);
          } else {
            // same span but wrong label
            entityFN.incrementCount(previousGoldEntity);
            entityFP.incrementCount(previousGuessEntity);
          }
          entityCorrect = goldEntity.equals(guessEntity);
        } else {
          entityFN.incrementCount(previousGoldEntity);
          entityCorrect = gold.equals(background) && guess.equals(background);
        }
      } else if (guessEnded) {
        entityCorrect = false;
        entityFP.incrementCount(previousGuessEntity);
      }
      // nothing to do if neither gold nor guess have ended (a category change signals an end)

      if (newGold) {
        if (newGuess) {
          entityCorrect = guessEntity.equals(goldEntity);
        } else {
          entityCorrect = false;
        }
      } else if (newGuess) {
        entityCorrect = false;
      }

      previousGold = gold;
      previousGuess = guess;
      previousGoldEntity = goldEntity;
      previousGuessEntity = guessEntity;
      previousGoldPrefix = goldPrefix;
      previousGuessPrefix = guessPrefix;
    }

    // At the end, we need to check the last entity
    if ( ! Objects.equals(previousGold, background)) {
      if (entityCorrect) {
        entityTP.incrementCount(previousGoldEntity);
      } else {
        entityFN.incrementCount(previousGoldEntity);
      }
    }
    if ( ! Objects.equals(previousGuess, background)) {
      if ( ! entityCorrect) {
        entityFP.incrementCount(previousGuessEntity);
      }
    }

    return true;
  }

  /** Converts entity representation of a file.
   *  Not yet implemented: this currently does nothing, whatever the arguments.
   */
  public static void main(String[] args) {
    // todo!
  }

}