package edu.stanford.nlp.loglinear.benchmarks; import edu.stanford.nlp.loglinear.model.ConcatVector; import edu.stanford.nlp.loglinear.model.ConcatVectorNamespace; import edu.stanford.nlp.loglinear.model.GraphicalModel; import java.util.List; import java.util.Map; /** * Created on 10/23/15. * @author keenon * <p> * This is a useful class for turning lists of tokens into a massively annotated PGM :) */ public class CoNLLFeaturizer { private static String getWordShape(String string) { if (string.toUpperCase().equals(string) && string.toLowerCase().equals(string)) return "no-case"; if (string.toUpperCase().equals(string)) return "upper-case"; if (string.toLowerCase().equals(string)) return "lower-case"; if (string.length() > 1 && Character.isUpperCase(string.charAt(0)) && string.substring(1).toLowerCase().equals(string.substring(1))) return "capitalized"; return "mixed-case"; } public static void annotate(GraphicalModel model, List<String> tags, ConcatVectorNamespace namespace, Map<String, double[]> embeddings) { for (int i = 0; i < model.variableMetaData.size(); i++) { Map<String, String> metadata = model.getVariableMetaDataByReference(i); String token = metadata.get("TOKEN"); String pos = metadata.get("POS"); String chunk = metadata.get("CHUNK"); Map<String, String> leftMetadata = null; if (i > 0) leftMetadata = model.getVariableMetaDataByReference(i - 1); String leftToken = (leftMetadata == null) ? "^" : leftMetadata.get("TOKEN"); String leftPos = (leftMetadata == null) ? "^" : leftMetadata.get("POS"); String leftChunk = (leftMetadata == null) ? "^" : leftMetadata.get("CHUNK"); Map<String, String> rightMetadata = null; if (i < model.variableMetaData.size() - 1) rightMetadata = model.getVariableMetaDataByReference(i + 1); String rightToken = (rightMetadata == null) ? "$" : rightMetadata.get("TOKEN"); String rightPos = (rightMetadata == null) ? "$" : rightMetadata.get("POS"); String rightChunk = (rightMetadata == null) ? "$" : rightMetadata.get("CHUNK"); // Add the unary factor GraphicalModel.Factor f = model.addFactor(new int[]{i}, new int[]{tags.size()}, (assignment) -> { // This is the anonymous function that generates a feature vector for each assignment to the unary // factor String tag = tags.get(assignment[0]); ConcatVector features = namespace.newVector(); namespace.setDenseFeature(features, "BIAS" + tag, new double[]{1.0}); namespace.setSparseFeature(features, "word" + tag, token, 1.0); if (embeddings != null && embeddings.containsKey(token)) { namespace.setDenseFeature(features, "embedding" + tag, embeddings.get(token)); } if (token.length() > 1) { namespace.setSparseFeature(features, "prefix1" + tag, token.substring(0, 1), 1.0); } if (token.length() > 2) { namespace.setSparseFeature(features, "prefix2" + tag, token.substring(0, 2), 1.0); } if (token.length() > 3) { namespace.setSparseFeature(features, "prefix3" + tag, token.substring(0, 3), 1.0); } if (token.length() > 1) { namespace.setSparseFeature(features, "suffix1" + tag, token.substring(token.length() - 1), 1.0); } if (token.length() > 2) { namespace.setSparseFeature(features, "suffix2" + tag, token.substring(token.length() - 2), 1.0); } if (token.length() > 3) { namespace.setSparseFeature(features, "suffix3" + tag, token.substring(token.length() - 3), 1.0); } namespace.setSparseFeature(features, "shape" + tag, getWordShape(token), 1.0); namespace.setSparseFeature(features, "pos" + tag, pos, 1.0); namespace.setSparseFeature(features, "chunk" + tag, chunk, 1.0); return features; }); assert (f.neigborIndices.length == 1); assert (f.neigborIndices[0] == i); // If this is not the last variable, add a binary factor if (i < model.variableMetaData.size() - 1) { GraphicalModel.Factor jf = model.addFactor(new int[]{i, i + 1}, new int[]{tags.size(), tags.size()}, (assignment) -> { // This is the anonymous function that generates a feature vector for every joint assignment to the // binary factor String thisTag = tags.get(assignment[0]); String nextTag = tags.get(assignment[1]); ConcatVector features = namespace.newVector(); namespace.setDenseFeature(features, "BIAS" + thisTag + nextTag, new double[]{1.0}); return features; }); assert (jf.neigborIndices.length == 2); assert (jf.neigborIndices[0] == i); assert (jf.neigborIndices[1] == i + 1); } } } }