package edu.stanford.nlp.ie.crf; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.StringUtils; import edu.stanford.nlp.util.logging.Redwood; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Properties; /** * Exports CRF features for use with other programs. * Usage: CRFFeatureExporter -prop crfClassifierPropFile -trainFile inputFile -exportFeatures outputFile * - Output file is automatically gzipped/b2zipped if ending in gz/bz2 * - bzip2 requires that bzip2 is available via command line * - Currently exports features in a format that can be read by a modified crfsgd * (crfsgd assumes features are gzipped) * TODO: Support other formats (like crfsuite) * * @author Angel Chang */ public class CRFFeatureExporter<IN extends CoreMap> { /** A logger for this class */ private static final Redwood.RedwoodChannels log = Redwood.channels(CRFFeatureExporter.class); private char delimiter = '\t'; private static final String eol = System.lineSeparator(); private CRFClassifier<IN> classifier; public CRFFeatureExporter(CRFClassifier<IN> classifier) { this.classifier = classifier; } /** * Prefix features with U- (for unigram) features * or B- (for bigram) features * @param feat String representing the feature * @return new prefixed feature string */ private static String ubPrefixFeatureString(String feat) { if (feat.endsWith("|C")) { return "U-" + feat; } else if (feat.endsWith("|CpC")) { return "B-" + feat; } else { return feat; } } /** * Constructs a big string representing the input list of CoreLabel, * with one line per token using the following format * word label feat1 feat2 ... * (where each space is actually a tab). * Assumes that CoreLabel has both TextAnnotation and AnswerAnnotation. * @param document List of CoreLabel * (does not have to represent a "document", just a sequence of text, * like a sentence or a paragraph) * @return String representation of features */ private String getFeatureString(List<IN> document) { int docSize = document.size(); if (classifier.flags.useReverse) { Collections.reverse(document); } StringBuilder sb = new StringBuilder(); for (int j = 0; j < docSize; j++) { IN token = document.get(j); sb.append(token.get(CoreAnnotations.TextAnnotation.class)); sb.append(delimiter); sb.append(token.get(CoreAnnotations.AnswerAnnotation.class)); CRFDatum<List<String>,CRFLabel> d = classifier.makeDatum(document, j, classifier.featureFactories); List<List<String>> features = d.asFeatures(); for (Collection<String> cliqueFeatures : features) { List<String> sortedFeatures = new ArrayList<>(cliqueFeatures); Collections.sort(sortedFeatures); for (String feat : sortedFeatures) { feat = ubPrefixFeatureString(feat); sb.append(delimiter); sb.append(feat); } } sb.append(eol); } if (classifier.flags.useReverse) { Collections.reverse(document); } return sb.toString(); } /** * Output features that have already been converted into features * (using documentToDataAndLabels) in format suitable for CRFSuite. * Format is with one line per token using the following format * label feat1 feat2 ... * (where each space is actually a tab) * Each document is separated by an empty line. * * @param exportFile file to export the features to * @param docsData array of document features * @param labels correct labels indexed by document, and position within document */ public void printFeatures(String exportFile, int[][][][] docsData, int[][] labels) { try { PrintWriter pw = IOUtils.getPrintWriter(exportFile); for (int i = 0; i < docsData.length; i++) { for (int j = 0; j < docsData[i].length; j++) { StringBuilder sb = new StringBuilder(); int label = labels[i][j]; sb.append(classifier.classIndex.get(label)); for (int k = 0; k < docsData[i][j].length; k++) { for (int m = 0; m < docsData[i][j][k].length; m++) { String feat = classifier.featureIndex.get(docsData[i][j][k][m]); feat = ubPrefixFeatureString(feat); sb.append(delimiter); sb.append(feat); } } pw.println(sb.toString()); } pw.println(); } pw.close(); } catch (IOException ex) { throw new RuntimeException(ex); } } /** * Output features from a collection of documents to a file * Format is with one line per token using the following format * word label feat1 feat2 ... * (where each space is actually a tab) * Each document is separated by an empty line * This format is suitable for modified crfsgd. * * @param exportFile file to export the features to * @param documents input collection of documents */ public void printFeatures(String exportFile, Collection<List<IN>> documents) { try { PrintWriter pw = IOUtils.getPrintWriter(exportFile); for (List<IN> doc:documents) { String str = getFeatureString(doc); pw.println(str); } pw.close(); } catch (IOException ex) { throw new RuntimeException(ex); } } public static void main(String[] args) throws Exception { StringUtils.logInvocationString(log, args); Properties props = StringUtils.argsToProperties(args); CRFClassifier<CoreLabel> crf = new CRFClassifier<>(props); String inputFile = crf.flags.trainFile; if (inputFile == null) { log.info("Please provide input file using -trainFile"); System.exit(-1); } String outputFile = crf.flags.exportFeatures; if (outputFile == null) { log.info("Please provide output file using -exportFeatures"); System.exit(-1); } CRFFeatureExporter<CoreLabel> featureExporter = new CRFFeatureExporter<>(crf); Collection<List<CoreLabel>> docs = crf.makeObjectBankFromFile(inputFile, crf.makeReaderAndWriter()); crf.makeAnswerArraysAndTagIndex(docs); featureExporter.printFeatures(outputFile, docs); } }