CRFFeatureExporter.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.ie.crf;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Properties;

/**
 * Exports CRF features for use with other programs.
 * Usage: CRFFeatureExporter -prop crfClassifierPropFile -trainFile inputFile -exportFeatures outputFile
 * - Output file is automatically gzipped/bzip2-compressed if ending in gz/bz2
 * - bzip2 requires that bzip2 is available via command line
 * - Currently exports features in a format that can be read by a modified crfsgd
 *   (crfsgd assumes features are gzipped)
 * TODO: Support other formats (like crfsuite)
 *
 * @author Angel Chang
 */
public class CRFFeatureExporter<IN extends CoreMap>  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(CRFFeatureExporter.class);
  /** Separator written between the columns of an output line. */
  private static final char DELIMITER = '\t';
  /** Line terminator appended after each token line built by {@link #getFeatureString(List)}. */
  private static final String eol = System.lineSeparator();
  /** Classifier that supplies the flags, feature factories and indices used for export. */
  private final CRFClassifier<IN> classifier;

  /**
   * Creates an exporter that reads flags, feature factories and indices from the given classifier.
   *
   * @param classifier classifier used to generate and look up features
   */
  public CRFFeatureExporter(CRFClassifier<IN> classifier)
  {
    this.classifier = classifier;
  }

  /**
   * Prefixes a feature string according to its suffix:
   * features ending in "|C" get the prefix "U-" (unigram),
   * features ending in "|CpC" get the prefix "B-" (bigram),
   * and all other features are returned unchanged.
   *
   * @param feat String representing the feature
   * @return new prefixed feature string
   */
  private static String ubPrefixFeatureString(String feat)
  {
    if (feat.endsWith("|C")) {
      return "U-" + feat;
    } else if (feat.endsWith("|CpC")) {
      return "B-" + feat;
    } else {
      return feat;
    }
  }

  /**
   * Constructs a big string representing the input list of CoreLabel,
   *  with one line per token using the following format
   * word label feat1 feat2 ...
   *  (where each space is actually a tab).
   * Assumes that CoreLabel has both TextAnnotation and AnswerAnnotation.
   * If the classifier's useReverse flag is set, the document is reversed in place
   * while the features are generated and restored to its original order afterwards,
   * even if feature generation throws.
   *
   * @param document List of CoreLabel
   *        (does not have to represent a "document", just a sequence of text,
   *         like a sentence or a paragraph)
   * @return String representation of features
   */
  private String getFeatureString(List<IN> document) {
    int docSize = document.size();
    // Read the flag once so that the reverse and the restore below are always paired.
    final boolean reversed = classifier.flags.useReverse;
    if (reversed) {
      Collections.reverse(document);
    }

    StringBuilder sb = new StringBuilder();
    try {
      for (int j = 0; j < docSize; j++) {
        IN token = document.get(j);
        sb.append(token.get(CoreAnnotations.TextAnnotation.class));
        sb.append(DELIMITER);
        sb.append(token.get(CoreAnnotations.AnswerAnnotation.class));

        CRFDatum<List<String>,CRFLabel> d = classifier.makeDatum(document, j, classifier.featureFactories);

        List<List<String>> features = d.asFeatures();
        for (Collection<String> cliqueFeatures : features) {
          // Sort a copy so the output order is deterministic within each clique.
          List<String> sortedFeatures = new ArrayList<>(cliqueFeatures);
          Collections.sort(sortedFeatures);
          for (String feat : sortedFeatures) {
            sb.append(DELIMITER);
            sb.append(ubPrefixFeatureString(feat));
          }
        }
        sb.append(eol);
      }
    } finally {
      // Always hand the caller's list back in its original order.
      if (reversed) {
        Collections.reverse(document);
      }
    }
    return sb.toString();
  }

  /**
   * Output features that have already been converted into features
   *  (using documentToDataAndLabels) in format suitable for CRFSuite.
   * Format is with one line per token using the following format
   * label feat1 feat2 ...
   *  (where each space is actually a tab)
   * Each document is separated by an empty line.
   *
   * @param exportFile file to export the features to
   * @param docsData array of document features
   * @param labels correct labels indexed by document, and position within document
   * @throws RuntimeException wrapping the IOException if the file cannot be opened or written
   */
  public void printFeatures(String exportFile, int[][][][] docsData, int[][] labels)  {
    // try-with-resources closes the writer even if an unchecked exception escapes the loops.
    try (PrintWriter pw = IOUtils.getPrintWriter(exportFile)) {
      for (int i = 0; i < docsData.length; i++) {
        for (int j = 0; j < docsData[i].length; j++) {
          StringBuilder sb = new StringBuilder();
          int label = labels[i][j];
          sb.append(classifier.classIndex.get(label));
          for (int k = 0; k < docsData[i][j].length; k++) {
            for (int m = 0; m < docsData[i][j][k].length; m++) {
              String feat = classifier.featureIndex.get(docsData[i][j][k][m]);
              sb.append(DELIMITER);
              sb.append(ubPrefixFeatureString(feat));
            }
          }
          pw.println(sb.toString());
        }
        // Empty line marks the end of a document.
        pw.println();
      }
      // PrintWriter swallows write errors; surface them instead of leaving a truncated file.
      if (pw.checkError()) {
        throw new IOException("Error writing features to " + exportFile);
      }
    } catch (IOException ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * Output features from a collection of documents to a file
   * Format is with one line per token using the following format
   * word label feat1 feat2 ...
   *  (where each space is actually a tab)
   * Each document is separated by an empty line
   * This format is suitable for modified crfsgd.
   *
   * @param exportFile file to export the features to
   * @param documents input collection of documents
   * @throws RuntimeException wrapping the IOException if the file cannot be opened or written
   */
  public void printFeatures(String exportFile, Collection<List<IN>> documents) {
    // try-with-resources closes the writer even if feature generation throws.
    try (PrintWriter pw = IOUtils.getPrintWriter(exportFile)) {
      for (List<IN> doc : documents) {
        // The feature string already ends with a line terminator, so println
        // adds the empty line that separates documents.
        pw.println(getFeatureString(doc));
      }
      // PrintWriter swallows write errors; surface them instead of leaving a truncated file.
      if (pw.checkError()) {
        throw new IOException("Error writing features to " + exportFile);
      }
    } catch (IOException ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * Command-line entry point: builds a CRFClassifier from the given arguments,
   * reads the documents named by -trainFile and writes their features to the
   * file named by -exportFeatures. Exits with status -1 if either option is missing.
   *
   * @param args command-line arguments, converted to Properties
   * @throws Exception if the classifier cannot be created or the input cannot be read
   */
  public static void main(String[] args) throws Exception {
    StringUtils.logInvocationString(log, args);
    Properties props = StringUtils.argsToProperties(args);
    CRFClassifier<CoreLabel> crf = new CRFClassifier<>(props);
    String inputFile = crf.flags.trainFile;
    if (inputFile == null) {
      log.info("Please provide input file using -trainFile");
      System.exit(-1);
    }
    String outputFile = crf.flags.exportFeatures;
    if (outputFile == null) {
      log.info("Please provide output file using -exportFeatures");
      System.exit(-1);
    }
    CRFFeatureExporter<CoreLabel> featureExporter = new CRFFeatureExporter<>(crf);
    Collection<List<CoreLabel>> docs =
      crf.makeObjectBankFromFile(inputFile, crf.makeReaderAndWriter());
    crf.makeAnswerArraysAndTagIndex(docs);
    featureExporter.printFeatures(outputFile, docs);
  }

}