// ConvertTreesToTags.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.tagger.util; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.tagger.io.TaggedFileRecord;

/**
 * A short utility program that dumps out trees from multiple files
 * into one file of tagged text.  Useful for combining many parse tree
 * training files into one tagger training file, since the tagger
 * doesn't have convenient ways of reading in an entire directory.
 * <br>
 * There are a few command line arguments available.  Each option is
 * matched case-insensitively and may be written with one or two
 * leading dashes (for example {@code -output} or {@code --output}).
 * <table>
 * <tr>
 * <td> -output &lt;filename&gt; </td>
 * <td> File to output the data to.  Required. </td>
 * </tr>
 * <tr>
 * <td> -tagSeparator &lt;separator&gt; </td>
 * <td> Separator to use between word and tag </td>
 * </tr>
 * <tr>
 * <td> -treeRange &lt;range&gt; </td>
 * <td> If tree files have numbers, they will be filtered out if not
 *      in this range.  Optional; when omitted no range is passed on. </td>
 * </tr>
 * <tr>
 * <td> -inputEncoding &lt;encoding&gt; </td>
 * <td> Encoding to use when reading tree files (default UTF-8) </td>
 * </tr>
 * <tr>
 * <td> -outputEncoding &lt;encoding&gt; </td>
 * <td> Encoding to use when writing tags (default UTF-8) </td>
 * </tr>
 * <tr>
 * <td> -treeFilter &lt;classname&gt; </td>
 * <td> A {@code Filter<Tree>} to load by reflection which eliminates
 *      trees from the data read </td>
 * </tr>
 * <tr>
 * <td> -noTags </td>
 * <td> If present, will only output the words, no tags at all </td>
 * </tr>
 * <tr>
 * <td> -noSpaces </td>
 * <td> If present, words will be concatenated together </td>
 * </tr>
 * </table>
 *
 * All other arguments will be treated as filenames to read.
 *
 * @author John Bauer
 */
public class ConvertTreesToTags  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ConvertTreesToTags.class);

  private ConvertTreesToTags() {} // main method only

  /**
   * Parses the command line, reads every input tree file, and writes one
   * line per sentence to the output file.
   * <br>
   * Exits with status 2 if no output filename or no input filenames
   * were given.
   *
   * @param args command line arguments, as described in the class comment
   * @throws IOException if the output file cannot be opened or written,
   *         or if the output encoding is not supported
   */
  public static void main(String[] args) throws IOException {
    String outputFilename = "";
    String tagSeparator = "";
    String treeRange = "";
    String inputEncoding = "UTF-8";
    String outputEncoding = "UTF-8";
    String treeFilter = "";
    boolean noTags = false;
    boolean noSpaces = false;
    List<String> inputFilenames = new ArrayList<>();

    for (int i = 0; i < args.length; ++i) {
      String arg = args[i];
      // Options that take a value are only recognized when a value
      // follows; otherwise the argument falls through to the final
      // branch and is treated as an input filename.
      boolean hasValue = (i + 1 < args.length);
      if (isFlag(arg, "output") && hasValue) {
        outputFilename = args[++i];
      } else if (isFlag(arg, "tagSeparator") && hasValue) {
        tagSeparator = args[++i];
      } else if (isFlag(arg, "treeRange") && hasValue) {
        treeRange = args[++i];
      } else if (isFlag(arg, "inputEncoding") && hasValue) {
        inputEncoding = args[++i];
      } else if (isFlag(arg, "outputEncoding") && hasValue) {
        outputEncoding = args[++i];
      } else if (isFlag(arg, "treeFilter") && hasValue) {
        treeFilter = args[++i];
      } else if (isFlag(arg, "noTags")) {
        noTags = true;
      } else if (isFlag(arg, "noSpaces")) {
        noSpaces = true;
      } else {
        inputFilenames.add(arg);
      }
    }

    if (outputFilename.isEmpty()) {
      log.info("Must specify an output filename, -output");
      System.exit(2);
    }
    if (inputFilenames.isEmpty()) {
      log.info("Must specify one or more input filenames");
      System.exit(2);
    }

    Properties props = new Properties();
    // try-with-resources guarantees the output file is flushed and closed
    // even if reading a tree file or writing a sentence throws.  Each
    // layer is declared separately so the underlying stream is still
    // closed if constructing a wrapper fails (e.g. unsupported encoding).
    try (FileOutputStream fos = new FileOutputStream(outputFilename);
         OutputStreamWriter osw = new OutputStreamWriter(fos, outputEncoding);
         BufferedWriter bout = new BufferedWriter(osw)) {
      for (String filename : inputFilenames) {
        // Build the comma-separated record description.  Optional
        // settings are prepended, so the filename always comes last.
        String description = (TaggedFileRecord.FORMAT + "=" +
                              TaggedFileRecord.Format.TREES + "," + filename);
        if (!treeRange.isEmpty()) {
          description = (TaggedFileRecord.TREE_RANGE + "=" + treeRange +
                         "," + description);
        }
        if (!treeFilter.isEmpty()) {
          description = (TaggedFileRecord.TREE_FILTER + "=" + treeFilter +
                         "," + description);
        }
        description = (TaggedFileRecord.ENCODING + "=" + inputEncoding +
                       "," + description);
        TaggedFileRecord record =
          TaggedFileRecord.createRecord(props, description);
        for (List<TaggedWord> sentence : record.reader()) {
          String output = SentenceUtils.listToString(sentence, noTags, tagSeparator);
          if (noSpaces) {
            // Literal replacement; no regular expression is needed.
            output = output.replace(" ", "");
          }
          bout.write(output);
          bout.newLine();
        }
      }
    }
  }

  /**
   * Returns true if {@code arg} is the option {@code name} written with
   * either one or two leading dashes, ignoring case.
   */
  private static boolean isFlag(String arg, String name) {
    return arg.equalsIgnoreCase("-" + name) || arg.equalsIgnoreCase("--" + name);
  }
}