package edu.stanford.nlp.tagger.util; import edu.stanford.nlp.util.logging.Redwood; import java.io.BufferedWriter; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.List; import java.util.Properties; import edu.stanford.nlp.ling.SentenceUtils; import edu.stanford.nlp.ling.TaggedWord; import edu.stanford.nlp.tagger.io.TaggedFileRecord; /** * A short utility program that dumps out trees from multiple files * into one file of tagged text. Useful for combining many parse tree * training files into one tagger training file, since the tagger * doesn't have convenient ways of reading in an entire directory. * <br> * There are a few command line arguments available: * <table> * <tr> * <td> -output <filename> </td> * <td> File to output the data to </td> * </tr> * <tr> * <td> -tagSeparator <separator> </td> * <td> Separator to use between word and tag </td> * </tr> * <tr> * <td> -treeRange <range> </td> * <td> If tree files have numbers, they will be filtered out if not * in this range. Can be null. </td> * </tr> * <tr> * <td> -inputEncoding <encoding> </td> * <td> Encoding to use when reading tree files </td> * </tr> * <tr> * <td> -outputEncoding <encoding> </td> * <td> Encoding to use when writing tags </td> * </tr> * <tr> * <td> -treeFilter <classname> </td> * <td> A Filter<Tree> to load by reflection which eliminates * trees from the data read </td> * </tr> * <tr> * <td> -noTags </td> * <td> If present, will only output the words, no tags at all * </tr> * <tr> * <td> -noSpaces </td> * <td> If present, words will be concatenated together </td> * </tr> * </table> * * All other arguments will be treated as filenames to read. * * @author John Bauer */ public class ConvertTreesToTags { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(ConvertTreesToTags.class); private ConvertTreesToTags() {}; // main method only public static void main(String[] args) throws IOException { String outputFilename = ""; String tagSeparator = ""; String treeRange = ""; String inputEncoding = "UTF-8"; String outputEncoding = "UTF-8"; String treeFilter = ""; boolean noTags = false; boolean noSpaces = false; List<String> inputFilenames = new ArrayList<>(); for (int i = 0; i < args.length; ++i) { if ((args[i].equalsIgnoreCase("-output") || args[i].equalsIgnoreCase("--output")) && (i + 1 < args.length)) { outputFilename = args[i + 1]; i++; } else if ((args[i].equalsIgnoreCase("-tagSeparator") || args[i].equalsIgnoreCase("--tagSeparator")) && (i + 1 < args.length)) { tagSeparator = args[i + 1]; i++; } else if ((args[i].equalsIgnoreCase("-treeRange") || args[i].equalsIgnoreCase("--treeRange")) && (i + 1 < args.length)) { treeRange = args[i + 1]; i++; } else if ((args[i].equalsIgnoreCase("-inputEncoding") || args[i].equalsIgnoreCase("--inputEncoding")) && (i + 1 < args.length)) { inputEncoding = args[i + 1]; i++; } else if ((args[i].equalsIgnoreCase("-outputEncoding") || args[i].equalsIgnoreCase("--outputEncoding")) && (i + 1 < args.length)) { outputEncoding = args[i + 1]; i++; } else if ((args[i].equalsIgnoreCase("-treeFilter") || args[i].equalsIgnoreCase("--treeFilter")) && (i + 1< args.length)) { treeFilter = args[i + 1]; i++; } else if (args[i].equalsIgnoreCase("-noTags") || args[i].equalsIgnoreCase("--noTags")) { noTags = true; } else if (args[i].equalsIgnoreCase("-noSpaces") || args[i].equalsIgnoreCase("--noSpaces")) { noSpaces = true; } else { inputFilenames.add(args[i]); } } if (outputFilename.equals("")) { log.info("Must specify an output filename, -output"); System.exit(2); } if (inputFilenames.size() == 0) { log.info("Must specify one or more input filenames"); System.exit(2); } FileOutputStream fos = new FileOutputStream(outputFilename); OutputStreamWriter osw = new OutputStreamWriter(fos, outputEncoding); BufferedWriter bout = new BufferedWriter(osw); Properties props = new Properties(); for (String filename : inputFilenames) { String description = (TaggedFileRecord.FORMAT + "=" + TaggedFileRecord.Format.TREES + "," + filename); if (!treeRange.equals("")) { description = (TaggedFileRecord.TREE_RANGE + "=" + treeRange + "," + description); } if (!treeFilter.equals("")) { description = (TaggedFileRecord.TREE_FILTER + "=" + treeFilter + "," + description); } description = (TaggedFileRecord.ENCODING + "=" + inputEncoding + "," + description); TaggedFileRecord record = TaggedFileRecord.createRecord(props, description); for (List<TaggedWord> sentence : record.reader()) { String output = SentenceUtils.listToString(sentence, noTags, tagSeparator); if (noSpaces) { output = output.replaceAll(" ", ""); } bout.write(output); bout.newLine(); } } bout.flush(); bout.close(); osw.close(); fos.close(); } }