package edu.stanford.nlp.tagger.util; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.ling.SentenceUtils; import edu.stanford.nlp.ling.TaggedWord; import edu.stanford.nlp.tagger.maxent.TaggerConfig; import edu.stanford.nlp.tagger.io.TaggedFileRecord; import edu.stanford.nlp.util.PropertiesUtils; import edu.stanford.nlp.util.StringUtils; import java.util.List; import java.util.Properties; import java.util.Random; /** * Takes a tagger data file of any format readable by the tagger and * outputs a new file containing tagged sentences which are prefixes * of the original data. The prefixes are of random length. If the * -fullSentence parameter is true, the original sentence is output * after each prefix. * <br> * Input is taken from the tagger file described in "input". Output * goes to stdout. * * @author John Bauer */ public class MakePrefixFile { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(MakePrefixFile.class); public static void main(String[] args) { Properties config = StringUtils.argsToProperties(args); log.info(config); boolean fullSentence = PropertiesUtils.getBool(config, "fullSentence", false); Random random = new Random(); String tagSeparator = config.getProperty("tagSeparator", TaggerConfig.TAG_SEPARATOR); TaggedFileRecord record = TaggedFileRecord.createRecord(config, config.getProperty("input")); for (List<TaggedWord> sentence : record.reader()) { int len = random.nextInt(sentence.size()) + 1; System.out.println(SentenceUtils.listToString(sentence.subList(0, len), false, tagSeparator)); if (fullSentence) { System.out.println(SentenceUtils.listToString(sentence, false, tagSeparator)); } } } }