package edu.stanford.nlp.parser.ensemble.utils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

/**
 * Empirical distribution over sentence lengths, measured on a reference file
 * and then used to randomly accept or reject sentences by their length.
 *
 * <p>A sentence of length {@code n} is accepted with probability equal to the
 * fraction of reference sentences that fall in the same length bucket as
 * {@code n}. The random generator is seeded with a constant, so the sequence
 * of decisions is reproducible from run to run.
 */
public class LengthDistribution {

  /** Reference corpus used by {@link #main(String[])} when none is given. */
  private static final String DEFAULT_REFERENCE_FILE =
      "/home/mcclosky/data/mihai-CoNLL08/train.ptb";

  /** Bucket (clamped length) to number of reference sentences in it. */
  private final Map<Integer, Integer> lengths = new HashMap<Integer, Integer>();

  /** Total number of reference sentences counted. */
  private int totalCounts = 0;

  /** Fixed seed keeps the accept/reject sequence deterministic. */
  private final Random random = new Random(0);

  /** Lengths at or above this value share a single bucket. */
  private final int _maxLength;

  /**
   * Construct a length distribution from the lengths of sentences in a file
   * (CoNLL-X format).
   *
   * @param filename path of the file to measure the counts in
   * @param maxLength all lengths that are greater than maxLength are counted
   *          in one bucket, together with maxLength itself (for the purposes
   *          of smoothing). Recommended value is around 80.
   * @throws IOException if the file cannot be opened, read or closed
   */
  public LengthDistribution(String filename, int maxLength) throws IOException {
    _maxLength = maxLength;
    BufferedReader is = FileUtils.openForReading(filename);
    try {
      List<Token> sentence;
      while ((sentence = Token.readNextSentCoNLLX(is)) != null) {
        // Read and write through the SAME clamped key; otherwise sentences
        // longer than maxLength never reach the shared overflow bucket.
        int bucket = toBucket(sentence.size());
        lengths.put(bucket, getCount(bucket) + 1);
        totalCounts += 1;
      }
    } finally {
      is.close();
    }
  }

  /** Maps a sentence length to its bucket: lengths above maxLength collapse to maxLength. */
  private int toBucket(int length) {
    return length > _maxLength ? _maxLength : length;
  }

  /** Returns the number of reference sentences in the bucket of {@code length} (0 if none). */
  private int getCount(int length) {
    Integer count = lengths.get(toBucket(length));
    return count == null ? 0 : count;
  }

  /**
   * Tells you whether or not to accept a sentence according to this length
   * distribution.
   *
   * @param length length of the candidate sentence
   * @return true with probability equal to the relative frequency of the
   *         sentence's length bucket in the reference file; always false for
   *         a bucket that was never observed, or when no reference sentences
   *         were counted at all
   */
  public boolean acceptLength(int length) {
    // With totalCounts == 0 this is 0.0 / 0 = NaN, and every comparison
    // against NaN is false, so nothing is accepted.
    double prob = (double) getCount(length) / totalCounts;
    double sample = random.nextDouble();
    // Strict comparison: sample lies in [0, 1), so prob == 0 never accepts
    // and prob == 1 always accepts.
    return sample < prob;
  }

  @Override
  public String toString() {
    return "LengthDistribution [maxLength=" + _maxLength + ", lengths="
        + lengths + "]";
  }

  /**
   * Subsamples every file of an input directory so that the kept sentences
   * follow the length distribution of a reference file, writing the result
   * under the same file name in an output directory.
   *
   * @param args {@code args[0]} input directory, {@code args[1]} output
   *          directory (created if missing), optional {@code args[2]}
   *          reference file in CoNLL-X format (defaults to the original
   *          training file path)
   * @throws IOException if a file cannot be read or written, or the input
   *           directory cannot be listed
   * @throws IllegalArgumentException if fewer than two arguments are given
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 2) {
      throw new IllegalArgumentException(
          "usage: LengthDistribution <inputDir> <outputDir> [referenceFile]");
    }
    String referenceFile = args.length > 2 ? args[2] : DEFAULT_REFERENCE_FILE;
    LengthDistribution lengths = new LengthDistribution(referenceFile, 80);

    File inputDir = new File(args[0]);
    File outputDir = new File(args[1]);
    outputDir.mkdirs();

    // File.list() returns null when the path is not a readable directory.
    String[] inputFilenames = inputDir.list();
    if (inputFilenames == null) {
      throw new IOException("Cannot list input directory: "
          + inputDir.getAbsolutePath());
    }

    // Both counters accumulate across files, so each report is a running total.
    int totalSentences = 0;
    int acceptedSentences = 0;
    for (String inputFilename : inputFilenames) {
      File outputFile = new File(outputDir, inputFilename);
      System.out.format("%s -> %s\n", inputFilename,
          outputFile.getAbsolutePath());

      BufferedReader inputReader = FileUtils.openForReading(new File(
          inputDir, inputFilename).getAbsolutePath());
      try {
        BufferedWriter outputWriter =
            FileUtils.openForWriting(outputFile.getAbsolutePath());
        try {
          List<Token> sentence;
          while ((sentence = Token.readNextSentCoNLLX(inputReader)) != null) {
            totalSentences++;
            if (lengths.acceptLength(sentence.size())) {
              acceptedSentences++;
              Token.writeSentCoNLLX(sentence, outputWriter);
            }
          }
        } finally {
          outputWriter.close();
        }
      } finally {
        inputReader.close();
      }

      System.out.format("accepted: %.1f%% (%s of %s)\n",
          (double) acceptedSentences / totalSentences * 100,
          acceptedSentences, totalSentences);
    }
  }
}