LengthDistribution.java example

Explorer
dependency-parsing-toolbox-master
- Source
package edu.stanford.nlp.parser.ensemble.utils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

/**
 * Empirical distribution over sentence lengths, measured from a file that is
 * read with {@code Token.readNextSentCoNLLX}. The distribution can then be
 * used to randomly accept or reject sentences so that the accepted ones
 * favour lengths that were frequent in the measured file.
 *
 * All lengths greater than the configured maximum share a single bucket.
 * Sampling uses a {@link Random} with a fixed seed (0), so repeated runs over
 * the same inputs make the same accept/reject decisions.
 */
public class LengthDistribution {

    /** Training file used by {@link #main(String[])} when none is given. */
    private static final String DEFAULT_TRAINING_FILE =
            "/home/mcclosky/data/mihai-CoNLL08/train.ptb";

    /** Maximum length (overflow bucket) used by {@link #main(String[])}. */
    private static final int DEFAULT_MAX_LENGTH = 80;

    /** Maps a (bucketed) sentence length to the number of sentences seen. */
    private final Map<Integer, Integer> lengths = new HashMap<Integer, Integer>();
    /** Total number of sentences counted across all buckets. */
    private int totalCounts = 0;
    /** Fixed seed keeps the sampling reproducible between runs. */
    private final Random random = new Random(0);
    /** Lengths above this value are all counted in the bucket for this value. */
    private final int _maxLength;

    /**
     * Construct a length distribution from the lengths of sentences in file
     * (CoNLL-X format)
     *
     * @param filename path to filename to measure the counts in
     * @param maxLength we count all lengths that are greater than maxLength in
     * one bucket (for the purposes of smoothing). Recommended value is around
     * 80.
     * @throws IOException if the file cannot be opened or read
     */
    public LengthDistribution(String filename, int maxLength)
            throws IOException {
        _maxLength = maxLength;

        BufferedReader is = FileUtils.openForReading(filename);
        try {
            List<Token> sentence;
            // The reader signals the end of the input by returning null.
            while ((sentence = Token.readNextSentCoNLLX(is)) != null) {
                // Store under the bucketed key: writing under the raw length
                // would leave the overflow bucket empty and scatter long
                // sentences over individual keys.
                int bucket = bucketFor(sentence.size());
                lengths.put(bucket, getCount(bucket) + 1);
                totalCounts += 1;
            }
        } finally {
            is.close();
        }
    }

    /**
     * Maps a raw sentence length to the key it is counted under.
     *
     * @param length raw sentence length
     * @return {@code length}, or the maximum length if {@code length} exceeds it
     */
    private int bucketFor(int length) {
        return length > _maxLength ? _maxLength : length;
    }

    /**
     * Looks up how many sentences were counted for the bucket of a length.
     *
     * @param length raw sentence length (bucketed before the lookup)
     * @return the stored count, or 0 if nothing was counted for that bucket
     */
    private int getCount(int length) {
        Integer count = lengths.get(bucketFor(length));
        return count == null ? 0 : count;
    }

    /**
     * Tells you whether or not to accept a sentence according this length
     * distribution. A sentence is accepted with probability equal to the
     * fraction of counted sentences that fell into the same length bucket.
     *
     * @param length length of the candidate sentence
     * @return {@code true} if the sentence should be kept; always
     * {@code false} when no sentences were counted
     */
    public boolean acceptLength(int length) {
        if (totalCounts == 0) {
            // Empty distribution: the ratio below would be 0/0 (NaN), which
            // never accepts. Make that outcome explicit.
            return false;
        }
        double prob = (double) getCount(length) / totalCounts;
        double sample = random.nextDouble();
        return sample <= prob;
    }

    @Override
    public String toString() {
        return "LengthDistribution [maxLength=" + _maxLength + ", lengths="
                + lengths + "]";
    }

    /**
     * Filters every file in an input directory, writing the accepted
     * sentences to a file of the same name in an output directory.
     *
     * @param args {@code args[0]} input directory, {@code args[1]} output
     * directory (created if missing), optional {@code args[2]} file to
     * measure the length distribution from (defaults to the built-in path)
     * @throws IOException if a file cannot be read or written, or the input
     * directory cannot be listed
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 2) {
            System.err.println("Usage: LengthDistribution <inputDir> <outputDir> [trainingFile]");
            System.exit(1);
        }

        String trainingFile = args.length > 2 ? args[2] : DEFAULT_TRAINING_FILE;
        LengthDistribution lengths = new LengthDistribution(trainingFile,
                DEFAULT_MAX_LENGTH);

        File inputDir = new File(args[0]);
        File outputDir = new File(args[1]);
        outputDir.mkdirs();

        // File.list() returns null when the path is not a readable directory.
        String[] inputFilenames = inputDir.list();
        if (inputFilenames == null) {
            throw new IOException("Cannot list input directory: "
                    + inputDir.getAbsolutePath());
        }

        // Counters are cumulative over all files processed so far.
        int totalSentences = 0;
        int acceptedSentences = 0;
        for (String inputFilename : inputFilenames) {
            File outputFile = new File(outputDir, inputFilename);
            System.out.format("%s -> %s\n", inputFilename, outputFile.getAbsolutePath());

            BufferedReader inputReader = FileUtils.openForReading(new File(
                    inputDir, inputFilename).getAbsolutePath());
            try {
                BufferedWriter outputWriter = FileUtils.openForWriting(outputFile.getAbsolutePath());
                try {
                    List<Token> sentence;
                    while ((sentence = Token.readNextSentCoNLLX(inputReader)) != null) {
                        totalSentences++;
                        int length = sentence.size();
                        if (lengths.acceptLength(length)) {
                            acceptedSentences++;
                            Token.writeSentCoNLLX(sentence, outputWriter);
                        }
                    }
                } finally {
                    outputWriter.close();
                }
            } finally {
                inputReader.close();
            }

            // Avoid reporting NaN when nothing has been read yet.
            double acceptedPercent = totalSentences == 0 ? 0.0
                    : (double) acceptedSentences / totalSentences * 100;
            System.out.format("accepted: %.1f%% (%s of %s)\n",
                    acceptedPercent, acceptedSentences, totalSentences);
        }
    }
}