ZipfDistribution.java example

Explorer
geogebra-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math3.distribution;

import org.apache.commons.math3.exception.NotStrictlyPositiveException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.random.Well19937c;

/**
 * Implementation of the Zipf distribution.
 * <p>
 * <strong>Parameters:</strong>
 * For a random variable {@code X} whose values are distributed according to this
 * distribution, the probability mass function is given by
 * <pre>
 *   P(X = k) = H(N,s) * 1 / k^s    for {@code k = 1,2,...,N}.
 * </pre>
 * {@code H(N,s)} is the normalizing constant
 * which corresponds to the generalized harmonic number of order N of s.
 * <p>
 * <ul>
 * <li>{@code N} is the number of elements</li>
 * <li>{@code s} is the exponent</li>
 * </ul>
 * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf's law (Wikipedia)</a>
 * @see <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">Generalized harmonic numbers</a>
 */
public class ZipfDistribution extends AbstractIntegerDistribution {
    /** Serializable version identifier. */
    private static final long serialVersionUID = -140627372283420404L;
    /** Number of elements. */
    private final int numberOfElements;
    /** Exponent parameter of the distribution. */
    private final double exponent;
    /** Cached numerical mean */
    private double numericalMean = Double.NaN;
    /** Whether or not the numerical mean has been calculated */
    private boolean numericalMeanIsCalculated = false;
    /** Cached numerical variance */
    private double numericalVariance = Double.NaN;
    /** Whether or not the numerical variance has been calculated */
    private boolean numericalVarianceIsCalculated = false;
    /** The sampler to be used for the sample() method */
    private transient ZipfRejectionInversionSampler sampler;

    /**
     * Create a new Zipf distribution with the given number of elements and
     * exponent.
     * <p>
     * <b>Note:</b> this constructor will implicitly create an instance of
     * {@link Well19937c} as random generator to be used for sampling only (see
     * {@link #sample()} and {@link #sample(int)}). In case no sampling is
     * needed for the created distribution, it is advised to pass {@code null}
     * as random generator via the appropriate constructors to avoid the
     * additional initialisation overhead.
     *
     * @param numberOfElements Number of elements.
     * @param exponent Exponent.
     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
     * or {@code exponent <= 0}.
     */
    public ZipfDistribution(final int numberOfElements, final double exponent) {
        this(new Well19937c(), numberOfElements, exponent);
    }

    /**
     * Creates a Zipf distribution.
     *
     * @param rng Random number generator.
     * @param numberOfElements Number of elements.
     * @param exponent Exponent.
     * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
     * or {@code exponent <= 0}.
     * @since 3.1
     */
    public ZipfDistribution(RandomGenerator rng,
                            int numberOfElements,
                            double exponent)
        throws NotStrictlyPositiveException {
        super(rng);

        if (numberOfElements <= 0) {
            throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION,
                                                   numberOfElements);
        }
        if (exponent <= 0) {
            throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT,
                                                   exponent);
        }

        this.numberOfElements = numberOfElements;
        this.exponent = exponent;
    }

    /**
     * Get the number of elements (e.g. corpus size) for the distribution.
     *
     * @return the number of elements
     */
    public int getNumberOfElements() {
        return numberOfElements;
    }

    /**
     * Get the exponent characterizing the distribution.
     *
     * @return the exponent
     */
    public double getExponent() {
        return exponent;
    }

    /** {@inheritDoc} */
    public double probability(final int x) {
        if (x <= 0 || x > numberOfElements) {
            return 0.0;
        }

        return (1.0 / Math.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent);
    }

    /** {@inheritDoc} */
    @Override
    public double logProbability(int x) {
        if (x <= 0 || x > numberOfElements) {
            return Double.NEGATIVE_INFINITY;
        }

        return -Math.log(x) * exponent - Math.log(generalizedHarmonic(numberOfElements, exponent));
    }

    /** {@inheritDoc} */
    public double cumulativeProbability(final int x) {
        if (x <= 0) {
            return 0.0;
        } else if (x >= numberOfElements) {
            return 1.0;
        }

        return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent);
    }

    /**
     * {@inheritDoc}
     *
     * For number of elements {@code N} and exponent {@code s}, the mean is
     * {@code Hs1 / Hs}, where
     * <ul>
     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
     * </ul>
     */
    public double getNumericalMean() {
        if (!numericalMeanIsCalculated) {
            numericalMean = calculateNumericalMean();
            numericalMeanIsCalculated = true;
        }
        return numericalMean;
    }

    /**
     * Used by {@link #getNumericalMean()}.
     *
     * @return the mean of this distribution
     */
    protected double calculateNumericalMean() {
        final int N = getNumberOfElements();
        final double s = getExponent();

        final double Hs1 = generalizedHarmonic(N, s - 1);
        final double Hs = generalizedHarmonic(N, s);

        return Hs1 / Hs;
    }

    /**
     * {@inheritDoc}
     *
     * For number of elements {@code N} and exponent {@code s}, the mean is
     * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
     * <ul>
     *  <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
     *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
     *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
     * </ul>
     */
    public double getNumericalVariance() {
        if (!numericalVarianceIsCalculated) {
            numericalVariance = calculateNumericalVariance();
            numericalVarianceIsCalculated = true;
        }
        return numericalVariance;
    }

    /**
     * Used by {@link #getNumericalVariance()}.
     *
     * @return the variance of this distribution
     */
    protected double calculateNumericalVariance() {
        final int N = getNumberOfElements();
        final double s = getExponent();

        final double Hs2 = generalizedHarmonic(N, s - 2);
        final double Hs1 = generalizedHarmonic(N, s - 1);
        final double Hs = generalizedHarmonic(N, s);

        return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
    }

    /**
     * Calculates the Nth generalized harmonic number. See
     * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
     * Series</a>.
     *
     * @param n Term in the series to calculate (must be larger than 1)
     * @param m Exponent (special case {@code m = 1} is the harmonic series).
     * @return the n<sup>th</sup> generalized harmonic number.
     */
    private double generalizedHarmonic(final int n, final double m) {
        double value = 0;
        for (int k = n; k > 0; --k) {
            value += 1.0 / Math.pow(k, m);
        }
        return value;
    }

    /**
     * {@inheritDoc}
     *
     * The lower bound of the support is always 1 no matter the parameters.
     *
     * @return lower bound of the support (always 1)
     */
    public int getSupportLowerBound() {
        return 1;
    }

    /**
     * {@inheritDoc}
     *
     * The upper bound of the support is the number of elements.
     *
     * @return upper bound of the support
     */
    public int getSupportUpperBound() {
        return getNumberOfElements();
    }

    /**
     * {@inheritDoc}
     *
     * The support of this distribution is connected.
     *
     * @return {@code true}
     */
    public boolean isSupportConnected() {
        return true;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int sample() {
        if (sampler == null) {
            sampler = new ZipfRejectionInversionSampler(numberOfElements, exponent);
        }
        return sampler.sample(random);
    }

    /**
	 * Utility class implementing a rejection inversion sampling method for a
	 * discrete, bounded Zipf distribution that is based on the method described
	 * in
	 * <p>
	 * Wolfgang Hoermann and Gerhard Derflinger
	 * "Rejection-inversion to generate variates from monotone discrete distributions."
	 * ACM Transactions on Modeling and Computer Simulation (TOMACS) 6.3 (1996):
	 * 169-184.
	 * <p>
	 * The paper describes an algorithm for exponents larger than 1 (Algorithm
	 * ZRI). The original method uses {@code H(x) := (v + x)^(1 - q) / (1 - q)}
	 * as the integral of the hat function. This function is undefined for q =
	 * 1, which is the reason for the limitation of the exponent. If instead the
	 * integral function {@code H(x) := ((v + x)^(1 - q) - 1) / (1 - q)} is
	 * used, for which a meaningful limit exists for q = 1, the method works for
	 * all positive exponents.
	 * <p>
	 * The following implementation uses v := 0 and generates integral numbers
	 * in the range [1, numberOfElements]. This is different to the original
	 * method where v is defined to be positive and numbers are taken from [0,
	 * i_max]. This explains why the implementation looks slightly different.
	 *
	 * @since 3.6
	 */
    static final class ZipfRejectionInversionSampler {

        /** Exponent parameter of the distribution. */
        private final double exponent;
        /** Number of elements. */
        private final int numberOfElements;
        /** Constant equal to {@code hIntegral(1.5) - 1}. */
        private final double hIntegralX1;
        /** Constant equal to {@code hIntegral(numberOfElements + 0.5)}. */
        private final double hIntegralNumberOfElements;
        /** Constant equal to {@code 2 - hIntegralInverse(hIntegral(2.5) - h(2)}. */
        private final double s;

        /** Simple constructor.
         * @param numberOfElements number of elements
         * @param exponent exponent parameter of the distribution
         */
        ZipfRejectionInversionSampler(final int numberOfElements, final double exponent) {
            this.exponent = exponent;
            this.numberOfElements = numberOfElements;
            this.hIntegralX1 = hIntegral(1.5) - 1d;
            this.hIntegralNumberOfElements = hIntegral(numberOfElements + 0.5);
            this.s = 2d - hIntegralInverse(hIntegral(2.5) - h(2));
        }

        /** Generate one integral number in the range [1, numberOfElements].
         * @param random random generator to use
         * @return generated integral number in the range [1, numberOfElements]
         */
        int sample(final RandomGenerator random) {
            while(true) {

                final double u = hIntegralNumberOfElements + random.nextDouble() * (hIntegralX1 - hIntegralNumberOfElements);
                // u is uniformly distributed in (hIntegralX1, hIntegralNumberOfElements]

                double x = hIntegralInverse(u);

                int k = (int)(x + 0.5);

                // Limit k to the range [1, numberOfElements]
                // (k could be outside due to numerical inaccuracies)
                if (k < 1) {
                    k = 1;
                }
                else if (k > numberOfElements) {
                    k = numberOfElements;
                }

                // Here, the distribution of k is given by:
                //
                //   P(k = 1) = C * (hIntegral(1.5) - hIntegralX1) = C
                //   P(k = m) = C * (hIntegral(m + 1/2) - hIntegral(m - 1/2)) for m >= 2
                //
                //   where C := 1 / (hIntegralNumberOfElements - hIntegralX1)

                if (k - x <= s || u >= hIntegral(k + 0.5) - h(k)) {

                    // Case k = 1:
                    //
                    //   The right inequality is always true, because replacing k by 1 gives
                    //   u >= hIntegral(1.5) - h(1) = hIntegralX1 and u is taken from
                    //   (hIntegralX1, hIntegralNumberOfElements].
                    //
                    //   Therefore, the acceptance rate for k = 1 is P(accepted | k = 1) = 1
                    //   and the probability that 1 is returned as random value is
                    //   P(k = 1 and accepted) = P(accepted | k = 1) * P(k = 1) = C = C / 1^exponent
                    //
                    // Case k >= 2:
                    //
                    //   The left inequality (k - x <= s) is just a short cut
                    //   to avoid the more expensive evaluation of the right inequality
                    //   (u >= hIntegral(k + 0.5) - h(k)) in many cases.
                    //
                    //   If the left inequality is true, the right inequality is also true:
                    //     Theorem 2 in the paper is valid for all positive exponents, because
                    //     the requirements h'(x) = -exponent/x^(exponent + 1) < 0 and
                    //     (-1/hInverse'(x))'' = (1+1/exponent) * x^(1/exponent-1) >= 0
                    //     are both fulfilled.
                    //     Therefore, f(x) := x - hIntegralInverse(hIntegral(x + 0.5) - h(x))
                    //     is a non-decreasing function. If k - x <= s holds,
                    //     k - x <= s + f(k) - f(2) is obviously also true which is equivalent to
                    //     -x <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)),
                    //     -hIntegralInverse(u) <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)),
                    //     and finally u >= hIntegral(k + 0.5) - h(k).
                    //
                    //   Hence, the right inequality determines the acceptance rate:
                    //   P(accepted | k = m) = h(m) / (hIntegrated(m+1/2) - hIntegrated(m-1/2))
                    //   The probability that m is returned is given by
                    //   P(k = m and accepted) = P(accepted | k = m) * P(k = m) = C * h(m) = C / m^exponent.
                    //
                    // In both cases the probabilities are proportional to the probability mass function
                    // of the Zipf distribution.

                    return k;
                }
            }
        }

        /**
         * {@code H(x) :=}
         * <ul>
         * <li>{@code (x^(1-exponent) - 1)/(1 - exponent)}, if {@code exponent != 1}</li>
         * <li>{@code log(x)}, if {@code exponent == 1}</li>
         * </ul>
         * H(x) is an integral function of h(x),
         * the derivative of H(x) is h(x).
         *
         * @param x free parameter
         * @return {@code H(x)}
         */
        private double hIntegral(final double x) {
            final double logX = Math.log(x);
            return helper2((1d-exponent)*logX)*logX;
        }

        /**
         * {@code h(x) := 1/x^exponent}
         *
         * @param x free parameter
         * @return h(x)
         */
        private double h(final double x) {
            return Math.exp(-exponent * Math.log(x));
        }

        /**
         * The inverse function of H(x).
         *
         * @param x free parameter
         * @return y for which {@code H(y) = x}
         */
        private double hIntegralInverse(final double x) {
            double t = x*(1d-exponent);
            if (t < -1d) {
                // Limit value to the range [-1, +inf).
                // t could be smaller than -1 in some rare cases due to numerical errors.
                t = -1;
            }
            return Math.exp(helper1(t)*x);
        }

        /**
         * Helper function that calculates {@code log(1+x)/x}.
         * <p>
         * A Taylor series expansion is used, if x is close to 0.
         *
         * @param x a value larger than or equal to -1
         * @return {@code log(1+x)/x}
         */
        static double helper1(final double x) {
            if (Math.abs(x)>1e-8) {
                return Math.log1p(x)/x;
            }
            else {
                return 1.-x*((1./2.)-x*((1./3.)-x*(1./4.)));
            }
        }

        /**
         * Helper function to calculate {@code (exp(x)-1)/x}.
         * <p>
         * A Taylor series expansion is used, if x is close to 0.
         *
         * @param x free parameter
         * @return {@code (exp(x)-1)/x} if x is non-zero, or 1 if x=0
         */
        static double helper2(final double x) {
            if (Math.abs(x)>1e-8) {
                return Math.expm1(x)/x;
            }
            else {
                return 1.+x*(1./2.)*(1.+x*(1./3.)*(1.+x*(1./4.)));
            }
        }
    }
}