/* * Copyright 1999-2002 Carnegie Mellon University. * Portions Copyright 2002 Sun Microsystems, Inc. * Portions Copyright 2002 Mitsubishi Electric Research Laboratories. * All Rights Reserved. Use is subject to license terms. * * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. * */ package edu.cmu.sphinx.frontend.frequencywarp; import edu.cmu.sphinx.frontend.BaseDataProcessor; import edu.cmu.sphinx.frontend.Data; import edu.cmu.sphinx.frontend.DataProcessingException; import edu.cmu.sphinx.frontend.DoubleData; import edu.cmu.sphinx.util.props.*; /** * Filters an input power spectrum through a bank of number of mel-filters. The output is an array of filtered values, * typically called mel-spectrum, each corresponding to the result of filtering the input spectrum through an individual * filter. Therefore, the length of the output array is equal to the number of filters created. * <p> * The triangular mel-filters in the filter bank are placed in the frequency axis so that each filter's center frequency * follows the mel scale, in such a way that the filter bank mimics the critical band, which represents different * perceptual effect at different frequency bands. Additionally, the edges are placed so that they coincide with the * center frequencies in adjacent filters. Pictorially, the filter bank looks like: * <p> * <img alt="Filterbank" src="doc-files/melfilterbank.jpg"> <br> <center><b>Figure 1: A Mel-filter bank. </b> </center> * <p> * As you might notice in the above figure, the distance at the base from the center to the left edge is different from * the center to the right edge. Since the center frequencies follow the mel-frequency scale, which is a non-linear * scale that models the non-linear human hearing behavior, the mel filter bank corresponds to a warping of the * frequency axis. As can be inferred from the figure, filtering with the mel scale emphasizes the lower frequencies. A * common model for the relation between frequencies in mel and linear scales is as follows: * <p> * <code>melFrequency = 2595 * log(1 + linearFrequency/700)</code> * <p> * The constants that define the filterbank are the number of filters, the minimum frequency, and the maximum frequency. * The minimum and maximum frequencies determine the frequency range spanned by the filterbank. These frequencies depend * on the channel and the sampling frequency that you are using. For telephone speech, since the telephone channel * corresponds to a bandpass filter with cutoff frequencies of around 300Hz and 3700Hz, using limits wider than these * would waste bandwidth. For clean speech, the minimum frequency should be higher than about 100Hz, since there is no * speech information below it. Furthermore, by setting the minimum frequency above 50/60Hz, we get rid of the hum * resulting from the AC power, if present. * <p> * The maximum frequency has to be lower than the Nyquist frequency, that is, half the sampling rate. Furthermore, there * is not much information above 6800Hz that can be used for improving separation between models. Particularly for very * noisy channels, maximum frequency of around 5000Hz may help cut off the noise. * <p> * Typical values for the constants defining the filter bank are: <table summary="Filterbank Values" width="80%" border="1"> <tr> <td><b>Sample rate * (Hz) </b></td> <td><b>16000 </b></td> <td><b>11025 </b></td> <td><b>8000 </b></td> </tr> <tr> <td>{@link * #PROP_NUMBER_FILTERS numberFilters}</td> <td>40</td> <td>36</td> <td>31</td> </tr> <tr> <td>{@link #PROP_MIN_FREQ * minimumFrequency}(Hz)</td> <td>130</td> <td>130</td> <td>200</td> </tr> <tr> <td>{@link #PROP_MAX_FREQ * maximumFrequency}(Hz)</td> <td>6800</td> <td>5400</td> <td>3500</td> </tr> </table> * <p> * Davis and Mermelstein showed that Mel-frequency cepstral coefficients present robust characteristics that are good * for speech recognition. For details, see Davis and Mermelstein, <i>Comparison of Parametric Representations for * Monosyllable Word Recognition in Continuously Spoken Sentences, IEEE Transactions on Acoustic, Speech and Signal * Processing, 1980 </i>. * * @see MelFilter */ public class MelFrequencyFilterBank extends BaseDataProcessor { /** The property for the number of filters in the filterbank. */ @S4Integer(defaultValue = 40) public static final String PROP_NUMBER_FILTERS = "numberFilters"; /** The property for the minimum frequency covered by the filterbank. */ @S4Double(defaultValue = 130.0) public static final String PROP_MIN_FREQ = "minimumFrequency"; /** The property for the maximum frequency covered by the filterbank. */ @S4Double(defaultValue = 6800.0) public static final String PROP_MAX_FREQ = "maximumFrequency"; // ---------------------------------- // Configuration data // ---------------------------------- private int sampleRate; private int numberFftPoints; private int numberFilters; private double minFreq; private double maxFreq; private MelFilter[] filter; public MelFrequencyFilterBank(double minFreq, double maxFreq, int numberFilters) { initLogger(); this.minFreq = minFreq; this.maxFreq = maxFreq; this.numberFilters = numberFilters; } public MelFrequencyFilterBank() { } /* * (non-Javadoc) * * @see edu.cmu.sphinx.util.props.Configurable#newProperties(edu.cmu.sphinx.util.props.PropertySheet) */ @Override public void newProperties(PropertySheet ps) throws PropertyException { super.newProperties(ps); minFreq = ps.getDouble(PROP_MIN_FREQ); maxFreq = ps.getDouble(PROP_MAX_FREQ); numberFilters = ps.getInt(PROP_NUMBER_FILTERS); } /* * (non-Javadoc) * * @see edu.cmu.sphinx.frontend.DataProcessor#initialize(edu.cmu.sphinx.frontend.CommonConfig) */ @Override public void initialize() { super.initialize(); } /** * Compute mel frequency from linear frequency. * <p> * Since we don't have <code>log10()</code>, we have to compute it using natural log: <b>log10(x) = ln(x) / ln(10) * </b> * * @param inputFreq the input frequency in linear scale * @return the frequency in a mel scale */ private double linToMelFreq(double inputFreq) { return (2595.0 * (Math.log(1.0 + inputFreq / 700.0) / Math.log(10.0))); } /** * Compute linear frequency from mel frequency. * * @param inputFreq the input frequency in mel scale * @return the frequency in a linear scale */ private double melToLinFreq(double inputFreq) { return (700.0 * (Math.pow(10.0, (inputFreq / 2595.0)) - 1.0)); } /** * Sets the given frequency to the nearest frequency bin from the FFT. The FFT can be thought of as a sampling of * the actual spectrum of a signal. We use this function to find the sampling point of the spectrum that is closest * to the given frequency. * * @param inFreq the input frequency * @param stepFreq the distance between frequency bins * @return the closest frequency bin * @throws IllegalArgumentException */ private double setToNearestFrequencyBin(double inFreq, double stepFreq) throws IllegalArgumentException { if (stepFreq == 0) { throw new IllegalArgumentException("stepFreq is zero"); } return stepFreq * Math.round(inFreq / stepFreq); } /** * Build a mel filterbank with the parameters given. Each filter will be shaped as a triangle. The triangles overlap * so that they cover the whole frequency range requested. The edges of a given triangle will be by default at the * center of the neighboring triangles. * * @param numberFftPoints number of points in the power spectrum * @param numberFilters number of filters in the filterbank * @param minFreq lowest frequency in the range of interest * @param maxFreq highest frequency in the range of interest * @throws IllegalArgumentException */ private void buildFilterbank(int numberFftPoints, int numberFilters, double minFreq, double maxFreq) throws IllegalArgumentException { double minFreqMel; double maxFreqMel; double deltaFreqMel; double[] leftEdge = new double[numberFilters]; double[] centerFreq = new double[numberFilters]; double[] rightEdge = new double[numberFilters]; double nextEdgeMel; double nextEdge; double initialFreqBin; double deltaFreq; this.filter = new MelFilter[numberFilters]; /** * In fact, the ratio should be between <code>sampleRate / * 2</code> * and <code>numberFftPoints / 2</code> since the number of points in * the power spectrum is half of the number of FFT points - the other * half would be symmetrical for a real sequence -, and these points * cover up to the Nyquist frequency, which is half of the sampling * rate. The two "divide by 2" get canceled out. */ if (numberFftPoints == 0) { throw new IllegalArgumentException("Number of FFT points is zero"); } deltaFreq = (double) sampleRate / numberFftPoints; /** * Initialize edges and center freq. These variables will be updated so * that the center frequency of a filter is the right edge of the * filter to its left, and the left edge of the filter to its right. */ if (numberFilters < 1) { throw new IllegalArgumentException("Number of filters illegal: " + numberFilters); } minFreqMel = linToMelFreq(minFreq); maxFreqMel = linToMelFreq(maxFreq); deltaFreqMel = (maxFreqMel - minFreqMel) / (numberFilters + 1); leftEdge[0] = setToNearestFrequencyBin(minFreq, deltaFreq); nextEdgeMel = minFreqMel; for (int i = 0; i < numberFilters; i++) { nextEdgeMel += deltaFreqMel; nextEdge = melToLinFreq(nextEdgeMel); centerFreq[i] = setToNearestFrequencyBin(nextEdge, deltaFreq); if (i > 0) { rightEdge[i - 1] = centerFreq[i]; } if (i < numberFilters - 1) { leftEdge[i + 1] = centerFreq[i]; } } nextEdgeMel = nextEdgeMel + deltaFreqMel; nextEdge = melToLinFreq(nextEdgeMel); rightEdge[numberFilters - 1] = setToNearestFrequencyBin(nextEdge, deltaFreq); for (int i = 0; i < numberFilters; i++) { initialFreqBin = setToNearestFrequencyBin(leftEdge[i], deltaFreq); if (initialFreqBin < leftEdge[i]) { initialFreqBin += deltaFreq; } //System.out.format("%d %f %f\n", i, leftEdge[i], rightEdge[i]); this.filter[i] = new MelFilter(leftEdge[i], centerFreq[i], rightEdge[i], initialFreqBin, deltaFreq); } } /** * Process data, creating the power spectrum from an input audio frame. * * @param input input power spectrum * @return power spectrum * @throws java.lang.IllegalArgumentException * */ private DoubleData process(DoubleData input) throws IllegalArgumentException { double[] in = input.getValues(); if (filter == null || sampleRate != input.getSampleRate()) { numberFftPoints = (in.length - 1) << 1; sampleRate = input.getSampleRate(); buildFilterbank(numberFftPoints, numberFilters, minFreq, maxFreq); } else if (in.length != ((numberFftPoints >> 1) + 1)) { throw new IllegalArgumentException( "Window size is incorrect: in.length == " + in.length + ", numberFftPoints == " + ((numberFftPoints >> 1) + 1)); } double[] output = new double[numberFilters]; /** * Filter input power spectrum */ for (int i = 0; i < numberFilters; i++) { output[i] = filter[i].filterOutput(in); } DoubleData outputMelSpectrum = new DoubleData(output, sampleRate, input.getFirstSampleNumber()); return outputMelSpectrum; } /** * Reads the next Data object, which is the power spectrum of an audio input frame. Signals are returned * unmodified. * * @return the next available Data or Signal object, or returns null if no Data is available * @throws DataProcessingException if there is a data processing error */ @Override public Data getData() throws DataProcessingException { Data input = getPredecessor().getData(); if (input != null) { if (input instanceof DoubleData) { input = process((DoubleData) input); } } return input; } }