/* * Copyright 1999-2002 Carnegie Mellon University. * Portions Copyright 2002 Sun Microsystems, Inc. * Portions Copyright 2002 Mitsubishi Electric Research Laboratories. * All Rights Reserved. Use is subject to license terms. * * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. */ package edu.cmu.sphinx.frontend.frequencywarp; import edu.cmu.sphinx.frontend.*; import edu.cmu.sphinx.util.props.*; /** * Filters an input power spectrum through a bank of number of mel-filters. The * output is an array of filtered values, typically called mel-spectrum, each * corresponding to the result of filtering the input spectrum through an * individual filter. Therefore, the length of the output array is equal to the * number of filters created. * <p> * The triangular mel-filters in the filter bank are placed in the frequency * axis so that each filter's center frequency follows the mel scale, in such a * way that the filter bank mimics the critical band, which represents * different perceptual effect at different frequency bands. Additionally, the * edges are placed so that they coincide with the center frequencies in * adjacent filters. Pictorially, the filter bank looks like: * <p> * <img alt="Mel filter bank" src="doc-files/melfilterbank.jpg"> <br> * <center><b>Figure 1: A Mel-filter bank. </b> </center> * <p> * As you might notice in the above figure, the distance at the base from the * center to the left edge is different from the center to the right edge. * Since the center frequencies follow the mel-frequency scale, which is a * non-linear scale that models the non-linear human hearing behavior, the mel * filter bank corresponds to a warping of the frequency axis. As can be * inferred from the figure, filtering with the mel scale emphasizes the lower * frequencies. A common model for the relation between frequencies in mel and * linear scales is as follows: * <p> * <code>melFrequency = 2595 * log(1 + linearFrequency/700)</code> * <p> * The constants that define the filterbank are the number of filters, the * minimum frequency, and the maximum frequency. The minimum and maximum * frequencies determine the frequency range spanned by the filterbank. These * frequencies depend on the channel and the sampling frequency that you are * using. For telephone speech, since the telephone channel corresponds to a * bandpass filter with cutoff frequencies of around 300Hz and 3700Hz, using * limits wider than these would waste bandwidth. For clean speech, the minimum * frequency should be higher than about 100Hz, since there is no speech * information below it. Furthermore, by setting the minimum frequency above * 50/60Hz, we get rid of the hum resulting from the AC power, if present. * <p> * The maximum frequency has to be lower than the Nyquist frequency, that is, * half the sampling rate. Furthermore, there is not much information above * 6800Hz that can be used for improving separation between models. * Particularly for very noisy channels, maximum frequency of around 5000Hz may * help cut off the noise. * <p> * Typical values for the constants defining the filter bank are: * <table summary="Filter bank params" width="80%" border="1"> * <tr> * <td><b>Sample rate (Hz) </b></td> * <td><b>16000 </b></td> * <td><b>11025 </b></td> * <td><b>8000 </b></td> * </tr> * <tr> * <td>{@link #PROP_NUMBER_FILTERS numberFilters}</td> * <td>40</td> * <td>36</td> * <td>31</td> * </tr> * <tr> * <td>{@link #PROP_MIN_FREQ minimumFrequency}(Hz)</td> * <td>130</td> * <td>130</td> * <td>200</td> * </tr> * <tr> * <td>{@link #PROP_MAX_FREQ maximumFrequency}(Hz)</td> * <td>6800</td> * <td>5400</td> * <td>3500</td> * </tr> * </table> * <p> * Davis and Mermelstein showed that Mel-frequency cepstral coefficients * present robust characteristics that are good for speech recognition. For * details, see Davis and Mermelstein, <i>Comparison of Parametric * Representations for Monosyllable Word Recognition in Continuously Spoken * Sentences, IEEE Transactions on Acoustic, Speech and Signal Processing, 1980 * </i>. * * @see MelFilter2 */ public class MelFrequencyFilterBank2 extends BaseDataProcessor { /** The property for the number of filters in the filterbank. */ @S4Integer(defaultValue = 40) public static final String PROP_NUMBER_FILTERS = "numberFilters"; /** The property for the minimum frequency covered by the filterbank. */ @S4Double(defaultValue = 130.0) public static final String PROP_MIN_FREQ = "minimumFrequency"; /** The property for the maximum frequency covered by the filterbank. */ @S4Double(defaultValue = 6800.0) public static final String PROP_MAX_FREQ = "maximumFrequency"; // ---------------------------------- // Configuration data // ---------------------------------- private int sampleRate; private int numberFilters; private double minFreq; private double maxFreq; private MelFilter2[] filters; public MelFrequencyFilterBank2(double minFreq, double maxFreq, int numberFilters) { initLogger(); this.minFreq = minFreq; this.maxFreq = maxFreq; this.numberFilters = numberFilters; } public MelFrequencyFilterBank2() { } /* * (non-Javadoc) * @see * edu.cmu.sphinx.util.props.Configurable#newProperties(edu.cmu.sphinx. * util.props.PropertySheet) */ @Override public void newProperties(PropertySheet ps) throws PropertyException { super.newProperties(ps); minFreq = ps.getDouble(PROP_MIN_FREQ); maxFreq = ps.getDouble(PROP_MAX_FREQ); numberFilters = ps.getInt(PROP_NUMBER_FILTERS); } /* * (non-Javadoc) * @see * edu.cmu.sphinx.frontend.DataProcessor#initialize(edu.cmu.sphinx.frontend * .CommonConfig) */ @Override public void initialize() { super.initialize(); } /** * Compute mel frequency from linear frequency. * * @param inputFreq the input frequency in linear scale * @return the frequency in a mel scale */ private double linearToMel(double inputFreq) { return 1127 * Math.log1p(inputFreq / 700); } /** * Build a mel filterbank with the parameters given. Each filter will be * shaped as a triangle. The triangles overlap so that they cover the whole * frequency range requested. The edges of a given triangle will be by * default at the center of the neighboring triangles. * * @param windowLength number of points in the power spectrum * @param numberFilters number of filters in the filterbank * @param minFreq lowest frequency in the range of interest * @param maxFreq highest frequency in the range of interest * @throws IllegalArgumentException */ private void buildFilterbank(int windowLength, int numberFilters, double minFreq, double maxFreq) throws IllegalArgumentException { assert windowLength > 0; assert numberFilters > 0; // Initialize edges and center freq. These variables will be updated so // that the center frequency of a filter is the right edge of the // filter to its left, and the left edge of the filter to its right. double minFreqMel = linearToMel(minFreq); double maxFreqMel = linearToMel(maxFreq); double deltaFreqMel = (maxFreqMel - minFreqMel) / (numberFilters + 1); // In fact, the ratio should be between <code>sampleRate / // 2</code> and <code>numberFftPoints / 2</code> since the number of // points in the power spectrum is half of the number of FFT points - // the other half would be symmetrical for a real sequence -, and // these points cover up to the Nyquist frequency, which is half of // the sampling rate. The two "divide by 2" get canceled out. double deltaFreq = (double) sampleRate / windowLength; double[] melPoints = new double[windowLength / 2]; filters = new MelFilter2[numberFilters]; for (int i = 0; i < windowLength / 2; ++i) melPoints[i] = linearToMel(i * deltaFreq); for (int i = 0; i < numberFilters; i++) { double centerMel = minFreqMel + (i + 1) * deltaFreqMel; filters[i] = new MelFilter2(centerMel, deltaFreqMel, melPoints); } } /** * Process data, creating the power spectrum from an input audio frame. * * @param input input power spectrum * @return power spectrum * @throws java.lang.IllegalArgumentException */ private DoubleData process(DoubleData input) throws IllegalArgumentException { double[] in = input.getValues(); int windowLength = (in.length - 1) << 1; if (filters == null || sampleRate != input.getSampleRate()) { sampleRate = input.getSampleRate(); buildFilterbank(windowLength, numberFilters, minFreq, maxFreq); } else if (in.length != ((windowLength >> 1) + 1)) { throw new IllegalArgumentException("Window size is incorrect: in.length == " + in.length + ", numberFftPoints == " + ((windowLength >> 1) + 1)); } double[] output = new double[numberFilters]; for (int i = 0; i < numberFilters; i++) output[i] = filters[i].apply(in); DoubleData outputMelSpectrum = new DoubleData(output, sampleRate, input.getFirstSampleNumber()); return outputMelSpectrum; } /** * Reads the next Data object, which is the power spectrum of an audio * input frame. Signals are returned unmodified. * * @return the next available Data or Signal object, or returns null if no * Data is available * @throws DataProcessingException if there is a data processing error */ @Override public Data getData() throws DataProcessingException { Data input = getPredecessor().getData(); if (input != null) { if (input instanceof DoubleData) { input = process((DoubleData) input); } } return input; } }