/*
* Copyright (C) 2011 Laurent Caillette
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.novelang.novelist;
import java.util.Locale;
import java.util.Map;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import org.novelang.logger.Logger;
import org.novelang.logger.LoggerFactory;
/**
* Defines character frequencies idepending on language.
*
* @author Laurent Caillette
*/
public class LetterDistribution extends Distribution< Character > {
private static final Logger LOGGER = LoggerFactory.getLogger( LetterDistribution.class ) ;
private LetterDistribution( final Map< Character, Float > frequencies ) {
super( LetterDistribution.class.getSimpleName(), frequencies ) ;
}
private final static Map< Locale, LetterDistribution > DISTRIBUTIONS = Maps.newHashMap() ;
/**
* Returns a {@code Map} between a character and its frequency in the given language.
*/
public synchronized static LetterDistribution getFrequency( final Locale locale ) {
final Locale supportedLocale ;
if( locale == SupportedLocales.DEFAULT_LOCALE ) {
supportedLocale = locale;
} else {
LOGGER.warn( "Unsupported: ", locale, ", using default: ", SupportedLocales.DEFAULT_LOCALE );
supportedLocale = SupportedLocales.DEFAULT_LOCALE;
}
if( ! DISTRIBUTIONS.containsKey( supportedLocale ) ) {
DISTRIBUTIONS.put( supportedLocale, new LetterDistribution( FRENCH_FREQUENCIES ) ) ;
}
return DISTRIBUTIONS.get( supportedLocale ) ;
}
/**
* These are frequencies for the French language, as documented by
* <a href="http://en.wikipedia.org/wiki/Letter_frequency" >Wikipedia</a>.
* <p>
* Here is the regex for turning the table into Java code. Use a Web browser with Gecko engine
* (Firefox, Camino) to copy the table cells of interest.
* See meaning of "{L}" in the regex in
* <a href="http://www.unicode.org/reports/tr18">Unicode Regular Expressions</a>,
* "General Category Property" chapter.
* <p/>
* <pre>
* (\p{L})\t(\d+(?:\.\d+)?)\%?$
* .put( '$1', $2f )
* </pre>
*/
private static final Map< Character, Float > FRENCH_FREQUENCIES =
new ImmutableMap.Builder< Character, Float >()
.put( 'a', 7.636f )
.put( 'b', 0.901f )
.put( 'c', 3.260f )
.put( 'd', 3.669f )
.put( 'e', 14.715f )
.put( 'f', 1.066f )
.put( 'g', 0.866f )
.put( 'h', 0.737f )
.put( 'i', 7.529f )
.put( 'j', 0.545f )
.put( 'k', 0.049f )
.put( 'l', 5.456f )
.put( 'm', 2.968f )
.put( 'n', 7.095f )
.put( 'o', 5.378f )
.put( 'p', 3.021f )
.put( 'q', 1.362f )
.put( 'r', 6.553f )
.put( 's', 7.948f )
.put( 't', 7.244f )
.put( 'u', 6.311f )
.put( 'v', 1.628f )
.put( 'w', 0.114f )
.put( 'x', 0.387f )
.put( 'y', 0.308f )
.put( 'z', 0.136f )
.put( 'à', 0.486f )
.put( 'å', 0.0f )
.put( 'ä', 0.0f )
.put( 'ą', 0.0f )
.put( 'œ', 0.018f )
.put( 'ç', 0.085f )
.put( 'ĉ', 0.0f )
.put( 'ć', 0.0f )
.put( 'è', 0.271f )
.put( 'é', 1.904f )
.put( 'ê', 0.225f )
.put( 'ë', 0.001f ) // Cheated: original is 0.000
.put( 'ę', 0.0f )
.put( 'ĝ', 0.0f )
.put( 'ğ', 0.0f )
.put( 'ĥ', 0.0f )
.put( 'î', 0.045f )
.put( 'ì', 0.0f )
.put( 'ï', 0.005f )
.put( 'ı', 0.0f )
.put( 'ĵ', 0.0f )
.put( 'ł', 0.0f )
.put( 'ñ', 0.0f )
.put( 'ń', 0.0f )
.put( 'ò', 0.0f )
.put( 'ö', 0.0f )
.put( 'ó', 0.0f )
.put( 'ŝ', 0.0f )
.put( 'ş', 0.0f )
.put( 'ś', 0.0f )
.put( 'ß', 0.0f )
.put( 'ù', 0.058f )
.put( 'ŭ', 0.0f )
.put( 'ü', 0.0f )
.put( 'ź', 0.0f )
.put( 'ż', 0.0f )
.build()
;
}