package edu.northwestern.at.utils.corpuslinguistics.syllablecounter;
/* Please see the license information at the end of this file. */
import java.io.*;
import java.lang.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import edu.northwestern.at.utils.*;
/** EnglishSyllableCounter: Counts syllables in English words.
*
* <p>
* This syllable counter uses a two-stage process for counting syllables
* in a word.
* </p>
*
* <ol>
* <li>The word is first looked up in a dictionary which maps words
* to their syllable counts. When the word is found in the
* dictionary, the associated syllable count is returned.
* </li>
* <li>When the word does not appear in the dictionary, the syllable
* count is computed from the number of vowel groups in the word.
* Adjustments are made for silent final "e"s and certain other
* letter groups.
* </ol>
*
* <p>
* Most of the data used to construct the dictionary of syllable counts
* comes from a pronunciation dictionary provided by Carnegie-Mellon
* University.
* </p>
*
* <p>
* The code used to compute syllable counts for words not found in the
* syllable counts dictionary is based upon that written by Greg Fast in
* Perl and Larry Ogrodnek in Java. Their methods provide the correct
* syllable count about 85-90% of the time. Rarely is the syllable
* count wrong by more than one.
* </p>
*/
public class EnglishSyllableCounter implements SyllableCounter
{
/** Path to map from spellings to syllable counts. */
protected static String syllableCountFileName =
"resources/englishsyllablecounts.tab";
/** Map of spellings to syllable counts. */
protected Map<String,Integer> syllableCountMap =
MapFactory.createNewMap();
protected static final Pattern[] SubtractSyllables =
new Pattern[]
{
Pattern.compile( "cial" ) ,
Pattern.compile( "tia" ) ,
Pattern.compile( "cius" ) ,
Pattern.compile( "cious" ) ,
Pattern.compile( "giu" ) , // belgium!
Pattern.compile( "ion" ) ,
Pattern.compile( "iou" ) ,
Pattern.compile( "sia$" ) ,
Pattern.compile( ".ely$" ) // absolutely! (but not ely!)
};
protected static final Pattern[] AddSyllables =
new Pattern[]
{
Pattern.compile( "ia" ),
Pattern.compile( "riet" ),
Pattern.compile( "dien" ),
Pattern.compile( "iu" ),
Pattern.compile( "io" ),
Pattern.compile( "ii" ),
Pattern.compile( "[aeiouym]bl$" ) , // -Vble, plus -mble
Pattern.compile( "[aeiou]{3}" ) , // agreeable
Pattern.compile( "^mc" ) ,
Pattern.compile( "ism$" ) , // -isms
Pattern.compile( "([^aeiouy])\1l$" ) , // middle twiddle battle bottle, etc.
Pattern.compile( "[^l]lien" ) , // alien, salient [1]
Pattern.compile( "^coa[dglx]." ) , // [2]
Pattern.compile( "[^gq]ua[^auieo]" ) , // i think this fixes more than it breaks
Pattern.compile( "dnt$" ) // couldn't
};
/** Create an English syllable counter. */
public EnglishSyllableCounter()
{
try
{
syllableCountMap =
loadSyllableCountMap
(
EnglishSyllableCounter.class.getResource
(
syllableCountFileName
) ,
"\t" ,
"" ,
"utf-8"
);
}
catch ( Exception e )
{
}
}
/** Load syllable counts map from a URL.
*
* @param mapURL URL for map file.
* @param separator Field separator.
* @param qualifier Quote character.
* @param encoding Character encoding for the file.
*
* @throws FileNotFoundException If input file does not exist.
* @throws IOException If input file cannot be opened.
*
* @return Map with values read from file.
*/
public Map<String, Integer> loadSyllableCountMap
(
URL mapURL ,
String separator ,
String qualifier ,
String encoding
)
throws IOException , FileNotFoundException
{
Map<String, Integer> map = MapFactory.createNewMap();
if ( mapURL != null )
{
BufferedReader bufferedReader =
new BufferedReader
(
new UnicodeReader
(
mapURL.openStream() ,
encoding
)
);
String inputLine = bufferedReader.readLine();
String[] tokens;
int count;
while ( inputLine != null )
{
tokens = inputLine.split( separator );
if ( tokens.length > 1 )
{
// Convert count token to a number.
count = Integer.parseInt( tokens[ 1 ] );
map.put( tokens[ 0 ] , count );
}
inputLine = bufferedReader.readLine();
}
bufferedReader.close();
}
return map;
}
/** Find number of syllables in a single English word.
*
* @param word The word whose syllable count is desired.
*
* @return The number of syllables in the word.
*/
public int countSyllables( String word )
{
int result = 0;
// Null or empty word?
// Syllable count is zero.
if ( ( word == null ) || ( word.length() == 0 ) )
{
return result;
}
// If word is in the dictionary,
// return the syllable count from the
// dictionary.
String lcWord = word.toLowerCase();
if ( syllableCountMap.containsKey( lcWord ) )
{
result = syllableCountMap.get( lcWord );
}
// If word is not in the dictionary,
// use vowel group counting to get
// the estimated syllable count.
else
{
// Remove embedded apostrophes and
// terminal e.
lcWord = lcWord.replaceAll( "'" , "" ).replaceAll( "e$" , "" );
// Split word into vowel groups.
String[] vowelGroups = lcWord.split( "[^aeiouy]+" );
// Handle special cases.
// Subtract from syllable count
// for these patterns.
for ( Pattern p : SubtractSyllables )
{
Matcher m = p.matcher( lcWord );
if ( m.find() )
{
result--;
}
}
// Add to syllable count for these patterns.
for ( Pattern p : AddSyllables )
{
Matcher m = p.matcher( lcWord );
if ( m.find() )
{
result++;
}
}
if ( lcWord.length() == 1 )
{
result++;
}
// Count vowel groupings.
if ( ( vowelGroups.length > 0 ) &&
( vowelGroups[ 0 ].length() == 0 )
)
{
result += vowelGroups.length - 1;
}
else
{
result += vowelGroups.length;
}
}
// Return syllable count of
// at least one.
return Math.max( result , 1 );
}
}
/*
Copyright (c) 2008, 2009 by Northwestern University.
All rights reserved.
Developed by:
Academic and Research Technologies
Northwestern University
http://www.it.northwestern.edu/about/departments/at/
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal with the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimers.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimers in the documentation and/or other materials provided
with the distribution.
* Neither the names of Academic and Research Technologies,
Northwestern University, nor the names of its contributors may be
used to endorse or promote products derived from this Software
without specific prior written permission.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/