package edu.northwestern.at.utils.corpuslinguistics.inflector.pluralizer;
/* Please see the license information at the end of this file. */
import static edu.northwestern.at.utils.corpuslinguistics.inflector.wordrule.AbstractRegexReplacementRule.disjunction;
import static edu.northwestern.at.utils.corpuslinguistics.inflector.wordrule.IrregularMappingRule.toMap;
import java.util.*;
import java.util.regex.*;
import edu.northwestern.at.utils.corpuslinguistics.inflector.*;
import edu.northwestern.at.utils.corpuslinguistics.inflector.wordrule.*;
/** English language pluralizer.
*
* <p>
* Based upon the paper "An Algorithmic Approach to English Pluralization"
* by Damian Conway at
* <a href="http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html">
* http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html
* <a>.
* </p>
*
* <p>
* Original code written by Tom White under the Apache v2 license.
* Modified by Philip R. Burns for integration into MorphAdorner.
* </p>
*/
public class EnglishPluralizer
extends RuleBasedPluralizer
{
protected static final String POSTFIX_ADJECTIVE_REGEX =
"(" + "(?!major|lieutenant|brigadier|adjutant)\\S+(?=(?:-|\\s+)general)|" +
"court(?=(?:-|\\s+)martial)" + ")(.*)";
protected static final String[] PREPOSITIONS =
{
"about" ,
"above" ,
"across" ,
"after" ,
"among" ,
"around" ,
"at" ,
"athwart" ,
"before" ,
"behind" ,
"below" ,
"beneath" ,
"beside" ,
"besides" ,
"between" ,
"betwixt" ,
"beyond" ,
"but" ,
"by" ,
"during" ,
"except" ,
"for" ,
"from" ,
"in" ,
"into" ,
"near" ,
"of" ,
"off" ,
"on" ,
"onto" ,
"out" ,
"over" ,
"since" ,
"till" ,
"to" ,
"under" ,
"until" ,
"unto" ,
"upon" ,
"with" ,
};
protected static final Map<String , String> NOMINATIVE_PRONOUNS =
toMap
(
new String[][]
{
// nominative reflexive
{ "i" , "we" } ,
{ "myself" , "ourselves" } ,
{ "you" , "you" } ,
{ "yourself" , "yourselves" } ,
{ "she" , "they" } ,
{ "herself" , "themselves" } ,
{ "he" , "they" } ,
{ "himself" , "themselves" } ,
{ "it" , "they" } ,
{ "itself" , "themselves" } ,
{ "they" , "they" } ,
{ "themself" , "themselves" } ,
// possessive
{ "mine" , "ours" } ,
{ "yours" , "yours" } ,
{ "hers" , "theirs" } ,
{ "his" , "theirs" } ,
{ "its" , "theirs" } ,
{ "theirs" , "theirs" } ,
}
);
protected static final Map<String , String> ACCUSATIVE_PRONOUNS =
toMap
(
new String[][]
{
{ "me" , "us" } ,
{ "myself" , "ourselves" } ,
{ "you" , "you" } ,
{ "yourself" , "yourselves" } ,
{ "her" , "them" } ,
{ "herself" , "themselves" } ,
{ "him" , "them" } ,
{ "himself" , "themselves" } ,
{ "it" , "them" } ,
{ "itself" , "themselves" } ,
{ "them" , "them" } ,
{ "themself" , "themselves" } ,
}
);
protected static final Map<String , String> IRREGULAR_NOUNS =
toMap
(
new String[][]
{
{ "child" , "children" } ,
{ "brother" , "brothers" } , // irregular classical form
{ "loaf" , "loaves" } ,
{ "hoof" , "hoofs" } , // irregular classical form
{ "beef" , "beefs" } , // irregular classical form
{ "money" , "monies" } ,
{ "mongoose" , "mongooses" } ,
{ "ox" , "oxen" } ,
{ "cow" , "cows" } , // irregular classical form
{ "soliloquy" , "soliloquies" } ,
{ "graffito" , "graffiti" } ,
{ "prima donna" , "prima donnas" } , // irregular classical form
{ "octopus" , "octopuses" } , // irregular classical form
// { "octopus" , "octopodes" } , // irregular classical form
{ "genie" , "genies" } , // irregular classical form
{ "ganglion" , "ganglions" } , // irregular classical form
{ "trilby" , "trilbys" } ,
{ "turf" , "turfs" } , // irregular classical form
{ "numen" , "numina" } ,
{ "atman" , "atmas" } ,
{ "occiput" , "occiputs" } , // irregular classical form
// Words ending in -s
{ "corpus" , "corpuses" } , // irregular classical form
{ "opus" , "opuses" } , // irregular classical form
{ "genus" , "genera" } ,
{ "mythos" , "mythoi" } ,
{ "penis" , "penises" } , // irregular classical form
{ "testis" , "testes" } ,
{ "atlas" , "atlases" } , // irregular classical form
}
);
protected static final String[] CATEGORY_UNINFLECTED_NOUNS =
{
// Fish and herd animals
".*fish" ,
"tuna" ,
"salmon" ,
"mackerel" ,
"trout" ,
"bream" ,
"sea[- ]bass" ,
"carp" ,
"cod" ,
"flounder" ,
"whiting" ,
".*deer" ,
".*sheep" ,
"wildebeest" ,
"swine" ,
"eland" ,
"bison" ,
"buffalo" ,
"elk" ,
"moose" ,
"rhinoceros" ,
// Nationals ending in -ese
"Amoyese" ,
"Beninese" ,
"Bhutanese" ,
"Borghese" ,
"Burmese" ,
"Chinese" ,
"Congoese" ,
"Congolese" ,
"Faroese" ,
"Foochowese" ,
"Gabonese" ,
"Genevese" ,
"Genoese" ,
"Gilbertese" ,
"Guyanese" ,
"Hottentotese" ,
"Japanese" ,
"Kiplingese" ,
"Kongoese" ,
"Lebanese" ,
"Lucchese" ,
"Maltese" ,
"Marshallese" ,
"Nankingese" ,
"Nepalese" ,
"Niasese" ,
"Pekingese" ,
"Piedmontese" ,
"Pistoiese" ,
"Portuguese" ,
"Sammarinese" ,
"Sarawakese" ,
"Senegalese" ,
"Shavese" ,
"Sudanese" ,
"Togolese" ,
"Vermontese" ,
"Vietnamese" ,
"Wenchowese" ,
"Yengeese" ,
".*[nrlm]ese" ,
// Other nationalities.
"British" ,
"Burkinabe" ,
"French" ,
"I-Kiribati" ,
"Irish" ,
"Mahorais" ,
"Malagasy" ,
"Ni-Vanuatu" ,
"Seychellois" ,
"Spanish" ,
"Swiss" ,
"Taiwan" ,
"Thai" ,
// Diseases
".*pox" ,
// Other oddities
"graffiti" ,
"djinn" ,
// Words ending in -s
// Pairs or groups subsumed to a singular
"breeches" ,
"britches" ,
"clippers" ,
"gallows" ,
"hijinks" ,
"headquarters" ,
"pliers" ,
"scissors" ,
"testes" ,
"herpes" ,
"pincers" ,
"shears" ,
"proceedings" ,
"trousers" ,
// Unassimilated Latin 4th declension
"cantus" ,
"coitus" ,
"nexus" ,
// Recent imports
"contretemps" ,
"corps" ,
"debris" ,
".*ois" ,
"siemens" ,
// Diseases
".*measles" ,
"mumps" ,
// Others
"diabetes" ,
"jackanapes" ,
"series" ,
"species" ,
"rabies" ,
"chassis" ,
"innings" ,
"news" ,
"mews" ,
};
protected static final String[] CATEGORY_MAN_MANS_RULE =
{
"human" ,
"Alabaman" ,
"Bahaman" ,
"Burman" ,
"German" ,
"Hiroshiman" ,
"Liman" ,
"Nakayaman" ,
"Oklahoman" ,
"Panaman" ,
"Selman" ,
"Sonaman" ,
"Tacoman" ,
"Yakiman" ,
"Yokohaman" ,
"Yuman" ,
};
protected static final String[] CATEGORY_EX_ICES_RULE =
{
"codex" ,
"index" ,
"murex" ,
"silex" ,
"vertex"
};
protected static final String[] CATEGORY_IX_ICES_RULE =
{
"appendix" ,
"radix" ,
"helix" ,
"matrix"
};
protected static final String[] CATEGORY_UM_A_RULE =
{
"bacterium" ,
"agendum" ,
"desideratum" ,
"erratum" ,
"stratum" ,
"datum" ,
"ovum" ,
"extremum" ,
"candelabrum" ,
};
protected static final String[] CLASSICAL_UM_A =
{
"maximum",
"minimum",
"momentum",
"optimum",
"quantum",
"cranium",
"curriculum",
"dictum",
"phylum",
"aquarium",
"compendium",
"emporium",
"enconium",
"gymnasium",
"honorarium",
"interregnum",
"lustrum",
"memorandum",
"millennium",
"rostrum",
"spectrum",
"speculum",
"stadium",
"trapezium",
"ultimatum",
"medium",
"vacuum",
"velum",
"consortium",
};
protected static final String[] CATEGORY_US_I_RULE =
{
"alumnus" ,
"alveolus" ,
"bacillus" ,
"bronchus" ,
"locus" ,
"nucleus" ,
"stimulus" ,
"meniscus" ,
};
protected static final String[] CLASSICAL_US_I_RULE =
{
"focus",
"radius",
"genius",
"incubus",
"succubus",
"nimbus",
"fungus",
"nucleolus",
"stylus",
"torus",
"umbilicus",
"uterus",
"hippopotamus",
};
protected static final String[] CLASSICAL_US_US_RULE =
{
"status",
"apparatus",
"prospectus",
"sinus",
"hiatus",
"impetus",
"plexus",
};
protected static final String[] CATEGORY_ON_A_RULE =
{
"criterion" ,
"perihelion" ,
"aphelion" ,
"phenomenon" ,
"prolegomenon" ,
"noumenon" ,
"organon" ,
"asyndeton" ,
"hyperbaton" ,
};
protected static final String[] CLASSICAL_ON_A_RULE =
{
"oxymoron",
};
protected static final String[] CLASSICAL_A_ATA_RULE =
{
"anathema",
"bema",
"carcinoma",
"charisma",
"diploma",
"dogma",
"drama",
"edema",
"enema",
"enigma",
"lemma",
"lymphoma",
"magma",
"melisma",
"miasma",
"oedema",
"sarcoma",
"schema",
"soma",
"stigma",
"stoma",
"trauma",
"gumma",
"pragma",
};
protected static final String[] CATEGORY_A_AE_RULE =
{
"alumna" ,
"alga" ,
"vertebra" ,
"persona"
};
protected static final String[] CLASSICAL_A_AE_RULE =
{
"amoeba",
"antenna",
"formula",
"hyperbola",
"medusa",
"nebula",
"parabola",
"abscissa",
"hydra",
"nova",
"lacuna",
"aurora",
".*umbra",
"flora",
"fauna",
};
protected static final String[] CLASSICAL_EN_INA_RULE =
{
"stamen",
"foramen",
"lumen"
};
protected static final String[] CLASSICAL_O_I_RULE =
{
"solo",
"soprano",
"basso",
"alto",
"contralto",
"tempo",
"piano",
"virtuoso",
};
protected static final String[] CATEGORY_O_OS_RULE =
{
"albino" ,
"archipelago" ,
"armadillo" ,
"commando" ,
"crescendo" ,
"fiasco" ,
"ditto" ,
"dynamo" ,
"embryo" ,
"ghetto" ,
"guano" ,
"inferno" ,
"jumbo" ,
"lumbago" ,
"magneto" ,
"manifesto" ,
"medico" ,
"octavo" ,
"photo" ,
"pro" ,
"quarto" ,
"canto" ,
"lingo" ,
"generalissimo" ,
"stylo" ,
"rhino" ,
"casino" ,
"auto" ,
"macro" ,
"zero" ,
"solo" ,
"soprano" ,
"basso" ,
"alto" ,
"contralto" ,
"tempo" ,
"piano" ,
"virtuoso" ,
};
protected static final String[] CATEGORY_SINGULAR_S_RULE =
{
".*ss" ,
"acropolis" ,
"aegis" ,
"alias" ,
"asbestos" ,
"bathos" ,
"bias" ,
"bronchitis" ,
"bursitis" ,
"caddis" ,
"cannabis" ,
"canvas" ,
"chaos" ,
"cosmos" ,
"dais" ,
"digitalis" ,
"epidermis" ,
"ethos" ,
"eyas" ,
"gas" ,
"glottis" ,
"hubris" ,
"ibis" ,
"lens" ,
"mantis" ,
"marquis" ,
"metropolis" ,
"pathos" ,
"pelvis" ,
"polis" ,
"rhinoceros" ,
"sassafras" ,
"trellis" ,
".*us" ,
"[A-Z].*es" ,
"ephemeris" ,
"iris" ,
"clitoris" ,
"chrysalis" ,
"epididymis" ,
// Inflamations
".*itis" ,
};
protected static final String[] CLASSICAL_EIX_ICES_RULE =
{
"vortex",
"vertex",
"cortex",
"latex",
"pontifex",
"apex",
"index",
"simplex",
};
// References to Steps are to those in Conway's paper
protected final List<WordRule> rules =
Arrays.asList
(
new WordRule[]
{
// Blank word
new RegexReplacementRule( "^(\\s)$" , "$1" ) ,
// Nouns that do not inflect in the plural
// (such as "fish") [Step 2]
new CategoryInflectionRule
(
CATEGORY_UNINFLECTED_NOUNS , "-" , "-"
) ,
// Compounds [Step 12]
new AbstractRegexReplacementRule
(
"(?i)^(?:" + POSTFIX_ADJECTIVE_REGEX + ")$"
)
{
@Override public String replace( Matcher m )
{
return
EnglishPluralizer.this.pluralize( m.group( 1 ) ) +
m.group( 2 );
}
} ,
new AbstractRegexReplacementRule
(
"(?i)(.*?)((?:-|\\s+)(?:" + disjunction( PREPOSITIONS ) +
"|d[eu])(?:-|\\s+))a(?:-|\\s+)(.*)"
)
{
@Override public String replace( Matcher m )
{
return
EnglishPluralizer.this.pluralize( m.group( 1 ) ) +
m.group( 2 ) +
EnglishPluralizer.this.pluralize( m.group( 3 ) );
}
} ,
new AbstractRegexReplacementRule
(
"(?i)(.*?)((-|\\s+)(" + disjunction( PREPOSITIONS ) +
"|d[eu])((-|\\s+)(.*))?)"
)
{
@Override public String replace( Matcher m )
{
return
EnglishPluralizer.this.pluralize( m.group( 1 ) ) +
m.group( 2 );
}
} ,
// Pronouns [Step 3]
new IrregularMappingRule
(
NOMINATIVE_PRONOUNS ,
"(?i)" + disjunction( NOMINATIVE_PRONOUNS.keySet() )
) ,
new IrregularMappingRule
(
ACCUSATIVE_PRONOUNS ,
"(?i)" + disjunction( ACCUSATIVE_PRONOUNS.keySet() )
) ,
new IrregularMappingRule
(
ACCUSATIVE_PRONOUNS ,
"(?i)(" + disjunction( PREPOSITIONS ) + "\\s)" + "(" +
disjunction( ACCUSATIVE_PRONOUNS.keySet() ) + ")"
)
{
@Override public String replace( Matcher m )
{
return m.group( 1 ) +
mappings.get( m.group( 2 ).toLowerCase() );
}
} ,
// Standard irregular plurals
// (such as "children") [Step 4]
new IrregularMappingRule
(
IRREGULAR_NOUNS ,
"(?i)(.*)\\b" +
disjunction( IRREGULAR_NOUNS.keySet() ) + "$"
) ,
new CategoryInflectionRule
(
CATEGORY_MAN_MANS_RULE , "-man" , "-mans"
) ,
new RegexReplacementRule
(
"(?i)(\\S*)(person)$" ,
"$1people"
) ,
// Families of irregular plurals for common \
// suffixes (such as "-men") [Step 5]
new SuffixInflectionRule( "-man" , "-man" , "-men" ) ,
new SuffixInflectionRule( "-[lm]ouse" , "-ouse" , "-ice" ) ,
new SuffixInflectionRule( "-tooth" , "-tooth" , "-teeth" ) ,
new SuffixInflectionRule( "-goose" , "-goose" , "-geese" ) ,
new SuffixInflectionRule( "-foot" , "-foot" , "-feet" ) ,
// Assimilated irregular plurals [Step 6]
new SuffixInflectionRule( "-ceps" , "-" , "-" ) ,
new SuffixInflectionRule( "-zoon" , "-zoon" , "-zoa" ) ,
new SuffixInflectionRule( "-[csx]is" , "-is" , "-es" ) ,
new CategoryInflectionRule
(
CATEGORY_EX_ICES_RULE , "-ex" , "-ices"
) ,
new CategoryInflectionRule
(
CATEGORY_IX_ICES_RULE ,
"-ix" ,
"-ices"
) ,
new CategoryInflectionRule
(
CATEGORY_UM_A_RULE ,
"-um" ,
"-a"
) ,
new CategoryInflectionRule
(
CATEGORY_US_I_RULE ,
"-us" ,
"-i"
) ,
new CategoryInflectionRule
(
CATEGORY_ON_A_RULE ,
"-on" ,
"-a"
) ,
new CategoryInflectionRule
(
CATEGORY_A_AE_RULE ,
"-a" ,
"-ae"
) ,
// Classical irregular plurals [Step 7]
// Classical plurals have not been
// active.
/*
new CategoryInflectionRule
(
CLASSICAL_A_ATA_RULE ,
"-a" ,
"-ata"
) ,
new CategoryInflectionRule
(
CLASSICAL_A_AE_RULE ,
"-a" ,
"-ae"
) ,
new CategoryInflectionRule
(
CLASSICAL_EN_INA_RULE ,
"-en" ,
"-ina"
) ,
new CategoryInflectionRule
(
CLASSICAL_UM_A_RULE ,
"-um" ,
"-a"
) ,
new CategoryInflectionRule
(
CLASSICAL_US_I_RULE ,
"-us" ,
"-i"
) ,
new CategoryInflectionRule
(
CLASSICAL_US_US_RULE ,
"-us" ,
"-us"
) ,
new CategoryInflectionRule
(
CLASSICAL_ON_A_RULE ,
"-on" ,
"-a"
) ,
new CategoryInflectionRule
(
CLASSICAL_O_I_RULE ,
"-o" ,
"-i"
) ,
new CategoryInflectionRule
(
CLASSICAL_EIX_ICES_RULE ,
"-[ei]x" ,
"-ices"
) ,
*/
// Nouns ending in sibilants
// (such as "churches") [Step 8]
new CategoryInflectionRule
(
CATEGORY_SINGULAR_S_RULE ,
"-s" ,
"-ses"
) ,
new RegexReplacementRule( "^([A-Z].*s)$" , "$1es" ) ,
new SuffixInflectionRule( "-[cs]h" , "-h" , "-hes" ) ,
new SuffixInflectionRule( "-x" , "-x" , "-xes" ) ,
new SuffixInflectionRule( "-z" , "-z" , "-zes" ) ,
// Nouns ending with "-f" or "-fe"
// take "-ves" in the plural
// (such as "halves") [Step 9]
new SuffixInflectionRule( "-[aeo]lf" , "-f" , "-ves" ) ,
new SuffixInflectionRule( "-[^d]eaf" , "-f" , "-ves" ) ,
new SuffixInflectionRule( "-arf" , "-f" , "-ves" ) ,
new SuffixInflectionRule( "-[nlw]ife" , "-fe" , "-ves" ) ,
// Nouns ending with "-y" [Step 10]
new SuffixInflectionRule( "-[aeiou]y" , "-y" , "-ys" ) ,
new RegexReplacementRule( "^([A-Z].*y)$" , "$1s" ) ,
new SuffixInflectionRule( "-y" , "-y" , "-ies" ) ,
// Nouns ending with "-o" [Step 11]
new CategoryInflectionRule
(
CATEGORY_O_OS_RULE ,
"-o" ,
"-os"
) ,
new SuffixInflectionRule( "-[aeiou]o" , "-o" , "-os" ) ,
new SuffixInflectionRule( "-o" , "-o" , "-oes" ) ,
// Default rule: add "s" [Step 13]
new SuffixInflectionRule( "-" , "-s" ) ,
}
);
/** Create Engish pluralizer.
*/
public EnglishPluralizer()
{
setRules( rules );
setLocale( Locale.ENGLISH );
}
/** Fix case of pluralized word.
*
* @param trimmedWord The input word, with leading and trailing
* whitespace removed.
* @param pluralizedWord The pluralized word.
*
* @return The <code>pluralizedWord</code> after
* processing.
*
* <p>
* If <code>trimmedWord</code> is all uppercase, then
* <code>pluralizedWord</code> is uppercased.
* If <code>trimmedWord</code> is titlecase, then
* <code>pluralizedWord</code> is titlecased.
* </p>
*/
@Override
protected String postProcess
(
String trimmedWord ,
String pluralizedWord
)
{
if ( trimmedWord.matches( "^I$" ) )
{
return pluralizedWord;
}
return super.postProcess( trimmedWord , pluralizedWord );
}
}
/*
Copyright (c) 2008, 2009 by Northwestern University.
All rights reserved.
Developed by:
Academic and Research Technologies
Northwestern University
http://www.it.northwestern.edu/about/departments/at/
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal with the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimers.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimers in the documentation and/or other materials provided
with the distribution.
* Neither the names of Academic and Research Technologies,
Northwestern University, nor the names of its contributors may be
used to endorse or promote products derived from this Software
without specific prior written permission.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/