package org.wikipedia.miner.util.text;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.Map;
import java.util.Set;
import org.wikipedia.miner.util.text.yagoUtils.FinalMap;
import org.wikipedia.miner.util.text.yagoUtils.FinalSet;
/**
* This class is part of the Java Tools (see
* http://mpii.de/yago-naga/javatools). It is licensed under the Creative
* Commons Attribution License (see http://creativecommons.org/licenses/by/3.0)
* by the YAGO-NAGA team (see http://mpii.de/yago-naga).
*
*
* The PlingStemmer stems an English noun (plural or singular) to its singular
* form. It deals with "firemen"->"fireman", it knows Greek stuff like
* "appendices"->"appendix" and yes, it was a lot of work to compile these
* exceptions. Examples:
* <PRE>
* System.out.println(PlingStemmer.stem("boy"));
* ----> boy
* System.out.println(PlingStemmer.stem("boys"));
* ----> boy
* System.out.println(PlingStemmer.stem("biophysics"));
* ----> biophysics
* System.out.println(PlingStemmer.stem("automata"));
* ----> automaton
* System.out.println(PlingStemmer.stem("genus"));
* ----> genus
* System.out.println(PlingStemmer.stem("emus"));
* ----> emu
* </PRE><P>
*
* There are a number of word forms that can either be plural or singular.
* Examples include "physics" (the science or the plural of "physic" (the
* medicine)), "quarters" (the housing or the plural of "quarter" (1/4)) or
* "people" (the singular of "peoples" or the plural of "person"). In these
* cases, the stemmer assumes the word is a plural form and returns the singular
* form. The methods isPlural, isSingular and isPluralAndSingular can be used to
* differentiate the cases.<P>
*
* It cannot be guaranteed that the stemmer correctly stems a plural word or
* correctly ignores a singular word -- let alone that it treats an ambiguous
* word form in the way expected by the user.<P>
*
* The PlingStemmer uses material from <A
* HREF=http://wordnet.princeton.edu/>WordNet</A>.<P> It requires the class
* FinalSet from the <A
* HREF=http://www.mpii.mpg.de/~suchanek/downloads/javatools> Java Tools</A>.
*/
public class PlingStemmer {
/**
* Tells whether a word form is plural. This method just checks whether the
* stem method alters the word
*/
public boolean isPlural(String s) {
return (!s.equals(stem(s)));
}
/**
* Tells whether a word form is singular. Note that a word can be both
* plural and singular
*/
public boolean isSingular(String s) {
return (singAndPlur.contains(s.toLowerCase()) || !isPlural(s));
}
/**
* Tells whether a word form is the singular form of one word and at the
* same time the plural form of another.
*/
public boolean isSingularAndPlural(String s) {
return (singAndPlur.contains(s.toLowerCase()));
}
/**
* Cuts a suffix from a string (that is the number of chars given by the
* suffix)
*/
public String cut(String s, String suffix) {
return (s.substring(0, s.length() - suffix.length()));
}
/**
* Returns true if a word is probably not Latin
*/
public boolean noLatin(String s) {
return (s.indexOf('h') > 0 || s.indexOf('j') > 0 || s.indexOf('k') > 0
|| s.indexOf('w') > 0 || s.indexOf('y') > 0 || s.indexOf('z') > 0
|| s.indexOf("ou") > 0 || s.indexOf("sh") > 0 || s.indexOf("ch") > 0
|| s.endsWith("aus"));
}
/**
* Returns true if a word is probably Greek
*/
private boolean greek(String s) {
return (s.indexOf("ph") > 0 || s.indexOf('y') > 0 && s.endsWith("nges"));
}
/**
* Stems an English noun to its singular form
*/
public String stem(String s) {
String stem = s;
// Handle irregular ones
String irreg = irregular.get(s);
if (irreg != null) {
return (stem = irreg);
}
// -on to -a
if (categoryON_A.contains(s)) {
return (stem = cut(s, "a") + "on");
}
// -um to -a
if (categoryUM_A.contains(s)) {
return (stem = cut(s, "a") + "um");
}
// -x to -ices
if (categoryIX_ICES.contains(s)) {
return (stem = cut(s, "ices") + "ix");
}
// -o to -i
if (categoryO_I.contains(s)) {
return (stem = cut(s, "i") + "o");
}
// -se to ses
if (categorySE_SES.contains(s)) {
return (stem = cut(s, "s"));
}
// -is to -es
if (categoryIS_ES.contains(s) || s.endsWith("theses")) {
return (stem = cut(s, "es") + "is");
}
// -us to -i
if (categoryUS_I.contains(s)) {
return (stem = cut(s, "i") + "us");
}
//Wrong plural
if (s.endsWith("uses") && (categoryUS_I.contains(cut(s, "uses") + "i")
|| s.equals("genuses") || s.equals("corpuses"))) {
return (stem = cut(s, "es"));
}
// -ex to -ices
if (categoryEX_ICES.contains(s)) {
return (stem = cut(s, "ices") + "ex");
}
// Words that do not inflect in the plural
if (s.endsWith("ois") || s.endsWith("itis") || category00.contains(s) || categoryICS.contains(s)) {
return (stem = s);
}
// -en to -ina
// No other common words end in -ina
if (s.endsWith("ina")) {
return (stem = cut(s, "en"));
}
// -a to -ae
// No other common words end in -ae
if (s.endsWith("ae")) {
return (stem = cut(s, "e"));
}
// -a to -ata
// No other common words end in -ata
if (s.endsWith("ata")) {
return (stem = cut(s, "ta"));
}
// trix to -trices
// No common word ends with -trice(s)
if (s.endsWith("trices")) {
return (stem = cut(s, "trices") + "trix");
}
// -us to -us
//No other common word ends in -us, except for false plurals of French words
//Catch words that are not latin or known to end in -u
if (s.endsWith("us") && !s.endsWith("eaus") && !s.endsWith("ieus") && !noLatin(s)
&& !categoryU_US.contains(s)) {
return (stem = s);
}
// -tooth to -teeth
// -goose to -geese
// -foot to -feet
// -zoon to -zoa
//No other common words end with the indicated suffixes
if (s.endsWith("teeth")) {
return (stem = cut(s, "teeth") + "tooth");
}
if (s.endsWith("geese")) {
return (stem = cut(s, "geese") + "goose");
}
if (s.endsWith("feet")) {
return (stem = cut(s, "feet") + "foot");
}
if (s.endsWith("zoa")) {
return (stem = cut(s, "zoa") + "zoon");
}
// -eau to -eaux
//No other common words end in eaux
if (s.endsWith("eaux")) {
return (stem = cut(s, "x"));
}
// -ieu to -ieux
//No other common words end in ieux
if (s.endsWith("ieux")) {
return (stem = cut(s, "x"));
}
// -nx to -nges
// Pay attention not to kill words ending in -nge with plural -nges
// Take only Greek words (works fine, only a handfull of exceptions)
if (s.endsWith("nges") && greek(s)) {
return (stem = cut(s, "nges") + "nx");
}
// -[sc]h to -[sc]hes
//No other common word ends with "shes", "ches" or "she(s)"
//Quite a lot end with "che(s)", filter them out
if (s.endsWith("shes") || s.endsWith("ches") && !categoryCHE_CHES.contains(s)) {
return (stem = cut(s, "es"));
}
// -ss to -sses
// No other common singular word ends with "sses"
// Filter out those ending in "sse(s)"
if (s.endsWith("sses") && !categorySSE_SSES.contains(s) && !s.endsWith("mousses")) {
return (stem = cut(s, "es"));
}
// -x to -xes
// No other common word ends with "xe(s)" except for "axe"
if (s.endsWith("xes") && !s.equals("axes")) {
return (stem = cut(s, "es"));
}
// -[nlw]ife to -[nlw]ives
//No other common word ends with "[nlw]ive(s)" except for olive
if (s.endsWith("nives") || s.endsWith("lives") && !s.endsWith("olives")
|| s.endsWith("wives")) {
return (stem = cut(s, "ves") + "fe");
}
// -[aeo]lf to -ves exceptions: valve, solve
// -[^d]eaf to -ves exceptions: heave, weave
// -arf to -ves no exception
if (s.endsWith("alves") && !s.endsWith("valves")
|| s.endsWith("olves") && !s.endsWith("solves")
|| s.endsWith("eaves") && !s.endsWith("heaves") && !s.endsWith("weaves")
|| s.endsWith("arves")) {
return (stem = cut(s, "ves") + "f");
}
// -y to -ies
// -ies is very uncommon as a singular suffix
// but -ie is quite common, filter them out
if (s.endsWith("ies") && !categoryIE_IES.contains(s)) {
return (stem = cut(s, "ies") + "y");
}
// -o to -oes
// Some words end with -oe, so don't kill the "e"
if (s.endsWith("oes") && !categoryOE_OES.contains(s)) {
return (stem = cut(s, "es"));
}
// -s to -ses
// -z to -zes
// no words end with "-ses" or "-zes" in singular
if (s.endsWith("ses") || s.endsWith("zes")) {
return (stem = cut(s, "es"));
}
// - to -s
if (s.endsWith("s") && !s.endsWith("ss") && !s.endsWith("is")) {
return (stem = cut(s, "s"));
}
return stem;
}
/**
* Words that end in "-se" in their plural forms (like "nurse" etc.)
*/
public Set<String> categorySE_SES = new FinalSet<String>(
"nurses",
"cruises",
"premises",
"houses");
/**
* Words that do not have a distinct plural form (like "atlas" etc.)
*/
public Set<String> category00 = new FinalSet<String>(
"alias",
"asbestos",
"atlas",
"barracks",
"bathos",
"bias",
"breeches",
"britches",
"canvas",
"chaos",
"clippers",
"contretemps",
"corps",
"cosmos",
"crossroads",
"diabetes",
"ethos",
"gallows",
"gas",
"graffiti",
"headquarters",
"herpes",
"high-jinks",
"innings",
"jackanapes",
"lens",
"means",
"measles",
"mews",
"mumps",
"news",
"pathos",
"pincers",
"pliers",
"proceedings",
"rabies",
"rhinoceros",
"sassafras",
"scissors",
"series",
"shears",
"species",
"tuna");
/**
* Words that change from "-um" to "-a" (like "curriculum" etc.), listed in
* their plural forms
*/
public Set<String> categoryUM_A = new FinalSet<String>(
"addenda",
"agenda",
"aquaria",
"bacteria",
"candelabra",
"compendia",
"consortia",
"crania",
"curricula",
"data",
"desiderata",
"dicta",
"emporia",
"enconia",
"errata",
"extrema",
"gymnasia",
"honoraria",
"interregna",
"lustra",
"maxima",
"media",
"memoranda",
"millenia",
"minima",
"momenta",
"optima",
"ova",
"phyla",
"quanta",
"rostra",
"spectra",
"specula",
"stadia",
"strata",
"symposia",
"trapezia",
"ultimata",
"vacua",
"vela");
/**
* Words that change from "-on" to "-a" (like "phenomenon" etc.), listed in
* their plural forms
*/
public Set<String> categoryON_A = new FinalSet<String>(
"aphelia",
"asyndeta",
"automata",
"criteria",
"hyperbata",
"noumena",
"organa",
"perihelia",
"phenomena",
"prolegomena");
/**
* Words that change from "-o" to "-i" (like "libretto" etc.), listed in
* their plural forms
*/
public Set<String> categoryO_I = new FinalSet<String>(
"alti",
"bassi",
"canti",
"contralti",
"crescendi",
"libretti",
"soli",
"soprani",
"tempi",
"virtuosi");
/**
* Words that change from "-us" to "-i" (like "fungus" etc.), listed in
* their plural forms
*/
public Set<String> categoryUS_I = new FinalSet<String>(
"alumni",
"bacilli",
"cacti",
"foci",
"fungi",
"genii",
"hippopotami",
"incubi",
"nimbi",
"nuclei",
"nucleoli",
"octopi",
"radii",
"stimuli",
"styli",
"succubi",
"syllabi",
"termini",
"tori",
"umbilici",
"uteri");
/**
* Words that change from "-ix" to "-ices" (like "appendix" etc.), listed in
* their plural forms
*/
public Set<String> categoryIX_ICES = new FinalSet<String>(
"appendices",
"cervices");
/**
* Words that change from "-is" to "-es" (like "axis" etc.), listed in their
* plural forms
*/
public Set<String> categoryIS_ES = new FinalSet<String>(
// plus everybody ending in theses
"analyses",
"axes",
"bases",
"crises",
"diagnoses",
"ellipses",
"emphases",
"neuroses",
"oases",
"paralyses",
"synopses");
/**
* Words that change from "-oe" to "-oes" (like "toe" etc.), listed in their
* plural forms
*/
public Set<String> categoryOE_OES = new FinalSet<String>(
"aloes",
"backhoes",
"beroes",
"canoes",
"chigoes",
"cohoes",
"does",
"felloes",
"floes",
"foes",
"gumshoes",
"hammertoes",
"hoes",
"hoopoes",
"horseshoes",
"leucothoes",
"mahoes",
"mistletoes",
"oboes",
"overshoes",
"pahoehoes",
"pekoes",
"roes",
"shoes",
"sloes",
"snowshoes",
"throes",
"tic-tac-toes",
"tick-tack-toes",
"ticktacktoes",
"tiptoes",
"tit-tat-toes",
"toes",
"toetoes",
"tuckahoes",
"woes");
/**
* Words that change from "-ex" to "-ices" (like "index" etc.), listed in
* their plural forms
*/
public Set<String> categoryEX_ICES = new FinalSet<String>(
"apices",
"codices",
"cortices",
"indices",
"latices",
"murices",
"pontifices",
"silices",
"simplices",
"vertices",
"vortices");
/**
* Words that change from "-u" to "-us" (like "emu" etc.), listed in their
* plural forms
*/
public Set<String> categoryU_US = new FinalSet<String>(
"apercus",
"barbus",
"cornus",
"ecrus",
"emus",
"fondus",
"gnus",
"iglus",
"mus",
"nandus",
"napus",
"poilus",
"quipus",
"snafus",
"tabus",
"tamandus",
"tatus",
"timucus",
"tiramisus",
"tofus",
"tutus");
/**
* Words that change from "-sse" to "-sses" (like "finesse" etc.), listed in
* their plural forms
*/
public Set<String> categorySSE_SSES = new FinalSet<String>(
//plus those ending in mousse
"bouillabaisses",
"coulisses",
"crevasses",
"crosses",
"cuisses",
"demitasses",
"ecrevisses",
"fesses",
"finesses",
"fosses",
"impasses",
"lacrosses",
"largesses",
"masses",
"noblesses",
"palliasses",
"pelisses",
"politesses",
"posses",
"tasses",
"wrasses");
/**
* Words that change from "-che" to "-ches" (like "brioche" etc.), listed in
* their plural forms
*/
public Set<String> categoryCHE_CHES = new FinalSet<String>(
"adrenarches",
"attaches",
"avalanches",
"barouches",
"brioches",
"caches",
"caleches",
"caroches",
"cartouches",
"cliches",
"cloches",
"creches",
"demarches",
"douches",
"gouaches",
"guilloches",
"headaches",
"heartaches",
"huaraches",
"menarches",
"microfiches",
"moustaches",
"mustaches",
"niches",
"panaches",
"panoches",
"pastiches",
"penuches",
"pinches",
"postiches",
"psyches",
"quiches",
"schottisches",
"seiches",
"soutaches",
"synecdoches",
"thelarches",
"troches");
/**
* Words that end with "-ics" and do not exist as nouns without the 's'
* (like "aerobics" etc.)
*/
public Set<String> categoryICS = new FinalSet<String>(
"aerobatics",
"aerobics",
"aerodynamics",
"aeromechanics",
"aeronautics",
"alphanumerics",
"animatronics",
"apologetics",
"architectonics",
"astrodynamics",
"astronautics",
"astrophysics",
"athletics",
"atmospherics",
"autogenics",
"avionics",
"ballistics",
"bibliotics",
"bioethics",
"biometrics",
"bionics",
"bionomics",
"biophysics",
"biosystematics",
"cacogenics",
"calisthenics",
"callisthenics",
"catoptrics",
"civics",
"cladistics",
"cryogenics",
"cryonics",
"cryptanalytics",
"cybernetics",
"cytoarchitectonics",
"cytogenetics",
"diagnostics",
"dietetics",
"dramatics",
"dysgenics",
"econometrics",
"economics",
"electromagnetics",
"electronics",
"electrostatics",
"endodontics",
"enterics",
"ergonomics",
"eugenics",
"eurhythmics",
"eurythmics",
"exodontics",
"fibreoptics",
"futuristics",
"genetics",
"genomics",
"geographics",
"geophysics",
"geopolitics",
"geriatrics",
"glyptics",
"graphics",
"gymnastics",
"hermeneutics",
"histrionics",
"homiletics",
"hydraulics",
"hydrodynamics",
"hydrokinetics",
"hydroponics",
"hydrostatics",
"hygienics",
"informatics",
"kinematics",
"kinesthetics",
"kinetics",
"lexicostatistics",
"linguistics",
"lithoglyptics",
"liturgics",
"logistics",
"macrobiotics",
"macroeconomics",
"magnetics",
"magnetohydrodynamics",
"mathematics",
"metamathematics",
"metaphysics",
"microeconomics",
"microelectronics",
"mnemonics",
"morphophonemics",
"neuroethics",
"neurolinguistics",
"nucleonics",
"numismatics",
"obstetrics",
"onomastics",
"orthodontics",
"orthopaedics",
"orthopedics",
"orthoptics",
"paediatrics",
"patristics",
"patristics",
"pedagogics",
"pediatrics",
"periodontics",
"pharmaceutics",
"pharmacogenetics",
"pharmacokinetics",
"phonemics",
"phonetics",
"phonics",
"photomechanics",
"physiatrics",
"pneumatics",
"poetics",
"politics",
"pragmatics",
"prosthetics",
"prosthodontics",
"proteomics",
"proxemics",
"psycholinguistics",
"psychometrics",
"psychonomics",
"psychophysics",
"psychotherapeutics",
"robotics",
"semantics",
"semiotics",
"semitropics",
"sociolinguistics",
"stemmatics",
"strategics",
"subtropics",
"systematics",
"tectonics",
"telerobotics",
"therapeutics",
"thermionics",
"thermodynamics",
"thermostatics");
/**
* Words that change from "-ie" to "-ies" (like "auntie" etc.), listed in
* their plural forms
*/
public Set<String> categoryIE_IES = new FinalSet<String>(
"aeries",
"anomies",
"aunties",
"baddies",
"beanies",
"birdies",
"boccies",
"bogies",
"bolshies",
"bombies",
"bonhomies",
"bonxies",
"booboisies",
"boogies",
"boogie-woogies",
"bookies",
"booties",
"bosies",
"bourgeoisies",
"brasseries",
"brassies",
"brownies",
"budgies",
"byrnies",
"caddies",
"calories",
"camaraderies",
"capercaillies",
"capercailzies",
"cassies",
"catties",
"causeries",
"charcuteries",
"chinoiseries",
"collies",
"commies",
"cookies",
"coolies",
"coonties",
"cooties",
"corries",
"coteries",
"cowpies",
"cowries",
"cozies",
"crappies",
"crossties",
"curies",
"dachsies",
"darkies",
"dassies",
"dearies",
"dickies",
"dies",
"dixies",
"doggies",
"dogies",
"dominies",
"dovekies",
"eyries",
"faeries",
"falsies",
"floozies",
"folies",
"foodies",
"freebies",
"gaucheries",
"gendarmeries",
"genies",
"ghillies",
"gillies",
"goalies",
"goonies",
"grannies",
"grotesqueries",
"groupies",
"hankies",
"hippies",
"hoagies",
"honkies",
"hymies",
"indies",
"junkies",
"kelpies",
"kilocalories",
"knobkerries",
"koppies",
"kylies",
"laddies",
"lassies",
"lies",
"lingeries",
"magpies",
"magpies",
"marqueteries",
"mashies",
"mealies",
"meanies",
"menageries",
"millicuries",
"mollies",
"facts1",
"moxies",
"neckties",
"newbies",
"nighties",
"nookies",
"oldies",
"organdies",
"panties",
"parqueteries",
"passementeries",
"patisseries",
"pies",
"pinkies",
"pixies",
"porkpies",
"potpies",
"prairies",
"preemies",
"premies",
"punkies",
"pyxies",
"quickies",
"ramies",
"reveries",
"rookies",
"rotisseries",
"scrapies",
"sharpies",
"smoothies",
"softies",
"stoolies",
"stymies",
"swaggies",
"sweeties",
"talkies",
"techies",
"ties",
"tooshies",
"toughies",
"townies",
"veggies",
"walkie-talkies",
"wedgies",
"weenies",
"weirdies",
"yardies",
"yuppies",
"zombies");
/**
* Maps irregular Germanic English plural nouns to their singular form
*/
public Map<String, String> irregular = new FinalMap<String, String>(
"beefs", "beef",
"beeves", "beef",
"brethren", "brother",
"busses", "bus",
"cattle", "cattlebeast",
"children", "child",
"corpora", "corpus",
"ephemerides", "ephemeris",
"firemen", "fireman",
"genera", "genus",
"genies", "genie",
"genii", "genie",
"kine", "cow",
"lice", "louse",
"men", "man",
"mice", "mouse",
"mongooses", "mongoose",
"monies", "money",
"mythoi", "mythos",
"octopodes", "octopus",
"octopuses", "octopus",
"oxen", "ox",
"people", "person",
"soliloquies", "soliloquy",
"throes", "throes",
"trilbys", "trilby",
"women", "woman");
/**
* Contains word forms that can either be plural or singular
*/
public Set<String> singAndPlur = new FinalSet<String>(
"acoustics",
"aestetics",
"aquatics",
"basics",
"ceramics",
"classics",
"cosmetics",
"dermatoglyphics",
"dialectics",
"dynamics",
"esthetics",
"ethics",
"harmonics",
"heroics",
"isometrics",
"mechanics",
"metrics",
"statistics",
"optic",
"people",
"physics",
"polemics",
"premises",
"propaedeutics",
"pyrotechnics",
"quadratics",
"quarters",
"statistics",
"tactics",
"tropics");
/**
* Test routine
*/
// public static void main(String[] argv) throws Exception {
// System.out.println("Enter an English word in plural form and press ENTER");
// BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
// PlingStemmer stemmer = new PlingStemmer();
// while (true) {
// String w = in.readLine();
// if (w.length() == 0) {
// break;
// }
// if (stemmer.isPlural(w)) {
// System.out.println("This word is plural");
// }
// if (stemmer.isSingular(w)) {
// System.out.println("This word is singular");
// }
// System.out.println("Stemmed to singular: " + stemmer.stem(w));
// }
// }
}