/** * Copyright (C) 2010-2017 Structr GmbH * * This file is part of Structr <http://structr.org>. * * Structr is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * Structr is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Structr. If not, see <http://www.gnu.org/licenses/>. */ package javatools.parsers; import java.io.BufferedReader; import java.io.InputStreamReader; import java.util.Map; import java.util.Set; import javatools.datatypes.FinalMap; import javatools.datatypes.FinalSet; /** This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). It is licensed under the Creative Commons Attribution License (see http://creativecommons.org/licenses/by/3.0) by the YAGO-NAGA team (see http://mpii.de/yago-naga). The PlingStemmer stems an English noun (plural or singular) to its singular form. It deals with "firemen"->"fireman", it knows Greek stuff like "appendices"->"appendix" and yes, it was a lot of work to compile these exceptions. Examples: <PRE> System.out.println(PlingStemmer.stem("boy")); ----> boy System.out.println(PlingStemmer.stem("boys")); ----> boy System.out.println(PlingStemmer.stem("biophysics")); ----> biophysics System.out.println(PlingStemmer.stem("automata")); ----> automaton System.out.println(PlingStemmer.stem("genus")); ----> genus System.out.println(PlingStemmer.stem("emus")); ----> emu </PRE><P> There are a number of word forms that can either be plural or singular. Examples include "physics" (the science or the plural of "physic" (the medicine)), "quarters" (the housing or the plural of "quarter" (1/4)) or "people" (the singular of "peoples" or the plural of "person"). In these cases, the stemmer assumes the word is a plural form and returns the singular form. The methods isPlural, isSingular and isPluralAndSingular can be used to differentiate the cases.<P> It cannot be guaranteed that the stemmer correctly stems a plural word or correctly ignores a singular word -- let alone that it treats an ambiguous word form in the way expected by the user.<P> The PlingStemmer uses material from <A HREF=http://wordnet.princeton.edu/>WordNet</A>.<P> It requires the class FinalSet from the <A HREF=http://www.mpii.mpg.de/~suchanek/downloads/javatools> Java Tools</A>. */ public class PlingStemmer { /** Tells whether a word form is plural. This method just checks whether the * stem method alters the word */ public static boolean isPlural(String s) { return(!s.equals(stem(s))); } /** Tells whether a word form is singular. Note that a word can be both plural and singular */ public static boolean isSingular(String s) { return(singAndPlur.contains(s.toLowerCase()) || !isPlural(s)); } /** Tells whether a word form is the singular form of one word and at * the same time the plural form of another.*/ public static boolean isSingularAndPlural(String s) { return(singAndPlur.contains(s.toLowerCase())); } /** Cuts a suffix from a string (that is the number of chars given by the suffix) */ public static String cut(String s, String suffix) { return(s.substring(0,s.length()-suffix.length())); } /** Returns true if a word is probably not Latin */ public static boolean noLatin(String s) { return(s.indexOf('h')>0 || s.indexOf('j')>0 || s.indexOf('k')>0 || s.indexOf('w')>0 || s.indexOf('y')>0 || s.indexOf('z')>0 || s.indexOf("ou")>0 || s.indexOf("sh")>0 || s.indexOf("ch")>0 || s.endsWith("aus")); } /** Returns true if a word is probably Greek */ private static boolean greek(String s) { return(s.indexOf("ph")>0 || s.indexOf('y')>0 && s.endsWith("nges")); } /** Stems an English noun */ public static String stem(String s) { String stem = s; // Handle irregular ones String irreg=irregular.get(s); if(irreg!=null) return(stem=irreg); // -on to -a if(categoryON_A.contains(s)) return(stem=cut(s,"a")+"on"); // -um to -a if(categoryUM_A.contains(s)) return(stem=cut(s,"a")+"um"); // -x to -ices if(categoryIX_ICES.contains(s)) return(stem=cut(s,"ices")+"ix"); // -o to -i if(categoryO_I.contains(s)) return(stem=cut(s,"i")+"o"); // -se to ses if(categorySE_SES.contains(s)) return(stem=cut(s,"s")); // -is to -es if(categoryIS_ES.contains(s) || s.endsWith("theses")) return(stem=cut(s,"es")+"is"); // -us to -i if(categoryUS_I.contains(s)) return(stem=cut(s,"i")+"us"); //Wrong plural if(s.endsWith("uses") && (categoryUS_I.contains(cut(s,"uses")+"i") || s.equals("genuses") || s.equals("corpuses"))) return(stem=cut(s,"es")); // -ex to -ices if(categoryEX_ICES.contains(s)) return(stem=cut(s,"ices")+"ex"); // Words that do not inflect in the plural if(s.endsWith("ois") || s.endsWith("itis") || category00.contains(s) || categoryICS.contains(s)) return(stem=s); // -en to -ina // No other common words end in -ina if(s.endsWith("ina")) return(stem=cut(s,"en")); // -a to -ae // No other common words end in -ae if(s.endsWith("ae")) return(stem=cut(s,"e")); // -a to -ata // No other common words end in -ata if(s.endsWith("ata")) return(stem=cut(s,"ta")); // trix to -trices // No common word ends with -trice(s) if(s.endsWith("trices")) return(stem=cut(s,"trices")+"trix"); // -us to -us //No other common word ends in -us, except for false plurals of French words //Catch words that are not latin or known to end in -u if(s.endsWith("us") && !s.endsWith("eaus") && !s.endsWith("ieus") && !noLatin(s) && !categoryU_US.contains(s)) return(stem=s); // -tooth to -teeth // -goose to -geese // -foot to -feet // -zoon to -zoa //No other common words end with the indicated suffixes if(s.endsWith("teeth")) return(stem=cut(s,"teeth")+"tooth"); if(s.endsWith("geese")) return(stem=cut(s,"geese")+"goose"); if(s.endsWith("feet")) return(stem=cut(s,"feet")+"foot"); if(s.endsWith("zoa")) return(stem=cut(s,"zoa")+"zoon"); // -eau to -eaux //No other common words end in eaux if(s.endsWith("eaux")) return(stem=cut(s,"x")); // -ieu to -ieux //No other common words end in ieux if(s.endsWith("ieux")) return(stem=cut(s,"x")); // -nx to -nges // Pay attention not to kill words ending in -nge with plural -nges // Take only Greek words (works fine, only a handfull of exceptions) if(s.endsWith("nges") && greek(s)) return(stem=cut(s,"nges")+"nx"); // -[sc]h to -[sc]hes //No other common word ends with "shes", "ches" or "she(s)" //Quite a lot end with "che(s)", filter them out if(s.endsWith("shes") || s.endsWith("ches") && !categoryCHE_CHES.contains(s)) return(stem=cut(s,"es")); // -ss to -sses // No other common singular word ends with "sses" // Filter out those ending in "sse(s)" if(s.endsWith("sses") && !categorySSE_SSES.contains(s) && !s.endsWith("mousses")) return(stem=cut(s,"es")); // -x to -xes // No other common word ends with "xe(s)" except for "axe" if(s.endsWith("xes") && !s.equals("axes")) return(stem=cut(s,"es")); // -[nlw]ife to -[nlw]ives //No other common word ends with "[nlw]ive(s)" except for olive if(s.endsWith("nives") || s.endsWith("lives") && !s.endsWith("olives") || s.endsWith("wives")) return(stem=cut(s,"ves")+"fe"); // -[aeo]lf to -ves exceptions: valve, solve // -[^d]eaf to -ves exceptions: heave, weave // -arf to -ves no exception if(s.endsWith("alves") && !s.endsWith("valves") || s.endsWith("olves") && !s.endsWith("solves") || s.endsWith("eaves") && !s.endsWith("heaves") && !s.endsWith("weaves") || s.endsWith("arves") ) return(stem=cut(s,"ves")+"f"); // -y to -ies // -ies is very uncommon as a singular suffix // but -ie is quite common, filter them out if(s.endsWith("ies") && !categoryIE_IES.contains(s)) return(stem=cut(s,"ies")+"y"); // -o to -oes // Some words end with -oe, so don't kill the "e" if(s.endsWith("oes") && !categoryOE_OES.contains(s)) return(stem=cut(s,"es")); // -s to -ses // -z to -zes // no words end with "-ses" or "-zes" in singular if(s.endsWith("ses") || s.endsWith("zes") ) return(stem=cut(s,"es")); // - to -s if(s.endsWith("s") && !s.endsWith("ss") && !s.endsWith("is")) return(stem=cut(s,"s")); return stem; } /** Words that end in "-se" in their plural forms (like "nurse" etc.)*/ public static Set<String> categorySE_SES=new FinalSet<String>( "nurses", "cruises", "premises", "houses", "courses", "cases" ); /** Words that do not have a distinct plural form (like "atlas" etc.)*/ public static Set<String> category00=new FinalSet<String>( "alias", "asbestos", "atlas", "barracks", "bathos", "bias", "breeches", "britches", "canvas", "chaos", "clippers", "contretemps", "corps", "cosmos", "crossroads", "diabetes", "ethos", "gallows", "gas", "graffiti", "headquarters", "herpes", "high-jinks", "innings", "jackanapes", "lens", "means", "measles", "mews", "mumps", "news", "pathos", "pincers", "pliers", "proceedings", "rabies", "rhinoceros", "sassafras", "scissors", "series", "shears", "species", "tuna" ); /** Words that change from "-um" to "-a" (like "curriculum" etc.), listed in their plural forms*/ public static Set<String> categoryUM_A=new FinalSet<String>( "addenda", "agenda", "aquaria", "bacteria", "candelabra", "compendia", "consortia", "crania", "curricula", "data", "desiderata", "dicta", "emporia", "enconia", "errata", "extrema", "gymnasia", "honoraria", "interregna", "lustra", "maxima", "media", "memoranda", "millenia", "minima", "momenta", "optima", "ova", "phyla", "quanta", "rostra", "spectra", "specula", "stadia", "strata", "symposia", "trapezia", "ultimata", "vacua", "vela" ); /** Words that change from "-on" to "-a" (like "phenomenon" etc.), listed in their plural forms*/ public static Set<String> categoryON_A=new FinalSet<String>( "aphelia", "asyndeta", "automata", "criteria", "hyperbata", "noumena", "organa", "perihelia", "phenomena", "prolegomena" ); /** Words that change from "-o" to "-i" (like "libretto" etc.), listed in their plural forms*/ public static Set<String> categoryO_I=new FinalSet<String>( "alti", "bassi", "canti", "contralti", "crescendi", "libretti", "soli", "soprani", "tempi", "virtuosi" ); /** Words that change from "-us" to "-i" (like "fungus" etc.), listed in their plural forms*/ public static Set<String> categoryUS_I=new FinalSet<String>( "alumni", "bacilli", "cacti", "foci", "fungi", "genii", "hippopotami", "incubi", "nimbi", "nuclei", "nucleoli", "octopi", "radii", "stimuli", "styli", "succubi", "syllabi", "termini", "tori", "umbilici", "uteri" ); /** Words that change from "-ix" to "-ices" (like "appendix" etc.), listed in their plural forms*/ public static Set<String> categoryIX_ICES=new FinalSet<String>( "appendices", "cervices" ); /** Words that change from "-is" to "-es" (like "axis" etc.), listed in their plural forms*/ public static Set<String> categoryIS_ES=new FinalSet<String>( // plus everybody ending in theses "analyses", "axes", "bases", "crises", "diagnoses", "ellipses", "emphases", "neuroses", "oases", "paralyses", "synopses" ); /** Words that change from "-oe" to "-oes" (like "toe" etc.), listed in their plural forms*/ public static Set<String> categoryOE_OES=new FinalSet<String>( "aloes", "backhoes", "beroes", "canoes", "chigoes", "cohoes", "does", "felloes", "floes", "foes", "gumshoes", "hammertoes", "hoes", "hoopoes", "horseshoes", "leucothoes", "mahoes", "mistletoes", "oboes", "overshoes", "pahoehoes", "pekoes", "roes", "shoes", "sloes", "snowshoes", "throes", "tic-tac-toes", "tick-tack-toes", "ticktacktoes", "tiptoes", "tit-tat-toes", "toes", "toetoes", "tuckahoes", "woes" ); /** Words that change from "-ex" to "-ices" (like "index" etc.), listed in their plural forms*/ public static Set<String> categoryEX_ICES=new FinalSet<String>( "apices", "codices", "cortices", "indices", "latices", "murices", "pontifices", "silices", "simplices", "vertices", "vortices" ); /** Words that change from "-u" to "-us" (like "emu" etc.), listed in their plural forms*/ public static Set<String> categoryU_US=new FinalSet<String>( "apercus", "barbus", "cornus", "ecrus", "emus", "fondus", "gnus", "iglus", "mus", "nandus", "napus", "poilus", "quipus", "snafus", "tabus", "tamandus", "tatus", "timucus", "tiramisus", "tofus", "tutus" ); /** Words that change from "-sse" to "-sses" (like "finesse" etc.), listed in their plural forms*/ public static Set<String> categorySSE_SSES=new FinalSet<String>( //plus those ending in mousse "bouillabaisses", "coulisses", "crevasses", "crosses", "cuisses", "demitasses", "ecrevisses", "fesses", "finesses", "fosses", "impasses", "lacrosses", "largesses", "masses", "noblesses", "palliasses", "pelisses", "politesses", "posses", "tasses", "wrasses" ); /** Words that change from "-che" to "-ches" (like "brioche" etc.), listed in their plural forms*/ public static Set<String> categoryCHE_CHES=new FinalSet<String>( "adrenarches", "attaches", "avalanches", "barouches", "brioches", "caches", "caleches", "caroches", "cartouches", "cliches", "cloches", "creches", "demarches", "douches", "gouaches", "guilloches", "headaches", "heartaches", "huaraches", "menarches", "microfiches", "moustaches", "mustaches", "niches", "panaches", "panoches", "pastiches", "penuches", "pinches", "postiches", "psyches", "quiches", "schottisches", "seiches", "soutaches", "synecdoches", "thelarches", "troches" ); /** Words that end with "-ics" and do not exist as nouns without the 's' (like "aerobics" etc.)*/ public static Set<String> categoryICS=new FinalSet<String>( "aerobatics", "aerobics", "aerodynamics", "aeromechanics", "aeronautics", "alphanumerics", "animatronics", "apologetics", "architectonics", "astrodynamics", "astronautics", "astrophysics", "athletics", "atmospherics", "autogenics", "avionics", "ballistics", "bibliotics", "bioethics", "biometrics", "bionics", "bionomics", "biophysics", "biosystematics", "cacogenics", "calisthenics", "callisthenics", "catoptrics", "civics", "cladistics", "cryogenics", "cryonics", "cryptanalytics", "cybernetics", "cytoarchitectonics", "cytogenetics", "diagnostics", "dietetics", "dramatics", "dysgenics", "econometrics", "economics", "electromagnetics", "electronics", "electrostatics", "endodontics", "enterics", "ergonomics", "eugenics", "eurhythmics", "eurythmics", "exodontics", "fibreoptics", "futuristics", "genetics", "genomics", "geographics", "geophysics", "geopolitics", "geriatrics", "glyptics", "graphics", "gymnastics", "hermeneutics", "histrionics", "homiletics", "hydraulics", "hydrodynamics", "hydrokinetics", "hydroponics", "hydrostatics", "hygienics", "informatics", "kinematics", "kinesthetics", "kinetics", "lexicostatistics", "linguistics", "lithoglyptics", "liturgics", "logistics", "macrobiotics", "macroeconomics", "magnetics", "magnetohydrodynamics", "mathematics", "metamathematics", "metaphysics", "microeconomics", "microelectronics", "mnemonics", "morphophonemics", "neuroethics", "neurolinguistics", "nucleonics", "numismatics", "obstetrics", "onomastics", "orthodontics", "orthopaedics", "orthopedics", "orthoptics", "paediatrics", "patristics", "patristics", "pedagogics", "pediatrics", "periodontics", "pharmaceutics", "pharmacogenetics", "pharmacokinetics", "phonemics", "phonetics", "phonics", "photomechanics", "physiatrics", "pneumatics", "poetics", "politics", "pragmatics", "prosthetics", "prosthodontics", "proteomics", "proxemics", "psycholinguistics", "psychometrics", "psychonomics", "psychophysics", "psychotherapeutics", "robotics", "semantics", "semiotics", "semitropics", "sociolinguistics", "stemmatics", "strategics", "subtropics", "systematics", "tectonics", "telerobotics", "therapeutics", "thermionics", "thermodynamics", "thermostatics" ); /** Words that change from "-ie" to "-ies" (like "auntie" etc.), listed in their plural forms*/ public static Set<String> categoryIE_IES=new FinalSet<String>( "aeries", "anomies", "aunties", "baddies", "beanies", "birdies", "boccies", "bogies", "bolshies", "bombies", "bonhomies", "bonxies", "booboisies", "boogies", "boogie-woogies", "bookies", "booties", "bosies", "bourgeoisies", "brasseries", "brassies", "brownies", "budgies", "byrnies", "caddies", "calories", "camaraderies", "capercaillies", "capercailzies", "cassies", "catties", "causeries", "charcuteries", "chinoiseries", "collies", "commies", "cookies", "coolies", "coonties", "cooties", "corries", "coteries", "cowpies", "cowries", "cozies", "crappies", "crossties", "curies", "dachsies", "darkies", "dassies", "dearies", "dickies", "dies", "dixies", "doggies", "dogies", "dominies", "dovekies", "eyries", "faeries", "falsies", "floozies", "folies", "foodies", "freebies", "gaucheries", "gendarmeries", "genies", "ghillies", "gillies", "goalies", "goonies", "grannies", "grotesqueries", "groupies", "hankies", "hippies", "hoagies", "honkies", "hymies", "indies", "junkies", "kelpies", "kilocalories", "knobkerries", "koppies", "kylies", "laddies", "lassies", "lies", "lingeries", "magpies", "magpies", "marqueteries", "mashies", "mealies", "meanies", "menageries", "millicuries", "mollies", "facts1", "moxies", "neckties", "newbies", "nighties", "nookies", "oldies", "organdies", "panties", "parqueteries", "passementeries", "patisseries", "pies", "pinkies", "pixies", "porkpies", "potpies", "prairies", "preemies", "premies", "punkies", "pyxies", "quickies", "ramies", "reveries", "rookies", "rotisseries", "scrapies", "sharpies", "smoothies", "softies", "stoolies", "stymies", "swaggies", "sweeties", "talkies", "techies", "ties", "tooshies", "toughies", "townies", "veggies", "walkie-talkies", "wedgies", "weenies", "weirdies", "yardies", "yuppies", "zombies" ); /** Maps irregular Germanic English plural nouns to their singular form */ public static Map<String,String> irregular=new FinalMap<String,String>( "beefs","beef", "beeves","beef", "brethren","brother", "busses","bus", "cattle","cattlebeast", "children","child", "corpora","corpus", "ephemerides","ephemeris", "firemen","fireman", "genera","genus", "genies","genie", "genii","genie", "kine","cow", "lice","louse", "men","man", "mice","mouse", "mongooses","mongoose", "monies","money", "mythoi","mythos", "octopodes","octopus", "octopuses","octopus", "oxen","ox", "people","person", "soliloquies","soliloquy", "throes","throes", "trilbys","trilby", "women","woman" ); /** Contains word forms that can either be plural or singular */ public static Set<String> singAndPlur=new FinalSet<String>( "acoustics", "aestetics", "aquatics", "basics", "ceramics", "classics", "cosmetics", "dermatoglyphics", "dialectics", "dynamics", "esthetics", "ethics", "harmonics", "heroics", "isometrics", "mechanics", "metrics", "statistics", "optic", "people", "physics", "polemics", "premises", "propaedeutics", "pyrotechnics", "quadratics", "quarters", "statistics", "tactics", "tropics" ); /** Test routine */ public static void main(String[] argv) throws Exception { System.out.println("Enter an English word in plural form and press ENTER"); BufferedReader in=new BufferedReader(new InputStreamReader(System.in)); while(true) { String w=in.readLine(); if(w.length()==0) break; if(isPlural(w)) System.out.println("This word is plural"); if(isSingular(w)) System.out.println("This word is singular"); System.out.println("Stemmed to singular: "+stem(w)); } } }