package edu.stanford.nlp.process; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import edu.stanford.nlp.ling.Word; import edu.stanford.nlp.ling.WordLemmaTag; import edu.stanford.nlp.ling.WordTag; import edu.stanford.nlp.util.Function; import edu.stanford.nlp.util.StringUtils; /** * Morphology computes the base form of English words, by removing just * inflections (not derivational morphology). That is, it only does noun * plurals, pronoun case, and verb endings, and not things like comparative adjectives * or derived nominals. It is based on a finite-state * transducer implemented by John Carroll et al., written in flex and publicly * available. * See: http://www.informatics.susx.ac.uk/research/nlp/carroll/morph.html . * There are several ways of invoking Morphology. One is by calling the static * methods * WordTag stemStatic(String word, String tag) or * WordTag stemStatic(WordTag wordTag). * If we have created a Morphology object already we can use the methods * WordTag stem(String word, string tag) or WordTag stem(WordTag wordTag). * <p/> * Another way of using Morphology is to run it on an input file by running * <code>java Morphology filename</code>. In this case, POS tags must be * separated from words by an underscore ("_"). * * @author Kristina Toutanova (kristina@cs.stanford.edu) * @author Christopher Manning */ public class Morphology implements Function { // todo: The main method of this class no longer works. If the tag separator isn't _, it errors, if it is, it doesn't correctly use the POS to do lemmatization private static final boolean DEBUG = false; private Morpha lexer; private static Morpha staticLexer; public Morphology() { lexer = new Morpha(System.in); } /** * Process morphologically words from a Reader. * * @param in The Reader to read from */ public Morphology(Reader in) { lexer = new Morpha(in); } public Morphology(String filename) { try { lexer = new Morpha(new FileReader(filename)); } catch (Exception e) { e.printStackTrace(); } } public Word next() throws IOException { String nx = lexer.next(); if (nx == null) { return null; } else { return new Word(nx); } } static boolean isProper(String posTag) { return posTag.equals("NNP") || posTag.equals("NNPS") || posTag.equals("NP"); } public Word stem(Word w) { try { lexer.yyreset(new StringReader(w.value())); lexer.yybegin(Morpha.any); String wordRes = lexer.next(); return new Word(wordRes); } catch (Exception e) { e.printStackTrace(); } return w; } public String stem(String word) { try { lexer.yyreset(new StringReader(word)); lexer.yybegin(Morpha.any); String wordRes = lexer.next(); return wordRes; } catch (Exception e) { e.printStackTrace(); } return word; } public WordTag stem(WordTag wT) { return stem(wT.word(), wT.tag()); } public WordTag stem(String word, String tag) { return stem(word, tag, lexer, lexer.option(1)); } /** Lemmatize the word, being sensitive to the tag, using the * passed in lexer. * * @param lowercase If this is true, words other than proper nouns will * be changed to all lowercase. */ // XXX why does this return a WordTag, and not just a String? public static WordTag stem(String word, String tag, Morpha lexer, boolean lowercase) { boolean wordHasForbiddenChar = word.indexOf('_') >= 0 || word.indexOf(' ') >= 0; String quotedWord = word; if (wordHasForbiddenChar) { try { // choose something unlikely. Devangari! quotedWord = quotedWord.replaceAll("_", "\u0960"); quotedWord = quotedWord.replaceAll(" ", "\u0961"); } catch (Exception e) { System.err.println("stem: Didn't work"); } } String wordtag = quotedWord + "_" + tag; if (DEBUG) System.err.println("Trying to normalize |" + wordtag + "|"); try { lexer.setOption(1, lowercase); lexer.yyreset(new StringReader(wordtag)); lexer.yybegin(Morpha.scan); String wordRes = lexer.next(); lexer.next(); // go past tag if (wordHasForbiddenChar) { try { if (DEBUG) System.err.println("Restoring forbidden chars"); wordRes = wordRes.replaceAll("\u0960", "_"); wordRes = wordRes.replaceAll("\u0961", " "); } catch (Exception e) { System.err.println("stem: Didn't work"); } } return new WordTag(wordRes, tag); } catch (Throwable e) { System.err.println("Morphology.stem() had error on word " + word + "/" + tag); if (DEBUG) e.printStackTrace(); return new WordTag(word, tag); } } private static synchronized void initStaticLexer() { if (staticLexer == null) { staticLexer = new Morpha(System.in); } } /** Return a new WordTag which has the lemma as the value of word(). * The default is to lowercase non-proper-nouns, unless options have * been set. */ public static WordTag stemStatic(String word, String tag) { initStaticLexer(); return stem(word, tag, staticLexer, staticLexer.option(1)); } public static WordTag stemStatic(String word, String tag, boolean lowercase) { initStaticLexer(); return stem(word, tag, staticLexer, lowercase); } public synchronized static WordTag stemStaticSynchronized(String word, String tag) { return stemStatic(word, tag); } public synchronized static WordTag stemStaticSynchronized(String word, String tag, boolean lowercase) { return stemStatic(word, tag, lowercase); } /** Return a new WordTag which has the lemma as the value of word(). * The default is to lowercase non-proper-nouns, unless options have * been set. */ public static WordTag stemStatic(WordTag wT) { return stemStatic(wT.word(), wT.tag()); } public Object apply(Object in) { if (in instanceof WordTag) { return stem((WordTag) in); } if (in instanceof Word) { return stem((Word) in); } return in; } /** * Lemmatize returning a <code>WordLemmaTag </code>. */ public WordLemmaTag lemmatize(WordTag wT) { String tag = wT.tag(); String word = wT.word(); String lemma = stem(wT).word(); return new WordLemmaTag(word, lemma, tag); } public static WordLemmaTag lemmatizeStatic(WordTag wT) { String tag = wT.tag(); String word = wT.word(); String lemma = stemStatic(wT).word(); return new WordLemmaTag(word, lemma, tag); } /** Run the morphological analyzer. Options are: * <ul> * <li>-rebuildVerbTable verbTableFile Convert a verb table from a text file * (e.g., /u/nlp/data/morph/verbstem.list) to Java code contained in Morpha.flex . * <li>-stem args ... Stem each of the following arguments, which should either be * in the form of just word or word/tag. * <li> args ... Each argument is a file and the contents of it are stemmed as * space-separated tokens. <i>Note:</i> If the tokens are tagged * words, they must be in the format of whitespace separated word_tag pairs. */ public static void main(String[] args) throws IOException { if (args.length == 0) { System.err.println("java Morphology [-rebuildVerbTable file|-stem word+|file+]"); } else if (args.length == 2 && args[0].equals("-rebuildVerbTable")) { String verbs = StringUtils.slurpFile(args[1]); String[] words = verbs.split("\\s+"); System.out.print(" private static String[] verbStems = new String[] { "); for (int i = 0; i < words.length; i++) { System.out.print("\"" + words[i] + "\""); if (i != words.length - 1) { System.out.print(", "); if (i % 5 == 0) { System.out.println(); System.out.print(" "); } } } System.out.println(" };"); } else if (args[0].equals("-stem")) { for (int i = 1; i < args.length; i++) { System.out.println(args[i] + " --> " + stemStatic(WordTag.valueOf(args[i]))); } } else { for (String arg : args) { Morphology morph = new Morphology(arg); for (Word next; (next = morph.next()) != null; ) { System.out.print(next); System.out.print(" "); } } } } }