/* * SimpleStemmer.java * Copyright (C) 2007 Olena Medelyan, olena@cs.waikato.ac.nz * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.wikipedia.miner.util.text; /** * A basic stemmer that only performs the first step of the * PorterStemmer algorithm: removing of the plural endings. * @author olena * */ public class SimpleStemmer extends TextProcessor{ Cleaner cleaner = new Cleaner() ; /** * Returns a copy of the argument text, where each term within it is stemmed and cleaned. * * @param text the text to be processed. * @return the processed version of this text. */ public String processText(String text) { String processedText = "" ; String[] terms = cleaner.processText(text).split(" ") ; for (String term: terms) { if (!"".equals(term)) processedText = processedText + " " + stem(term) ; } return processedText.trim() ; } private String stem(String str) { // check for zero length if (str.length() > 3) { // all characters must be letters char[] c = str.toCharArray(); for (int i = 0; i < c.length; i++) { if (!Character.isLetter(c[i])) { return str.toLowerCase(); } } } else { return str.toLowerCase(); } str = step1a(str); return str.toLowerCase(); } private String step1a (String str) { // SSES -> SS if (str.endsWith("sses")) { return str.substring(0, str.length() - 2); // IES -> x (families -> famil) } else if (str.endsWith("ies")) { return str.substring(0, str.length() - 3).concat("y"); // SS -> S } else if (str.endsWith("ss")) { return str; // S -> } else if (str.endsWith("s")) { return str.substring(0, str.length() - 1); } else { return str; } } }