package edu.umd.rhsmith.diads.tools.tfidf;

import java.util.Locale;
import java.util.Objects;
import java.util.regex.Pattern;

/**
 * <p>
 * A default implementation of {@link TermCleaner}. Cleaned text is converted
 * to lowercase with every character removed that is not an ASCII letter, an
 * ASCII digit, an underscore or an ASCII whitespace character, so whitespace
 * boundaries between words are preserved. The cleaning operation used is also
 * available as a static method via {@link #cleanText(String)}.
 * </p>
 *
 * @author rmachedo
 *
 */
public class DefaultTermCleaner implements TermCleaner {

	/**
	 * Matches every character that is neither a word character
	 * ({@code [a-zA-Z_0-9]}) nor ASCII whitespace. Digits are already part of
	 * the word-character class, so no separate digit class is needed.
	 * Compiled once because {@link String#replaceAll(String, String)} would
	 * recompile the expression on every call.
	 */
	private static final Pattern DISALLOWED_CHARACTERS = Pattern.compile("[^\\w\\s]");

	/**
	 * Cleans the given text by delegating to {@link #cleanText(String)}.
	 *
	 * @param analysisText
	 *            the text to clean; must not be {@code null}
	 * @return the cleaned text
	 * @throws NullPointerException
	 *             if {@code analysisText} is {@code null}
	 */
	@Override
	public String clean(String analysisText) {
		return cleanText(analysisText);
	}

	/**
	 * Return the given text converted to lowercase with all characters
	 * removed except ASCII letters, ASCII digits, underscores and ASCII
	 * whitespace, preserving whitespace boundaries between words.
	 * <p>
	 * Note that the underscore is kept (it belongs to the regex word-character
	 * class), and that non-ASCII letters, digits and whitespace are removed
	 * because the character classes are used in their default ASCII-only mode.
	 * </p>
	 *
	 * @param analysisText
	 *            the text to clean; must not be {@code null}
	 * @return the cleaned, lowercased text; empty if nothing survives cleaning
	 * @throws NullPointerException
	 *             if {@code analysisText} is {@code null}
	 */
	public static String cleanText(String analysisText) {
		Objects.requireNonNull(analysisText, "analysisText");
		String stripped = DISALLOWED_CHARACTERS.matcher(analysisText).replaceAll("");
		// Locale.ROOT keeps the result independent of the JVM default locale;
		// e.g. under a Turkish default locale a plain toLowerCase() maps 'I'
		// to dotless i (U+0131), producing terms that differ between machines.
		return stripped.toLowerCase(Locale.ROOT);
	}
}