package weka.datagenerators;

import weka.core.Option;
import weka.core.Utils;
import java.util.ArrayList;
import java.util.Collection;

/**
 * Tosses tokens that are outside a certain range of length.
 *
 * <p><b>WEKA options:</b>
 * <ul>
 *   <li><code>-N &lt;num&gt;</code> - Specifies the minimum length.
 *       This parameter has no default value.
 *
 *   <li><code>-X &lt;num&gt;</code> - Specifies the maximum length.
 *       This parameter has no default value.
 * </ul>
 *
 * <p>It is a fatal error to leave both of the parameters unspecified.
 *
 * @author ywwong
 * @version $Id: WordLengthFilter.java,v 1.1.1.1 2003/01/22 07:48:27 mbilenko Exp $
 */
class WordLengthFilter implements TokenFilter {

    /** Internal marker meaning "this bound was not given on the command line". */
    private static final int UNSET = -1;

    /**
     * Words that are shorter than this will be ignored.
     * A negative value means that no minimum is enforced.
     */
    protected int m_nMin;

    /**
     * Words that are longer than this will be ignored.
     * A negative value means that no maximum is enforced.
     */
    protected int m_nMax;

    ////// WEKA specific. //////

    /** The option string for minimum length (empty if not given). */
    protected String m_strMin;

    /** The option string for maximum length (empty if not given). */
    protected String m_strMax;

    ////// Ends WEKA specific. //////

    /**
     * Creates a word length filter from the <code>-N</code> and
     * <code>-X</code> entries of the given option array.
     *
     * @param ts      The TextSource object (not used by this filter).
     * @param options The WEKA option array from which <code>-N</code>
     *                (minimum length) and <code>-X</code> (maximum length)
     *                are read.
     * @throws Exception if a given length is not a non-negative integer,
     *                   if neither length is given, or if the minimum is
     *                   greater than the maximum.
     */
    public WordLengthFilter(TextSource ts, String[] options)
        throws Exception {

        ////// WEKA specific. //////
        m_strMin = Utils.getOption('N', options);
        int nMin = parseLength(m_strMin, "Invalid minimum width (-N).");

        m_strMax = Utils.getOption('X', options);
        int nMax = parseLength(m_strMax, "Invalid maximum width (-X).");
        ////// Ends WEKA specific. //////

        // At least one of the two bounds has to be present.
        if (nMin < 0 && nMax < 0)
            throw new Exception("Neither widths are set (-X or -N).");

        // When both bounds are present they must describe a non-empty range.
        if (nMin >= 0 && nMax >= 0 && nMin > nMax)
            throw new Exception("Invalid range (-X and -N).");

        m_nMin = nMin;
        m_nMax = nMax;
    }

    /**
     * Converts an option string into a length bound.
     *
     * @param strValue The option string; an empty string is treated as
     *                 "not set".
     * @param strError The message of the exception raised for bad input.
     * @return The parsed non-negative length, or a negative value if
     *         <code>strValue</code> is empty.
     * @throws Exception if <code>strValue</code> is non-empty and is not a
     *                   non-negative integer.
     */
    private static int parseLength(String strValue, String strError)
        throws Exception {

        if (strValue.length() == 0)
            return UNSET;

        int nValue;
        try {
            nValue = Integer.parseInt(strValue);
        } catch (NumberFormatException e) {
            // Integer parsing signals malformed input by throwing, never by
            // returning null; report it with the option-specific message
            // and keep the original failure as the cause.
            throw new Exception(strError, e);
        }

        if (nValue < 0)
            throw new Exception(strError);
        return nValue;
    }

    /**
     * Tosses tokens whose length is outside the configured range, i.e.
     * shorter than the minimum length or longer than the maximum length.
     * A bound that was not set is not enforced.
     *
     * @param strToken The input token
     * @return The input token; <code>null</code> if the length of the
     *         input token is out of range, or if the input token itself
     *         is <code>null</code>
     */
    public String apply(String strToken) {
        // A missing token is reported the same way as a rejected one
        // instead of failing with a NullPointerException.
        if (strToken == null)
            return null;

        int nLen = strToken.length();
        boolean bTooShort = m_nMin >= 0 && nLen < m_nMin;
        boolean bTooLong = m_nMax >= 0 && nLen > m_nMax;
        return (bTooShort || bTooLong) ? null : strToken;
    }

    ////// WEKA specific. //////

    /**
     * Describes the options understood by this filter.
     *
     * @return A collection of {@link Option} objects, one for
     *         <code>-N</code> and one for <code>-X</code>.
     */
    public static Collection listOptions() {
        ArrayList aOpts = new ArrayList();
        aOpts.add(new Option("\tWordLengthFilter: Minimum length "
                             + "(default unbounded)",
                             "N", 1, "-N <num>"));
        aOpts.add(new Option("\tWordLengthFilter: Maximum length "
                             + "(default unbounded)",
                             "X", 1, "-X <num>"));
        return aOpts;
    }

    /**
     * Returns the current settings in option form.
     *
     * @return A collection of strings holding <code>-N</code> and/or
     *         <code>-X</code>, each followed by its value exactly as it was
     *         given to the constructor; a bound that was not given is
     *         omitted.
     */
    public Collection getOptions() {
        ArrayList aOpts = new ArrayList();
        if (m_strMin.length() > 0) {
            aOpts.add("-N");
            aOpts.add(m_strMin);
        }
        if (m_strMax.length() > 0) {
            aOpts.add("-X");
            aOpts.add(m_strMax);
        }
        return aOpts;
    }
}