/**
* OpenKM, Open Document Management System (http://www.openkm.com)
* Copyright (c) 2006-2011 Paco Avila & Josep Llort
*
* No bytes were intentionally harmed during the development of this application.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package com.openkm.kea.filter;
import java.util.StringTokenizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.filters.Filter;
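/**
* Replaces every number in the string attributes of the given dataset
* with a phrase boundary (a newline surrounded by spaces). Words are
* assumed to be separated by whitespace, and any word that does not
* contain at least one letter is treated as a number.
*
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
* @version 1.0
*/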
public class NumbersFilter extends Filter {
    private static final Logger log = LoggerFactory.getLogger(NumbersFilter.class);
    private static final long serialVersionUID = 1L;

    /** Characters that separate tokens; each occurrence is also handed back as a token of its own. */
    private static final String DELIMITERS = " \t\n";

    /** Text written in place of every token that is classified as a number. */
    private static final String PHRASE_BOUNDARY = " \n ";

    /**
     * Returns a string describing this filter.
     *
     * @return a description of the filter suitable for
     *         displaying in the explorer/experimenter gui
     */
    public String globalInfo() {
        return "Removes all numbers from all the string attributes in " +
               "the given dataset. Assumes that words are separated by whitespace.";
    }

    /**
     * Signals that the current batch of input is finished. The next call to
     * {@link #input(Instance)} will start a new batch and reset the output queue.
     *
     * @return true if there are instances pending output
     * @exception NullPointerException if no input structure has been defined
     * @exception Exception if there was a problem finishing the batch
     */
    public boolean batchFinished() throws Exception {
        if (getInputFormat() == null) {
            throw new NullPointerException("No input instance format defined");
        }
        m_NewBatch = true;
        return numPendingOutput() != 0;
    }

    /**
     * Sets the format of the input instances. The output format is set to the
     * very same structure, since this filter only rewrites string values.
     *
     * @param instanceInfo an Instances object containing the input
     *        instance structure
     * @return true, because the output format may be collected immediately
     * @exception Exception if the superclass rejects the input format
     */
    public boolean setInputFormat(Instances instanceInfo) throws Exception {
        super.setInputFormat(instanceInfo);
        setOutputFormat(instanceInfo);
        return true;
    }

    /**
     * Returns the Capabilities of this filter: the inherited capabilities plus
     * all attribute types, missing values, a nominal class and no class.
     *
     * @return the capabilities of this object
     * @see Capabilities
     */
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();

        // attributes
        result.enableAllAttributes();
        result.enable(Capability.MISSING_VALUES);

        // class
        result.enable(Capability.NOMINAL_CLASS);
        result.enable(Capability.NO_CLASS);

        return result;
    }

    /**
     * Inputs an instance for filtering. The instance is converted and pushed
     * onto the output queue straight away.
     *
     * @param instance the input instance
     * @return true, because the filtered instance may now be collected
     * @exception Exception if no input instance format has been defined
     *            or if there was a problem with the filtering
     */
    public boolean input(Instance instance) throws Exception {
        if (getInputFormat() == null) {
            throw new Exception("No input instance format defined");
        }
        if (m_NewBatch) {
            // First instance of a new batch: discard whatever is still queued.
            resetQueue();
            m_NewBatch = false;
        }
        convertInstance(instance);
        return true;
    }

    /**
     * Main method for testing this class.
     *
     * @param argv should contain arguments to the filter: use -h for help
     */
    public static void main(String[] argv) {
        try {
            if (Utils.getFlag('b', argv)) {
                Filter.batchFilterFile(new NumbersFilter(), argv);
            } else {
                Filter.filterFile(new NumbersFilter(), argv);
            }
        } catch (Exception ex) {
            // Log the throwable itself: the message alone may be null and
            // would drop the stack trace needed to diagnose the failure.
            log.error(ex.getMessage(), ex);
        }
    }

    /**
     * Converts an instance and pushes the result onto the output queue.
     * Values of non-string or missing attributes are copied unchanged; every
     * other string value has a phrase boundary inserted where a number is found.
     *
     * @param instance the instance to convert
     * @exception Exception if there was a problem with the conversion
     */
    private void convertInstance(Instance instance) throws Exception {
        // Fetched once, so the new string values are registered on the very
        // same header object that the converted instance is attached to.
        Instances outputFormat = getOutputFormat();
        int numAttributes = instance.numAttributes();
        double[] instVals = new double[numAttributes];

        for (int i = 0; i < numAttributes; i++) {
            if (!instance.attribute(i).isString() || instance.isMissing(i)) {
                instVals[i] = instance.value(i);
            } else {
                String filtered = replaceNumbers(instance.stringValue(i));
                // A string attribute's value is the index of the string in the attribute.
                instVals[i] = outputFormat.attribute(i).addStringValue(filtered);
            }
        }

        Instance inst = new Instance(instance.weight(), instVals);
        inst.setDataset(outputFormat);
        push(inst);
    }

    /**
     * Rewrites a text so that every token without a letter becomes a phrase
     * boundary. Whitespace delimiters are copied through unchanged.
     *
     * @param text the original string value
     * @return the text with all number tokens replaced
     */
    private static String replaceNumbers(String text) {
        StringBuilder result = new StringBuilder(text.length());
        // returnDelims = true: the whitespace between words must be preserved.
        StringTokenizer tokens = new StringTokenizer(text, DELIMITERS, true);
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            if (containsLetter(token) || isDelimiter(token)) {
                result.append(token);
            } else {
                result.append(PHRASE_BOUNDARY);
            }
        }
        return result.toString();
    }

    /**
     * Tells whether a token contains at least one letter. Everything that
     * does not is considered to be a number.
     *
     * @param token the token to inspect
     * @return true if any code point of the token is a letter
     */
    private static boolean containsLetter(String token) {
        // Walk by code point rather than by char, so that letters encoded as
        // surrogate pairs are recognised as letters too.
        for (int i = 0; i < token.length(); ) {
            int codePoint = token.codePointAt(i);
            if (Character.isLetter(codePoint)) {
                return true;
            }
            i += Character.charCount(codePoint);
        }
        return false;
    }

    /**
     * Tells whether a token is one of the single-character delimiters.
     *
     * @param token the token to inspect
     * @return true if the token is exactly one delimiter character
     */
    private static boolean isDelimiter(String token) {
        return token.length() == 1 && DELIMITERS.indexOf(token.charAt(0)) >= 0;
    }
}