/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * SubsetByExpression.java * Copyright (C) 2008-2013 University of Waikato, Hamilton, New Zealand */ package weka.filters.unsupervised.instance; import java.util.Enumeration; import java.util.Vector; import weka.core.Capabilities; import weka.core.Capabilities.Capability; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.RevisionUtils; import weka.core.Utils; import weka.filters.SimpleBatchFilter; /** * <!-- globalinfo-start --> Filters instances according to a user-specified expression.<br/> * <br/> * Grammar:<br/> * <br/> * boolexpr_list ::= boolexpr_list boolexpr_part | boolexpr_part;<br/> * <br/> * boolexpr_part ::= boolexpr:e {: parser.setResult(e); :} ;<br/> * <br/> * boolexpr ::= BOOLEAN <br/> * | true<br/> * | false<br/> * | expr < expr<br/> * | expr <= expr<br/> * | expr > expr<br/> * | expr >= expr<br/> * | expr = expr<br/> * | ( boolexpr )<br/> * | not boolexpr<br/> * | boolexpr and boolexpr<br/> * | boolexpr or boolexpr<br/> * | ATTRIBUTE is STRING<br/> * ;<br/> * <br/> * expr ::= NUMBER<br/> * | ATTRIBUTE<br/> * | ( expr )<br/> * | opexpr<br/> * | funcexpr<br/> * ;<br/> * <br/> * opexpr ::= expr + expr<br/> * | expr - expr<br/> * | expr * expr<br/> * | expr / expr<br/> * ;<br/> * <br/> * funcexpr ::= abs ( expr )<br/> * | sqrt ( expr )<br/> * | log ( expr )<br/> * | exp ( expr )<br/> * | sin ( expr )<br/> * | cos ( expr )<br/> * | tan ( expr )<br/> * | rint ( expr )<br/> * | floor ( expr )<br/> * | pow ( expr for base , expr for exponent )<br/> * | ceil ( expr )<br/> * ;<br/> * <br/> * Notes:<br/> * - NUMBER<br/> * any integer or floating point number <br/> * (but not in scientific notation!)<br/> * - STRING<br/> * any string surrounded by single quotes; <br/> * the string may not contain a single quote though.<br/> * - ATTRIBUTE<br/> * the following placeholders are recognized for <br/> * attribute values:<br/> * - CLASS for the class value in case a class attribute is set.<br/> * - ATTxyz with xyz a number from 1 to # of attributes in the<br/> * dataset, representing the value of indexed attribute.<br/> * <br/> * Examples:<br/> * - extracting only mammals and birds from the 'zoo' UCI dataset:<br/> * (CLASS is 'mammal') or (CLASS is 'bird')<br/> * - extracting only animals with at least 2 legs from the 'zoo' UCI dataset:<br/> * (ATT14 >= 2)<br/> * - extracting only instances with non-missing 'wage-increase-second-year'<br/> * from the 'labor' UCI dataset:<br/> * not ismissing(ATT3)<br/> * <p/> * <!-- globalinfo-end --> * * <!-- options-start --> Valid options are: * <p/> * * <pre> * -E <expr> * The expression to use for filtering * (default: true). * </pre> * * <pre> * -F * Apply the filter to instances that arrive after the first * (training) batch. The default is to not apply the filter (i.e. * always return the instance) * </pre> * * <!-- options-end --> * * @author fracpete (fracpete at waikato dot ac dot nz) * @version $Revision: 9804 $ */ public class SubsetByExpression extends SimpleBatchFilter { /** for serialization. */ private static final long serialVersionUID = 5628686110979589602L; /** the expresion to use for filtering. */ protected String m_Expression = "true"; /** Whether to filter instances after the first batch has been processed */ protected boolean m_filterAfterFirstBatch = false; /** * Returns a string describing this filter. * * @return a description of the filter suitable for displaying in the explorer/experimenter gui */ @Override public String globalInfo() { return "Filters instances according to a user-specified expression.\n\n" + "Grammar:\n\n" + "boolexpr_list ::= boolexpr_list boolexpr_part | boolexpr_part;\n" + "\n" + "boolexpr_part ::= boolexpr:e {: parser.setResult(e); :} ;\n" + "\n" + "boolexpr ::= BOOLEAN \n" + " | true\n" + " | false\n" + " | expr < expr\n" + " | expr <= expr\n" + " | expr > expr\n" + " | expr >= expr\n" + " | expr = expr\n" + " | ( boolexpr )\n" + " | not boolexpr\n" + " | boolexpr and boolexpr\n" + " | boolexpr or boolexpr\n" + " | ATTRIBUTE is STRING\n" + " ;\n" + "\n" + "expr ::= NUMBER\n" + " | ATTRIBUTE\n" + " | ( expr )\n" + " | opexpr\n" + " | funcexpr\n" + " ;\n" + "\n" + "opexpr ::= expr + expr\n" + " | expr - expr\n" + " | expr * expr\n" + " | expr / expr\n" + " ;\n" + "\n" + "funcexpr ::= abs ( expr )\n" + " | sqrt ( expr )\n" + " | log ( expr )\n" + " | exp ( expr )\n" + " | sin ( expr )\n" + " | cos ( expr )\n" + " | tan ( expr )\n" + " | rint ( expr )\n" + " | floor ( expr )\n" + " | pow ( expr for base , expr for exponent )\n" + " | ceil ( expr )\n" + " ;\n" + "\n" + "Notes:\n" + "- NUMBER\n" + " any integer or floating point number \n" + " (but not in scientific notation!)\n" + "- STRING\n" + " any string surrounded by single quotes; \n" + " the string may not contain a single quote though.\n" + "- ATTRIBUTE\n" + " the following placeholders are recognized for \n" + " attribute values:\n" + " - CLASS for the class value in case a class attribute is set.\n" + " - ATTxyz with xyz a number from 1 to # of attributes in the\n" + " dataset, representing the value of indexed attribute.\n" + "\n" + "Examples:\n" + "- extracting only mammals and birds from the 'zoo' UCI dataset:\n" + " (CLASS is 'mammal') or (CLASS is 'bird')\n" + "- extracting only animals with at least 2 legs from the 'zoo' UCI dataset:\n" + " (ATT14 >= 2)\n" + "- extracting only instances with non-missing 'wage-increase-second-year'\n" + " from the 'labor' UCI dataset:\n" + " not ismissing(ATT3)\n"; } /** * SubsetByExpression may return false from input() (thus not making an instance available immediately) even after the first batch has been completed if the user has opted to apply the filter to instances after the first batch (rather than just passing them through). * * @return true this filter may remove (consume) input instances after the first batch has been completed. */ @Override public boolean mayRemoveInstanceAfterFirstBatchDone() { return true; } /** * Input an instance for filtering. Filter requires all training instances be read before producing output (calling the method batchFinished() makes the data available). If this instance is part of a new batch, m_NewBatch is set to false. * * @param instance * the input instance * @return true if the filtered instance may now be collected with output(). * @throws IllegalStateException * if no input structure has been defined * @throws Exception * if something goes wrong * @see #batchFinished() */ @Override public boolean input(Instance instance) throws Exception { if (getInputFormat() == null) throw new IllegalStateException("No input instance format defined"); if (m_NewBatch) { resetQueue(); m_NewBatch = false; } bufferInput(instance); int numReturnedFromParser = 0; if (isFirstBatchDone()) { Instances inst = new Instances(getInputFormat()); inst = process(inst); numReturnedFromParser = inst.numInstances(); for (int i = 0; i < inst.numInstances(); i++) push(inst.instance(i)); flushInput(); } return (numReturnedFromParser > 0); } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ @Override public Enumeration listOptions() { Vector result; result = new Vector(); result.addElement(new Option("\tThe expression to use for filtering\n" + "\t(default: true).", "E", 1, "-E <expr>")); result.addElement(new Option("\tApply the filter to instances that arrive after the first\n" + "\t(training) batch. The default is to not apply the filter (i.e.\n" + "\talways return the instance)", "F", 0, "-F")); return result.elements(); } /** * Parses a given list of options. * <p/> * * <!-- options-start --> Valid options are: * <p/> * * <pre> * -E <expr> * The expression to use for filtering * (default: true). * </pre> * * <pre> * -F * Apply the filter to instances that arrive after the first * (training) batch. The default is to not apply the filter (i.e. * always return the instance) * </pre> * * <!-- options-end --> * * @param options * the list of options as an array of strings * @throws Exception * if an option is not supported */ @Override public void setOptions(String[] options) throws Exception { String tmpStr; tmpStr = Utils.getOption('E', options); if (tmpStr.length() != 0) setExpression(tmpStr); else setExpression("true"); m_filterAfterFirstBatch = Utils.getFlag('F', options); if (getInputFormat() != null) setInputFormat(getInputFormat()); } /** * Gets the current settings of the filter. * * @return an array of strings suitable for passing to setOptions */ @Override public String[] getOptions() { Vector<String> result; result = new Vector(); result.add("-E"); result.add("" + getExpression()); if (m_filterAfterFirstBatch) { result.add("-F"); } return result.toArray(new String[result.size()]); } /** * Returns the Capabilities of this filter. * * @return the capabilities of this object * @see Capabilities */ @Override public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enable(Capability.STRING_ATTRIBUTES); result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); // class result.enable(Capability.STRING_CLASS); result.enable(Capability.NOMINAL_CLASS); result.enable(Capability.NUMERIC_CLASS); result.enable(Capability.DATE_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); result.enable(Capability.NO_CLASS); return result; } /** * Sets the expression used for filtering. * * @param value * the expression */ public void setExpression(String value) { m_Expression = value; } /** * Returns the expression used for filtering. * * @return the expression */ public String getExpression() { return m_Expression; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the explorer/experimenter gui */ public String expressionTipText() { return "The expression to used for filtering the dataset."; } /** * Set whether to apply the filter to instances that arrive once the first (training) batch has been seen. The default is to not apply the filter and just return each instance input. This is so that, when used in the FilteredClassifier, a test instance does not get "consumed" by the filter and a prediction is always generated. * * @param b * true if the filter should be applied to instances that arrive after the first (training) batch has been processed. */ public void setFilterAfterFirstBatch(boolean b) { m_filterAfterFirstBatch = b; } /** * Get whether to apply the filter to instances that arrive once the first (training) batch has been seen. The default is to not apply the filter and just return each instance input. This is so that, when used in the FilteredClassifier, a test instance does not get "consumed" by the filter and a prediction is always generated. * * @return true if the filter should be applied to instances that arrive after the first (training) batch has been processed. */ public boolean getFilterAfterFirstBatch() { return m_filterAfterFirstBatch; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the explorer/experimenter gui */ public String filterAfterFirstBatchTipText() { return "Whether to apply the filtering process to instances that " + "are input after the first (training) batch. The default " + "is false so that, when used in a FilteredClassifier, test" + " instances do not potentially get 'consumed' by the filter " + "an a prediction is always made."; } /** * Determines the output format based on the input format and returns this. * * @param inputFormat * the input format to base the output format on * @return the output format * @throws Exception * in case the determination goes wrong */ @Override protected Instances determineOutputFormat(Instances inputFormat) throws Exception { return new Instances(inputFormat, 0); } /** * Processes the given data (may change the provided dataset) and returns the modified version. This method is called in batchFinished(). * * @param instances * the data to process * @return the modified data * @throws Exception * in case the processing goes wrong * @see #batchFinished() */ @Override protected Instances process(Instances instances) throws Exception { return instances; /*if (!isFirstBatchDone() || m_filterAfterFirstBatch) { return Parser.filter(m_Expression, instances); } else { return instances; }*/ } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision: 9804 $"); } /** * Main method for running this filter. * * @param args * arguments for the filter: use -h for help */ public static void main(String[] args) { runFilter(new SubsetByExpression(), args); } }