/*
 * Apache License
 * Version 2.0, January 2004
 * http://www.apache.org/licenses/
 *
 * Copyright 2013 Aurelian Tutuianu
 * Copyright 2014 Aurelian Tutuianu
 * Copyright 2015 Aurelian Tutuianu
 * Copyright 2016 Aurelian Tutuianu
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package rapaio.ml.classifier;

import rapaio.data.Frame;
import rapaio.data.Var;
import rapaio.data.VarType;
import rapaio.data.filter.FFilter;
import rapaio.data.sample.RowSampler;
import rapaio.ml.common.Capabilities;
import rapaio.printer.Printable;

import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;
import java.util.stream.Collectors;

/**
 * Interface for all classification model algorithms.
 * A classifier is able to classify multiple target columns, if the implementation allows that.
 *
 * @author <a href="mailto:padreati@yahoo.com">Aurelian Tutuianu</a>
 */
public interface Classifier extends Printable, Serializable {

    /**
     * Creates a new classifier instance with the same parameters as the original.
     * The fitted model and other learning-time artifacts are not replicated.
     *
     * @return new parametrized instance
     */
    Classifier newInstance();

    /**
     * Returns the classifier name.
     *
     * @return classifier name
     */
    String name();

    /**
     * Builds a string which contains the classifier instance name and parameters.
     *
     * @return classifier algorithm name and parameters
     */
    String fullName();

    /**
     * Describes the capabilities of the classification algorithm: which
     * target/input types and counts it supports. The default is an empty
     * {@link Capabilities} instance.
     *
     * @return capabilities of the classification algorithm
     */
    default Capabilities capabilities() {
        return new Capabilities();
    }

    /**
     * @return the sampler instance used at learning time
     */
    RowSampler sampler();

    /**
     * Specifies the sampler to be used at learning time.
     * The sampler is responsible for selecting the instances to be learned.
     * The default implementation is {@link RowSampler.Identity},
     * which gives all the original training instances.
     *
     * @param sampler instance of a new sampler
     * @return self instance, used for the builder pattern
     */
    Classifier withSampler(RowSampler sampler);

    /**
     * Filters which will be applied on input variables
     * for various transformations, before the data is learned.
     * <p>
     * Thus, input variables learned by a model are not derived
     * directly from the data frame used by removing target
     * variables, but by pre-processing them with filters.
     * <p>
     * Filters will always be applied in sequence.
     * The filtering process has the following steps:
     * <p>
     * <ol>
     * <li>consider the data frame as the draft data frame</li>
     * <li>take in order the filters from the input filter list</li>
     * <li>apply each filter to the draft data frame and designate the result as the draft data frame</li>
     * <li>after all filters are executed, designate the draft data frame as the workable data frame</li>
     * <li>parse all the target variable names from pattern strings and the workable data frame</li>
     * <li>collect all the variable names from the workable data frame</li>
     * <li>collect target variable names from the list of available variable names</li>
     * <li>collect input variables as all the variables which are not considered target variables</li>
     * </ol>
     * <p>
     * This algorithm is executed each time for the {@link #train(Frame, Var, String...)},
     * {@link #train(Frame, String...)}, {@link #fit(Frame)} and {@link #fit(Frame, boolean, boolean)} methods.
     *
     * @return list of filters used to transform data into input variables
     */
    List<FFilter> inputFilters();

    /**
     * Specifies which filters will be used to transform data
     * before learning and fitting. Convenience varargs overload of
     * {@link #withInputFilters(List)}.
     *
     * @param filters filters applied in chain, in the given order
     * @return self instance
     */
    default Classifier withInputFilters(FFilter... filters) {
        return withInputFilters(Arrays.stream(filters).collect(Collectors.toList()));
    }

    /**
     * Specifies which filters will be used to transform data
     * before learning and fitting.
     *
     * @param filters list of filters applied in chain
     * @return self instance
     */
    Classifier withInputFilters(List<FFilter> filters);

    /**
     * Returns input variable names built at learning time.
     *
     * @return input variable names
     */
    String[] inputNames();

    /**
     * Shortcut method which returns the input variable name at the
     * given position.
     *
     * @param pos given position
     * @return variable name
     */
    default String inputName(int pos) {
        return inputNames()[pos];
    }

    /**
     * Returns the types of input variables built at learning time.
     *
     * @return array of input variable types
     */
    VarType[] inputTypes();

    /**
     * Shortcut method which returns the type of the input variable at the given position.
     *
     * @param pos given position
     * @return variable type
     */
    default VarType inputType(int pos) {
        return inputTypes()[pos];
    }

    /**
     * Returns target variable names built at learning time.
     *
     * @return target variable names
     */
    String[] targetNames();

    /**
     * Returns the name of the first target variable built at learning time.
     *
     * @return first target variable name
     */
    default String firstTargetName() {
        return targetNames()[0];
    }

    /**
     * Returns the name of the target variable at the given position.
     *
     * @param pos position of the target variable name
     * @return name of the target variable
     */
    default String targetName(int pos) {
        return targetNames()[pos];
    }

    /**
     * Returns target variable types built at learning time.
     *
     * @return array of target types
     */
    VarType[] targetTypes();

    /**
     * Shortcut method which returns the target variable type
     * at the given position.
     *
     * @param pos given position
     * @return target variable type
     */
    default VarType targetType(int pos) {
        return targetTypes()[pos];
    }

    /**
     * Returns the levels used at learning time for the target variables.
     *
     * @return map with target variable names as keys and levels as values
     */
    Map<String, String[]> targetLevels();

    /**
     * Shortcut method which returns the levels of a single target variable.
     *
     * @param key target variable name
     * @return levels of the given target variable, or {@code null} if the name is unknown
     */
    default String[] targetLevels(String key) {
        return targetLevels().get(key);
    }

    /**
     * Returns the levels used at learning time for the first target variable.
     *
     * @return levels of the first target variable
     */
    default String[] firstTargetLevels() {
        return targetLevels().get(firstTargetName());
    }

    /**
     * Shortcut method which returns the level at the given position
     * for the first target variable.
     *
     * @param pos given position
     * @return level of the first target variable at the given position
     */
    default String firstTargetLevel(int pos) {
        return targetLevels().get(firstTargetName())[pos];
    }

    /**
     * @return true if the classifier has learned from a sample
     */
    boolean hasLearned();

    /**
     * Fits a classifier on instances specified by frame, with row weights
     * equal to 1 and targets specified by targetVars.
     *
     * @param df         data set instances
     * @param targetVars target variables
     * @return self instance, used for the builder pattern
     */
    Classifier train(Frame df, String... targetVars);

    /**
     * Fits a classifier on instances specified by frame, with row weights and targetVars.
     *
     * @param df         train frame
     * @param weights    instance weights
     * @param targetVars target variables
     * @return self instance, used for the builder pattern
     */
    Classifier train(Frame df, Var weights, String... targetVars);

    /**
     * Predicts classes for new data set instances, with
     * default options to compute both classes and densities for classes.
     *
     * @param df data set instances
     * @return fit results
     */
    CFit fit(Frame df);

    /**
     * Predicts classes for given instances, generating classes if specified and
     * distributions if specified.
     *
     * @param df                frame instances
     * @param withClasses       generate classes
     * @param withDistributions generate densities for classes
     * @return fit results
     */
    CFit fit(Frame df, boolean withClasses, boolean withDistributions);

    /**
     * Sets the pool size for fork join tasks:
     * <ul>
     * <li>poolSize == 0: executed in a single non fork join thread</li>
     * <li>poolSize &lt; 0: pool size for the fork join pool is the number of CPUs</li>
     * <li>poolSize &gt; 0: pool size for the fork join pool is this value</li>
     * </ul>
     *
     * @param poolSize specified pool size
     * @return self instance, used for the builder pattern
     */
    Classifier withRunPoolSize(int poolSize);

    /**
     * Gets the configured pool size. Negative values are automatically
     * considered as a pool of the number of available CPUs, zero means
     * no pooling and positive values mean pooling with the specified
     * value.
     *
     * @return pool size to be used
     */
    int runPoolSize();

    /**
     * @return the number of runs
     */
    int runs();

    /**
     * Specifies the runs / rounds of learning.
     * For various models composed of multiple sub-models
     * the number of runs often represents the number of sub-models.
     * <p>
     * For example, for CForest the number of runs is used to specify
     * the number of decision trees to be built.
     *
     * @param runs number of runs
     * @return self instance, used for the builder pattern
     **/
    Classifier withRuns(int runs);

    /**
     * Gets the lambda call hook which will be called after
     * each sub-component or iteration specified by {@link #withRuns(int)}
     * is trained.
     *
     * @return lambda running hook
     */
    BiConsumer<Classifier, Integer> runningHook();

    /**
     * Sets up a lambda call hook which will be called after
     * each sub-component or iteration specified by {@link #withRuns(int)}
     * is trained.
     *
     * @param runningHook bi consumer method to be called at each iteration; the first
     *                    parameter is the model built at the time and the second
     *                    parameter value is the run value
     * @return self instance of the model
     */
    Classifier withRunningHook(BiConsumer<Classifier, Integer> runningHook);
}