/* * Apache License * Version 2.0, January 2004 * http://www.apache.org/licenses/ * * Copyright 2013 Aurelian Tutuianu * Copyright 2014 Aurelian Tutuianu * Copyright 2015 Aurelian Tutuianu * Copyright 2016 Aurelian Tutuianu * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package rapaio.data; import java.util.*; import java.util.function.BiConsumer; import java.util.function.BinaryOperator; import java.util.function.Function; import java.util.function.Supplier; import java.util.stream.Collector; /** * Nominal var contains values for categorical observations where order of labels is not important. * <p> * The domain of the definition is called levels and is given at construction time or can be changed latter. * <p> * This type of variable accepts two value representation: as labels and as indexes. * <p> * Label representation is the natural representation since in experiments * the nominal vectors are given as string values. * <p> * The index representation is learn based on the term levels and is used often for performance * reasons instead of label representation, where the actual label value does not matter. * <p> * Even if index values is an integer number the order of the indexes for * nominal variables is irrelevant. * * @author Aurelian Tutuianu */ public final class Nominal extends FactorBase { /** * Builds a new empty nominal variable * * @return new variable instance of nominal type */ public static Nominal empty() { return new Nominal(); } /** * Builds a new nominal variable of given size, with given term levels, filled with missing values. * * @param rows variable size * @param dict term levels * @return new variable instance of nominal type */ public static Nominal empty(int rows, String... dict) { return Nominal.empty(rows, Arrays.asList(dict)); } /** * Builds a new nominal variable of given size, with given term levels, filled with missing values. * * @param rows variable size * @param dict term levels * @return new variable instance of nominal type */ public static Nominal empty(int rows, List<String> dict) { Nominal nominal = new Nominal(); HashSet<String> used = new HashSet<>(); used.add("?"); for (String next : dict) { if (used.contains(next)) continue; used.add(next); nominal.dict.add(next); nominal.reverse.put(next, nominal.reverse.size()); } nominal.data = new int[rows]; nominal.rows = rows; return nominal; } public static Nominal copy(String... values) { Nominal nominal = Nominal.empty(); for (String value : values) nominal.addLabel(value); return nominal; } public static Nominal copy(List<String> values) { Nominal nominal = Nominal.empty(); for (String value : values) nominal.addLabel(value); return nominal; } public static Nominal from(int rows, Function<Integer, String> func, String... dict) { Nominal nominal = Nominal.empty(rows, dict); for (int i = 0; i < rows; i++) { nominal.setLabel(i, func.apply(i)); } return nominal; } private static final long serialVersionUID = 1645571732133272467L; private Nominal() { // set the missing value this.reverse = new HashMap<>(); this.reverse.put("?", 0); this.dict = new ArrayList<>(); this.dict.add("?"); data = new int[0]; rows = 0; } public static Collector<String, Nominal, Nominal> collector() { return new Collector<String, Nominal, Nominal>() { @Override public Supplier<Nominal> supplier() { return Nominal::empty; } @Override public BiConsumer<Nominal, String> accumulator() { return FactorBase::addLabel; } @Override public BinaryOperator<Nominal> combiner() { return (left, right) -> (Nominal) left.bindRows(right); } @Override public Function<Nominal, Nominal> finisher() { return Nominal::solidCopy; } @Override public Set<Collector.Characteristics> characteristics() { return EnumSet.of(Collector.Characteristics.CONCURRENT, Collector.Characteristics.IDENTITY_FINISH); } }; } @Override public Nominal withName(String name) { return (Nominal) super.withName(name); } @Override public VarType type() { return VarType.NOMINAL; } @Override public void addRows(int rowCount) { grow(rows + rowCount); for (int i = 0; i < rowCount; i++) { data[rows + i] = 0; } rows += rowCount; } @Override public Var newInstance(int rows) { return Nominal.empty(rows, levels()); } @Override public Nominal solidCopy() { return (Nominal) super.solidCopy(); } @Override public String toString() { return "Nominal[name:" + name() + ", rowCount:" + rowCount() + "]"; } }