/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.classifier.df.data;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import org.apache.mahout.classifier.df.data.Dataset.Attribute;
import java.util.List;
import java.util.Locale;
/**
 * Static helpers for working with dataset descriptor strings.
 *
 * <p>A descriptor is a space-separated list of one-letter tokens, one per attribute:
 * {@code I} (ignored), {@code N} (numerical), {@code C} (categorical) and {@code L} (label).
 */
public final class DescriptorUtils {

  /** Splits on single space characters and drops the empty pieces produced by repeated spaces. */
  private static final Splitter SPACE = Splitter.on(' ').omitEmptyStrings();

  /** Utility class; not meant to be instantiated. */
  private DescriptorUtils() { }

  /**
   * Parses a descriptor string and generates the corresponding array of Attributes.
   * Tokens are matched case-insensitively (each token is upper-cased before comparison).
   *
   * @param descriptor
   *          space-separated attribute tokens, each one of {@code I}, {@code N}, {@code C} or {@code L}
   * @return one {@link Attribute} per token, in the order the tokens appear
   * @throws DescriptorException
   *           if a bad token is encountered
   */
  public static Attribute[] parseDescriptor(CharSequence descriptor) throws DescriptorException {
    List<Attribute> attributes = Lists.newArrayList();
    for (String token : SPACE.split(descriptor)) {
      // normalise case so that "n" and "N" are treated alike
      token = token.toUpperCase(Locale.ENGLISH);
      if ("I".equals(token)) {
        attributes.add(Attribute.IGNORED);
      } else if ("N".equals(token)) {
        attributes.add(Attribute.NUMERICAL);
      } else if ("C".equals(token)) {
        attributes.add(Attribute.CATEGORICAL);
      } else if ("L".equals(token)) {
        attributes.add(Attribute.LABEL);
      } else {
        throw new DescriptorException("Bad Token : " + token);
      }
    }
    return attributes.toArray(new Attribute[attributes.size()]);
  }

  /**
   * Generates a valid descriptor string from a user-friendly representation.<br>
   * For example "3 N I N N 2 C L 5 I" generates "N N N I N N C C L I I I I I".<br>
   * This is useful when describing datasets with a large number of attributes.
   *
   * @param description
   *          space-separated tokens, where an integer multiplicator repeats the token that follows it
   * @return the expanded descriptor, tokens separated by single spaces
   * @throws DescriptorException
   *           if a multiplicator is not strictly positive, is followed by another multiplicator,
   *           or is not followed by any token
   */
  public static String generateDescriptor(CharSequence description) throws DescriptorException {
    return generateDescriptor(SPACE.split(description));
  }

  /**
   * Generates a valid descriptor string from a list of tokens.
   *
   * <p>A token that parses as an {@code int} is a multiplicator: the next non-numeric token is
   * emitted that many times. A non-numeric token without a preceding multiplicator is emitted
   * once. Non-numeric tokens are copied verbatim; they are not validated here.
   *
   * @param tokens
   *          the tokens of the user-friendly description, in order
   * @return the expanded descriptor, tokens separated by single spaces
   * @throws DescriptorException
   *           if a multiplicator is not strictly positive, is followed by another multiplicator,
   *           or is not followed by any token
   */
  public static String generateDescriptor(Iterable<String> tokens) throws DescriptorException {
    StringBuilder descriptor = new StringBuilder();

    // 0 means "no multiplicator pending"; valid multiplicators are always > 0
    int multiplicator = 0;

    for (String token : tokens) {
      try {
        // try to parse an integer
        int number = Integer.parseInt(token);

        if (number <= 0) {
          throw new DescriptorException("Multiplicator (" + number + ") must be > 0");
        }
        if (multiplicator > 0) {
          throw new DescriptorException("A multiplicator cannot be followed by another multiplicator");
        }

        multiplicator = number;
      } catch (NumberFormatException e) {
        // token is not a number (or does not fit in an int): emit it,
        // once by default or as many times as the pending multiplicator says
        if (multiplicator == 0) {
          multiplicator = 1;
        }

        for (int index = 0; index < multiplicator; index++) {
          descriptor.append(token).append(' ');
        }

        multiplicator = 0;
      }
    }

    // a multiplicator left pending has nothing to repeat; reject it instead of silently dropping it
    if (multiplicator > 0) {
      throw new DescriptorException("A multiplicator (" + multiplicator + ") must be followed by a token");
    }

    // drop the separator appended after the last token
    return descriptor.toString().trim();
  }
}