// DescriptorUtils.java example

// Explorer
// mahout-rbmClassifier-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.classifier.df.data;

import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import org.apache.mahout.classifier.df.data.Dataset.Attribute;

import java.util.List;
import java.util.Locale;

/**
 * Contains various methods that deal with descriptor strings, i.e. space-separated
 * lists of the attribute tokens {@code I}, {@code N}, {@code C} and {@code L}.
 */
public final class DescriptorUtils {

  /** Splits on single spaces and drops the empty strings produced by consecutive spaces. */
  private static final Splitter SPACE = Splitter.on(' ').omitEmptyStrings();

  private DescriptorUtils() { }
  
  /**
   * Parses a descriptor string and generates the corresponding array of Attributes.
   * Tokens are separated by spaces and matched case-insensitively: {@code I} maps to
   * {@link Attribute#IGNORED}, {@code N} to {@link Attribute#NUMERICAL}, {@code C} to
   * {@link Attribute#CATEGORICAL} and {@code L} to {@link Attribute#LABEL}.
   * 
   * @param descriptor
   *          space-separated list of attribute tokens
   * @return one Attribute per token, in the order the tokens appear
   * @throws DescriptorException
   *           if a bad token is encountered
   */
  public static Attribute[] parseDescriptor(CharSequence descriptor) throws DescriptorException {
    List<Attribute> attributes = Lists.newArrayList();
    for (String token : SPACE.split(descriptor)) {
      // upper-case with a fixed locale so that matching does not depend on the default locale
      token = token.toUpperCase(Locale.ENGLISH);
      if ("I".equals(token)) {
        attributes.add(Attribute.IGNORED);
      } else if ("N".equals(token)) {
        attributes.add(Attribute.NUMERICAL);
      } else if ("C".equals(token)) {
        attributes.add(Attribute.CATEGORICAL);
      } else if ("L".equals(token)) {
        attributes.add(Attribute.LABEL);
      } else {
        throw new DescriptorException("Bad Token : " + token);
      }
    }
    return attributes.toArray(new Attribute[attributes.size()]);
  }
  
  /**
   * Generates a valid descriptor string from a user-friendly representation.<br>
   * For example "3 N I N N 2 C L 5 I" generates "N N N I N N C C L I I I I I".<br>
   * This is useful when describing datasets with a large number of attributes.
   * 
   * @param description
   *          space-separated list of tokens, each optionally preceded by a multiplicator
   * @return the expanded descriptor string
   * @throws DescriptorException
   *           if a multiplicator is not strictly positive, is followed by another
   *           multiplicator, or is the last token of the description
   */
  public static String generateDescriptor(CharSequence description) throws DescriptorException {
    return generateDescriptor(SPACE.split(description));
  }
  
  /**
   * Generates a valid descriptor string from a list of tokens. An integer token is a
   * multiplicator: the next non-integer token is repeated that many times. A non-integer
   * token without a preceding multiplicator is emitted once. The non-integer tokens
   * themselves are copied as they are, without being validated.
   * 
   * @param tokens
   *          the tokens of the user-friendly description, in order
   * @return the expanded tokens joined by single spaces, with surrounding whitespace trimmed
   * @throws DescriptorException
   *           if a multiplicator is not strictly positive, is followed by another
   *           multiplicator, or is the last token and so has nothing to repeat
   */
  public static String generateDescriptor(Iterable<String> tokens) throws DescriptorException {
    StringBuilder descriptor = new StringBuilder();
    
    // repeat count announced by the previous token; 0 means no multiplicator is pending
    int multiplicator = 0;
    
    for (String token : tokens) {
      int number;
      try {
        // try to parse an integer
        number = Integer.parseInt(token);
      } catch (NumberFormatException e) {
        // token is not a number: emit it once, or as often as the pending multiplicator says
        int repetitions = multiplicator == 0 ? 1 : multiplicator;
        for (int index = 0; index < repetitions; index++) {
          descriptor.append(token).append(' ');
        }
        
        multiplicator = 0;
        continue;
      }
      
      if (number <= 0) {
        throw new DescriptorException("Multiplicator (" + number + ") must be > 0");
      }
      if (multiplicator > 0) {
        throw new DescriptorException("A multiplicator cannot be followed by another multiplicator");
      }
      
      multiplicator = number;
    }
    
    // a multiplicator still pending here has no token to apply to; reject it rather than
    // silently producing a descriptor with fewer attributes than the user described
    if (multiplicator > 0) {
      throw new DescriptorException("Multiplicator (" + multiplicator + ") must be followed by a token");
    }
    
    // drop the separator appended after the last token
    return descriptor.toString().trim();
  }
}