/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.classifier.df.data; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.mahout.classifier.df.data.Dataset.Attribute; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.List; import java.util.Scanner; import java.util.Set; import java.util.regex.Pattern; /** * Converts the input data to a Vector Array using the information given by the Dataset.<br> * Generates for each line a Vector that contains :<br> * <ul> * <li>double parsed value for NUMERICAL attributes</li> * <li>int value for CATEGORICAL and LABEL attributes</li> * </ul> * <br> * adds an IGNORED first attribute that will contain a unique id for each instance, which is the line number * of the instance in the input data */ public final class DataLoader { private static final Logger log = LoggerFactory.getLogger(DataLoader.class); private static final Pattern COMMA_SPACE = Pattern.compile("[, ]"); private DataLoader() { } /** * Converts a comma-separated String to a Vector. * * @param attrs * attributes description * @param values * used to convert CATEGORICAL attribute values to Integer * @return false if there are missing values '?' or NUMERICAL attribute values is not numeric */ private static boolean parseString(Attribute[] attrs, Set<String>[] values, CharSequence string, boolean regression) { String[] tokens = COMMA_SPACE.split(string); Preconditions.checkArgument(tokens.length == attrs.length, "Wrong number of attributes in the string"); // extract tokens and check is there is any missing value for (int attr = 0; attr < attrs.length; attr++) { if (attrs[attr].isIgnored()) { continue; } if ("?".equals(tokens[attr])) { return false; // missing value } } for (int attr = 0; attr < attrs.length; attr++) { if (attrs[attr].isIgnored()) { continue; } String token = tokens[attr]; if (attrs[attr].isCategorical() || (!regression && attrs[attr].isLabel())) { // update values if (values[attr] == null) { values[attr] = Sets.newHashSet(); } values[attr].add(token); } else { try { Double.parseDouble(token); } catch (NumberFormatException e) { return false; } } } return true; } /** * Loads the data from a file * * @param fs * file system * @param fpath * data file path * @throws IOException * if any problem is encountered */ public static Data loadData(Dataset dataset, FileSystem fs, Path fpath) throws IOException { FSDataInputStream input = fs.open(fpath); Scanner scanner = new Scanner(input); List<Instance> instances = Lists.newArrayList(); DataConverter converter = new DataConverter(dataset); while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (line.isEmpty()) { log.warn("{}: empty string", instances.size()); continue; } Instance instance = converter.convert(line); if (instance == null) { // missing values found log.warn("{}: missing values", instances.size()); continue; } instances.add(instance); } scanner.close(); return new Data(dataset, instances); } /** * Loads the data from a String array */ public static Data loadData(Dataset dataset, String[] data) { List<Instance> instances = Lists.newArrayList(); DataConverter converter = new DataConverter(dataset); for (String line : data) { if (line.isEmpty()) { log.warn("{}: empty string", instances.size()); continue; } Instance instance = converter.convert(line); if (instance == null) { // missing values found log.warn("{}: missing values", instances.size()); continue; } instances.add(instance); } return new Data(dataset, instances); } /** * Generates the Dataset by parsing the entire data * * @param descriptor * attributes description * @param regression * if true, the label is numerical * @param fs * file system * @param path * data path */ public static Dataset generateDataset(CharSequence descriptor, boolean regression, FileSystem fs, Path path) throws DescriptorException, IOException { Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor); FSDataInputStream input = fs.open(path); Scanner scanner = new Scanner(input); // used to convert CATEGORICAL attribute to Integer @SuppressWarnings("unchecked") Set<String>[] valsets = new Set[attrs.length]; int size = 0; while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (line.isEmpty()) { continue; } if (parseString(attrs, valsets, line, regression)) { size++; } } scanner.close(); @SuppressWarnings("unchecked") List<String>[] values = new List[attrs.length]; for (int i = 0; i < valsets.length; i++) { if (valsets[i] != null) { values[i] = Lists.newArrayList(valsets[i]); } } return new Dataset(attrs, values, size, regression); } /** * Generates the Dataset by parsing the entire data * * @param descriptor * attributes description */ public static Dataset generateDataset(CharSequence descriptor, boolean regression, String[] data) throws DescriptorException { Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor); // used to convert CATEGORICAL attributes to Integer @SuppressWarnings("unchecked") Set<String>[] valsets = new Set[attrs.length]; int size = 0; for (String aData : data) { if (aData.isEmpty()) { continue; } if (parseString(attrs, valsets, aData, regression)) { size++; } } @SuppressWarnings("unchecked") List<String>[] values = new List[attrs.length]; for (int i = 0; i < valsets.length; i++) { if (valsets[i] != null) { values[i] = Lists.newArrayList(valsets[i]); } } return new Dataset(attrs, values, size, regression); } }