/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.utils.vectors.arff; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.nio.charset.Charset; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Iterator; import java.util.Locale; import com.google.common.io.Files; import org.apache.commons.io.Charsets; import org.apache.mahout.math.Vector; /** * Read in ARFF (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) and create {@link Vector}s * <p/> * Attribute type handling: * <ul> * <li>Numeric -> As is</li> * <li>Nominal -> ordinal(value) i.e. @attribute lumber {'\'(-inf-0.5]\'','\'(0.5-inf)\''} * will convert -inf-0.5 -> 0, and 0.5-inf -> 1</li> * <li>Dates -> Convert to time as a long</li> * <li>Strings -> Create a map of String -> long</li> * </ul> * NOTE: This class does not set the label bindings on every vector. If you want the label * bindings, call {@link MapBackedARFFModel#getLabelBindings()}, as they are the same for every vector. */ public class ARFFVectorIterable implements Iterable<Vector> { private final BufferedReader buff; private final ARFFModel model; public ARFFVectorIterable(File file, ARFFModel model) throws IOException { this(file, Charsets.UTF_8, model); } public ARFFVectorIterable(File file, Charset encoding, ARFFModel model) throws IOException { this(Files.newReader(file, encoding), model); } public ARFFVectorIterable(String arff, ARFFModel model) throws IOException { this(new StringReader(arff), model); } public ARFFVectorIterable(Reader reader, ARFFModel model) throws IOException { if (reader instanceof BufferedReader) { buff = (BufferedReader) reader; } else { buff = new BufferedReader(reader); } //grab the attributes, then start the iterator at the first line of data this.model = model; int labelNumber = 0; String line; while ((line = buff.readLine()) != null) { line = line.trim(); if (!line.startsWith(ARFFModel.ARFF_COMMENT) && !line.isEmpty()) { Integer labelNumInt = labelNumber; String[] lineParts = line.split("[\\s\\t]+", 2); // is it a relation name? if (lineParts[0].equalsIgnoreCase(ARFFModel.RELATION)) { model.setRelation(ARFFType.removeQuotes(lineParts[1])); } // or an attribute else if (lineParts[0].equalsIgnoreCase(ARFFModel.ATTRIBUTE)) { String label; ARFFType type; // split the name of the attribute and its description String[] attrParts = lineParts[1].split("[\\s\\t]+", 2); if (attrParts.length < 2) throw new UnsupportedOperationException("No type for attribute found: " + lineParts[1]); // label is attribute name label = ARFFType.removeQuotes(attrParts[0].toLowerCase()); if (attrParts[1].equalsIgnoreCase(ARFFType.NUMERIC.getIndicator())) { type = ARFFType.NUMERIC; } else if (attrParts[1].equalsIgnoreCase(ARFFType.INTEGER.getIndicator())) { type = ARFFType.INTEGER; } else if (attrParts[1].equalsIgnoreCase(ARFFType.REAL.getIndicator())) { type = ARFFType.REAL; } else if (attrParts[1].equalsIgnoreCase(ARFFType.STRING.getIndicator())) { type = ARFFType.STRING; } else if (attrParts[1].toLowerCase().startsWith(ARFFType.NOMINAL.getIndicator())) { type = ARFFType.NOMINAL; // nominal example: // @ATTRIBUTE class {Iris-setosa,'Iris versicolor',Iris-virginica} String[] classes = ARFFIterator.splitCSV(attrParts[1].substring(1, attrParts[1].length() - 1)); for (int i = 0; i < classes.length; i++) { model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1); } } else if (attrParts[1].toLowerCase().startsWith(ARFFType.DATE.getIndicator())) { type = ARFFType.DATE; //TODO: DateFormatter map DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH); String formStr = attrParts[1].substring(ARFFType.DATE.getIndicator().length()).trim(); if (!formStr.isEmpty()) { if (formStr.startsWith("\"")) { formStr = formStr.substring(1, formStr.length() - 1); } format = new SimpleDateFormat(formStr, Locale.ENGLISH); } model.addDateFormat(labelNumInt, format); //@attribute <name> date [<date-format>] } else { throw new UnsupportedOperationException("Invalid attribute: " + attrParts[1]); } model.addLabel(label, labelNumInt); model.addType(labelNumInt, type); labelNumber++; } else if (lineParts[0].equalsIgnoreCase(ARFFModel.DATA)) { break; //skip it } } } } @Override public Iterator<Vector> iterator() { return new ARFFIterator(buff, model); } /** * Returns info about the ARFF content that was parsed. * * @return the model */ public ARFFModel getModel() { return model; } }