/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.utils.vectors.arff; import com.google.common.base.Charsets; import com.google.common.io.Files; import org.apache.mahout.math.Vector; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.nio.charset.Charset; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Iterator; import java.util.Locale; import java.util.regex.Pattern; /** * Read in ARFF (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) and create {@link Vector}s * <p/> * Attribute type handling: * <ul> * <li>Numeric -> As is</li> * <li>Nominal -> ordinal(value) i.e. @attribute lumber {'\'(-inf-0.5]\'','\'(0.5-inf)\''} * will convert -inf-0.5 -> 0, and 0.5-inf -> 1</li> * <li>Dates -> Convert to time as a long</li> * <li>Strings -> Create a map of String -> long</li> * </ul> * NOTE: This class does not set the label bindings on every vector. If you want the label * bindings, call {@link MapBackedARFFModel#getLabelBindings()}, as they are the same for every vector. */ public class ARFFVectorIterable implements Iterable<Vector> { private static final Pattern COMMA_PATTERN = Pattern.compile(","); private static final Pattern SPACE_PATTERN = Pattern.compile(" "); private final BufferedReader buff; private final ARFFModel model; public ARFFVectorIterable(File file, ARFFModel model) throws IOException { this(file, Charsets.UTF_8, model); } public ARFFVectorIterable(File file, Charset encoding, ARFFModel model) throws IOException { this(Files.newReader(file, encoding), model); } public ARFFVectorIterable(String arff, ARFFModel model) throws IOException { this(new StringReader(arff), model); } public ARFFVectorIterable(Reader reader, ARFFModel model) throws IOException { if (reader instanceof BufferedReader) { buff = (BufferedReader) reader; } else { buff = new BufferedReader(reader); } //grab the attributes, then start the iterator at the first line of data this.model = model; int labelNumber = 0; String line; while ((line = buff.readLine()) != null) { line = line.trim(); String lower = line.toLowerCase(Locale.ENGLISH); Integer labelNumInt = labelNumber; if (lower.startsWith(ARFFModel.ARFF_COMMENT)) { continue; } else if (lower.startsWith(ARFFModel.RELATION)) { model.setRelation(line.substring(ARFFModel.RELATION.length()).trim()); } else if (lower.startsWith(ARFFModel.ATTRIBUTE)) { String label; ARFFType type; if (lower.contains(ARFFType.NUMERIC.getIndicator())) { label = ARFFType.NUMERIC.getLabel(lower); type = ARFFType.NUMERIC; } else if (lower.contains(ARFFType.STRING.getIndicator())) { label = ARFFType.STRING.getLabel(lower); type = ARFFType.STRING; } else if (lower.contains(ARFFType.NOMINAL.getIndicator())) { label = ARFFType.NOMINAL.getLabel(lower); type = ARFFType.NOMINAL; //@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica} int classIdx = lower.indexOf(ARFFType.NOMINAL.getIndicator()); String[] classes = COMMA_PATTERN.split(line.substring(classIdx + 1, line.length() - 1)); for (int i = 0; i < classes.length; i++) { model.addNominal(label, classes[i].trim(), i + 1); } } else if (lower.contains(ARFFType.DATE.getIndicator())) { label = ARFFType.DATE.getLabel(lower); type = ARFFType.DATE; //TODO: DateFormatter map DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH); int idx = lower.lastIndexOf(ARFFType.DATE.getIndicator()); String[] split = SPACE_PATTERN.split(line); if (split.length >= 4) { //we have a date format String formStr = line.substring(idx + ARFFType.DATE.getIndicator().length()).trim(); if (formStr.startsWith("\"")) { formStr = formStr.substring(1, formStr.length() - 1); } format = new SimpleDateFormat(formStr, Locale.ENGLISH); } model.addDateFormat(labelNumInt, format); //@attribute <name> date [<date-format>] } else { throw new UnsupportedOperationException("Invalid attribute: " + line); } model.addLabel(label, labelNumInt); model.addType(labelNumInt, type); labelNumber++; } else if (lower.startsWith(ARFFModel.DATA)) { //inData = true; break; //skip it } } } @Override public Iterator<Vector> iterator() { return new ARFFIterator(buff, model); } /** * Returns info about the ARFF content that was parsed. * * @return the model */ public ARFFModel getModel() { return model; } }