/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.utils.vectors.arff;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.regex.Pattern;

import com.google.common.collect.AbstractIterator;
import com.google.common.io.Closeables;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

/**
 * Reads lines from a {@link BufferedReader} and turns each non-blank, non-comment line into a
 * {@link Vector}, using an {@link ARFFModel} to convert the textual values to doubles.
 *
 * <p>Lines starting with {@link ARFFModel#ARFF_SPARSE} are parsed as sparse rows of
 * {@code index value} pairs; every other line is parsed as a dense, comma-separated row.
 * The reader is closed once it is exhausted or once reading from it fails.
 */
final class ARFFIterator extends AbstractIterator<Vector> {

  // Matches a comma only when it is followed by an even number of double quotes up to the end of
  // the line, so a comma inside a double-quoted string is not a split point.
  // Ex: "Arizona" , "0:08 PM, PDT" , 110 will be split considering "0:08 PM, PDT" as one string
  private static final Pattern COMMA_PATTERN = Pattern.compile(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");

  private final BufferedReader reader;
  private final ARFFModel model;

  /**
   * @param reader source of the data lines; closed by this iterator when it is exhausted or fails
   * @param model  model used to size the vectors and to convert each textual value
   */
  ARFFIterator(BufferedReader reader, ARFFModel model) {
    this.reader = reader;
    this.model = model;
  }

  /**
   * Produces the vector for the next data line, or signals end of data when the reader is exhausted.
   *
   * @return the parsed vector, or the {@code endOfData()} marker when no lines remain
   * @throws IllegalStateException if reading from the underlying reader fails
   */
  @Override
  protected Vector computeNext() {
    String line = readNextDataLine();
    if (line == null) {
      Closeables.closeQuietly(reader);
      return endOfData();
    }
    if (line.startsWith(ARFFModel.ARFF_SPARSE)) {
      return parseSparse(line);
    }
    return parseDense(line);
  }

  /** Returns the next trimmed line that is neither empty nor a comment, or null when none remain. */
  private String readNextDataLine() {
    try {
      String line;
      while ((line = reader.readLine()) != null) {
        line = line.trim();
        if (!line.isEmpty() && !line.startsWith(ARFFModel.ARFF_COMMENT)) {
          return line;
        }
      }
      return null;
    } catch (IOException ioe) {
      // Once computeNext() throws, AbstractIterator refuses further use, so the end-of-data
      // close would never run; release the reader here instead of leaking it.
      Closeables.closeQuietly(reader);
      throw new IllegalStateException(ioe);
    }
  }

  /** Parses a sparse row of comma-separated {@code index value} pairs into a sparse vector. */
  private Vector parseSparse(String line) {
    Vector result = new RandomAccessSparseVector(model.getLabelSize());
    // Drop the first and last characters of the row (presumably the enclosing braces of the
    // sparse notation -- confirm against ARFFModel.ARFF_SPARSE).
    String body = line.substring(1, line.length() - 1).trim();
    if (body.isEmpty()) {
      // A row with no entries (e.g. "{}") has nothing to set; previously this fell through to
      // substring(0, -1) and threw StringIndexOutOfBoundsException.
      return result;
    }
    for (String entry : COMMA_PATTERN.split(body)) {
      entry = entry.trim();
      // The first space separates the attribute index from its value.
      int separator = entry.indexOf(' ');
      int idx = Integer.parseInt(entry.substring(0, separator).trim());
      String data = entry.substring(separator).trim();
      result.setQuick(idx, model.getValue(data, idx));
    }
    return result;
  }

  /** Parses a dense row: the i-th comma-separated value becomes element i of the vector. */
  private Vector parseDense(String line) {
    Vector result = new DenseVector(model.getLabelSize());
    String[] splits = COMMA_PATTERN.split(line);
    for (int i = 0; i < splits.length; i++) {
      result.setQuick(i, model.getValue(splits[i], i));
    }
    return result;
  }
}