/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.utils.vectors.arff;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.collect.AbstractIterator;
import com.google.common.io.Closeables;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
/**
 * Iterates over the instance lines supplied by a {@link BufferedReader} and converts each one into a
 * {@link Vector}.
 *
 * <p>Blank lines and lines starting with {@link ARFFModel#ARFF_COMMENT} are skipped. A line that is wrapped
 * in the sparse delimiters ({@link ARFFModel#ARFF_SPARSE} ... {@link ARFFModel#ARFF_SPARSE_END}) becomes a
 * {@link RandomAccessSparseVector}; every other line becomes a {@link DenseVector}. The reader is closed
 * by this iterator once the input is exhausted or a read fails.
 */
final class ARFFIterator extends AbstractIterator<Vector> {

  // Matches any token that does not contain a '{' character (the character class is the union of \w and
  // "anything but {"). Dense tokens that fail this check are not stored in the vector.
  private static final Pattern WORDS_WITHOUT_SPARSE = Pattern.compile("([\\w[^{]])*");

  // Matches a complete line wrapped in the sparse start/end markers; group 1 is the text between them.
  private static final Pattern DATA_PATTERN = Pattern.compile("^\\"+ARFFModel.ARFF_SPARSE+"(.*)\\"+ARFFModel.ARFF_SPARSE_END+"$");

  private final BufferedReader reader;
  private final ARFFModel model;

  /**
   * @param reader source of instance lines; closed by this iterator when reading ends or fails
   * @param model  supplies the vector size ({@code getLabelSize()}) and converts raw tokens to values
   *               ({@code getValue(String, int)})
   */
  ARFFIterator(BufferedReader reader, ARFFModel model) {
    this.reader = reader;
    this.model = model;
  }

  /**
   * Reads the next non-blank, non-comment line and parses it into a vector.
   *
   * @return the parsed vector, or the end-of-data marker when the reader is exhausted
   * @throws IllegalStateException    if reading from or closing the reader fails
   * @throws IllegalArgumentException if a sparse entry is not of the form {@code <index> <value>}
   */
  @Override
  protected Vector computeNext() {
    String line;
    try {
      line = nextDataLine();
    } catch (IOException ioe) {
      // The iterator cannot make progress after a read failure, so release the reader before failing.
      closeReader();
      throw new IllegalStateException(ioe);
    }
    if (line == null) {
      closeReader();
      return endOfData();
    }
    Matcher contents = DATA_PATTERN.matcher(line);
    if (contents.find()) {
      return parseSparse(contents.group(1));
    }
    return parseDense(line);
  }

  /** Returns the next trimmed line that is neither empty nor a comment, or {@code null} at end of input. */
  private String nextDataLine() throws IOException {
    String line;
    while ((line = reader.readLine()) != null) {
      line = line.trim();
      if (!line.isEmpty() && !line.startsWith(ARFFModel.ARFF_COMMENT)) {
        return line;
      }
    }
    return null;
  }

  /** Closes the underlying reader, converting a close failure into an {@link IllegalStateException}. */
  private void closeReader() {
    try {
      Closeables.close(reader, true);
    } catch (IOException e) {
      throw new IllegalStateException(e);
    }
  }

  /** Parses the text between the sparse delimiters, a comma separated list of {@code <index> <value>} entries. */
  private Vector parseSparse(String body) {
    Vector result = new RandomAccessSparseVector(model.getLabelSize());
    for (String entry : splitCSV(body)) {
      if (entry.isEmpty()) {
        // e.g. "{ }" or a stray comma: there is nothing to store for this entry
        continue;
      }
      int separator = entry.indexOf(' ');
      if (separator < 0) {
        throw new IllegalArgumentException(
            "Malformed sparse ARFF entry '" + entry + "': expected '<index> <value>'");
      }
      int idx = Integer.parseInt(entry.substring(0, separator).trim());
      String data = entry.substring(separator).trim();
      // "?" entries are left unset
      if (!"?".equals(data)) {
        result.setQuick(idx, model.getValue(data, idx));
      }
    }
    return result;
  }

  /** Parses a comma separated line where the position of each token is its index in the vector. */
  private Vector parseDense(String line) {
    Vector result = new DenseVector(model.getLabelSize());
    String[] splits = splitCSV(line);
    for (int i = 0; i < splits.length; i++) {
      String split = splits[i].trim();
      // tokens containing '{' and "?" tokens are left unset
      if (WORDS_WITHOUT_SPARSE.matcher(split).matches() && !"?".equals(split)) {
        result.setQuick(i, model.getValue(split, i));
      }
    }
    return result;
  }

  /**
   * Splits a line at commas, ignoring commas that appear inside single or double quotes.
   *
   * <p>A backslash escapes the character that follows it: the backslash is dropped and the next character
   * is copied verbatim, so an escaped quote or comma never opens/closes a quoted section or ends a token.
   * A backslash that is the last character of the line is kept as a literal backslash. Quote characters
   * themselves are kept in the tokens. Every token is trimmed. An empty token after the last comma is
   * not returned, while empty tokens between commas are.
   *
   * @param line the text to split
   * @return the trimmed tokens in order of appearance; empty if {@code line} is empty
   */
  public static String[] splitCSV(String line) {
    StringBuilder sb = new StringBuilder(128);
    List<String> tokens = new ArrayList<>();
    // the quote character that opened the current quoted section, or '\0' when outside quotes
    char escapeChar = '\0';
    for (int i = 0; i < line.length(); i++) {
      char c = line.charAt(i);
      if (c == '\\') {
        if (i + 1 < line.length()) {
          i++;
          sb.append(line.charAt(i));
        } else {
          // dangling backslash at end of line: nothing to escape, keep it literally
          sb.append(c);
        }
      } else if (c == '"' || c == '\'') {
        if (c == escapeChar) {
          // matching quote closes the quoted section
          escapeChar = '\0';
        } else if (escapeChar == '\0') {
          // first quote opens a quoted section; the other quote type inside it is plain text
          escapeChar = c;
        }
        sb.append(c);
      } else if (c == ',' && escapeChar == '\0') {
        tokens.add(sb.toString().trim());
        sb.setLength(0); // start work on next token
      } else {
        sb.append(c);
      }
    }
    if (sb.length() > 0) {
      tokens.add(sb.toString().trim());
    }
    return tokens.toArray(new String[tokens.size()]);
  }
}