/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.utils.vectors.arff;
import java.io.IOException;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import com.google.common.io.Resources;
import org.apache.commons.io.Charsets;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.junit.Test;
/**
 * Unit tests for {@link ARFFVectorIterable}.
 *
 * <p>Each test hands ARFF content (a classpath resource or an inline string) to the iterable
 * and then checks the vectors it yields together with the metadata exposed by the backing
 * {@link ARFFModel}.
 */
public final class ARFFVectorIterableTest extends MahoutTestCase {

  /** Failure message for vectors that were expected to be {@link DenseVector}s. */
  private static final String NOT_DENSE = "Vector is not dense";

  /** Failure message for vectors that were expected to be {@link RandomAccessSparseVector}s. */
  private static final String NOT_SPARSE = "Vector is not sparse";

  /**
   * Reads {@code sample.arff} and checks the relation name, the number of label bindings and
   * the type and first two values of each of its three vectors.
   */
  @Test
  public void testValues() throws Exception {
    ARFFVectorIterable iterable = readModelFromResource("sample.arff");

    assertEquals("Mahout", iterable.getModel().getRelation());

    Map<String, Integer> bindings = iterable.getModel().getLabelBindings();
    assertNotNull(bindings);
    assertEquals(5, bindings.size());

    Iterator<Vector> iter = iterable.iterator();
    assertNextVector(iter, DenseVector.class, 1.0, 2.0);
    assertNextVector(iter, DenseVector.class, 2.0, 3.0);
    assertNextVector(iter, RandomAccessSparseVector.class, 5.0, 23.0);
    assertFalse(iter.hasNext());
  }

  /**
   * Reads {@code sample-dense.arff}, spot-checks values of the first vector and verifies that
   * the resource yields exactly five vectors, all of them dense.
   */
  @Test
  public void testDense() throws Exception {
    Iterable<Vector> iterable = readModelFromResource("sample-dense.arff");

    Vector firstVector = iterable.iterator().next();
    assertEquals(1.0, firstVector.get(0), 0);
    assertEquals(65.0, firstVector.get(1), 0);
    assertEquals(1.0, firstVector.get(3), 0);
    assertEquals(1.0, firstVector.get(4), 0);

    assertEquals(5, countVectorsOfType(iterable, DenseVector.class, NOT_DENSE));
  }

  /**
   * Reads {@code sample-sparse.arff}, spot-checks values of the first vector and verifies that
   * the resource yields exactly nine vectors, all of them sparse.
   */
  @Test
  public void testSparse() throws Exception {
    Iterable<Vector> iterable = readModelFromResource("sample-sparse.arff");

    Vector firstVector = iterable.iterator().next();
    assertEquals(23.1, firstVector.get(1), 0);
    assertEquals(3.23, firstVector.get(2), 0);
    assertEquals(1.2, firstVector.get(3), 0);

    assertEquals(9, countVectorsOfType(iterable, RandomAccessSparseVector.class, NOT_SPARSE));
  }

  /**
   * Reads {@code non-numeric-1.arff} twice into the same model and checks the vector count,
   * one value of the first vector, and the nominal, type, word and date-format maps.
   */
  @Test
  public void testNonNumeric() throws Exception {
    MapBackedARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
    int count = countVectorsOfType(iterable, RandomAccessSparseVector.class, NOT_SPARSE);

    // Second pass over the same resource, reusing the model the first pass already filled.
    iterable = getVectors("non-numeric-1.arff", model);
    Vector firstVector = iterable.iterator().next();
    assertEquals(1.0, firstVector.get(2), 0);
    assertEquals(10, count);

    assertNonNumeric1Model(iterable, model);
  }

  /**
   * Reads {@code date.arff} and checks that columns 1 through 6 of the first vector hold the
   * epoch-millisecond value of the corresponding date, each written in a different pattern.
   */
  @Test
  public void testDate() throws Exception {
    ARFFVectorIterable iterable = readModelFromResource("date.arff");
    Vector firstVector = iterable.iterator().next();

    assertEquals(toMillis("yyyy-MM-dd'T'HH:mm:ss", "2001-07-04T12:08:56"),
        firstVector.get(1), 0);
    assertEquals(toMillis("yyyy.MM.dd G 'at' HH:mm:ss z", "2001.07.04 AD at 12:08:56 PDT"),
        firstVector.get(2), 0);
    // The input is longer than the pattern; DateFormat.parse(String) only needs a matching
    // prefix, so the trailing text is ignored.
    assertEquals(toMillis("EEE, MMM d, ''yy", "Wed, Jul 4, '01,4 0:08 PM, PDT"),
        firstVector.get(3), 0);
    assertEquals(toMillis("K:mm a, z", "0:08 PM, PDT"),
        firstVector.get(4), 0);
    assertEquals(toMillis("yyyyy.MMMMM.dd GGG hh:mm aaa", "02001.July.04 AD 12:08 PM"),
        firstVector.get(5), 0);
    assertEquals(toMillis("EEE, d MMM yyyy HH:mm:ss Z", "Wed, 4 Jul 2001 12:08:56 -0700"),
        firstVector.get(6), 0);
  }

  /**
   * Reads {@code non-numeric-1.arff} and then {@code non-numeric-2.arff} into one shared model
   * and checks that the model ends up holding the nominals of both files.
   */
  @Test
  public void testMultipleNoms() throws Exception {
    MapBackedARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterable iterable = getVectors("non-numeric-1.arff", model);
    int count = countVectorsOfType(iterable, RandomAccessSparseVector.class, NOT_SPARSE);
    assertEquals(10, count);

    assertNonNumeric1Model(iterable, model);

    iterable = getVectors("non-numeric-2.arff", model);
    // Only the vector type is verified for the second file; its vector count is not asserted.
    countVectorsOfType(iterable, RandomAccessSparseVector.class, NOT_SPARSE);

    Map<String, Map<String, Integer>> nominalMap = model.getNominalMap();
    assertNotNull(nominalMap);
    assertEquals(2, nominalMap.size());

    Map<String, Integer> noms = nominalMap.get("test");
    assertNotNull("nominals for test are null", noms);
    assertEquals(2, noms.size());
  }

  /**
   * Parses an inline ARFF document with NUMERIC, INTEGER and REAL attributes and checks the
   * recorded attribute types and the values of the single data row.
   */
  @Test
  public void testNumerics() throws Exception {
    String arff = "@RELATION numerics\n"
        + "@ATTRIBUTE theNumeric NUMERIC\n"
        + "@ATTRIBUTE theInteger INTEGER\n"
        + "@ATTRIBUTE theReal REAL\n"
        + "@DATA\n"
        + "1.0,2,3.0";

    ARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterable iterable = new ARFFVectorIterable(arff, model);
    model = iterable.getModel();
    assertNotNull(model);

    assertEquals(3, model.getLabelSize());
    assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
    assertEquals(ARFFType.INTEGER, model.getARFFType(1));
    assertEquals(ARFFType.REAL, model.getARFFType(2));

    Vector vector = iterable.iterator().next();
    assertEquals(1.0, vector.get(0), EPSILON);
    assertEquals(2.0, vector.get(1), EPSILON);
    assertEquals(3.0, vector.get(2), EPSILON);
  }

  /**
   * Reads {@code quoted-id.arff} and checks that the relation name, attribute labels, nominal
   * values and data values come out as expected when identifiers are quoted.
   */
  @Test
  public void testQuotes() throws Exception {
    // ARFF allows quotes on identifiers
    ARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterable iterable = getVectors("quoted-id.arff", model);
    model = iterable.getModel();
    assertNotNull(model);
    assertEquals("quotes", model.getRelation());

    // check attribute labels
    assertEquals(4, model.getLabelSize());
    assertEquals(ARFFType.NUMERIC, model.getARFFType(0));
    assertEquals(ARFFType.INTEGER, model.getARFFType(1));
    assertEquals(ARFFType.REAL, model.getARFFType(2));
    assertEquals(ARFFType.NOMINAL, model.getARFFType(3));

    Map<String, Integer> labelBindings = model.getLabelBindings();
    assertTrue(labelBindings.containsKey("thenumeric"));
    assertTrue(labelBindings.containsKey("theinteger"));
    assertTrue(labelBindings.containsKey("thereal"));
    assertTrue(labelBindings.containsKey("thenominal"));

    // check nominal values
    Map<String, Integer> nominalMap = model.getNominalMap().get("thenominal");
    assertNotNull(nominalMap);
    assertEquals(3, nominalMap.size());
    assertTrue(nominalMap.containsKey("double-quote"));
    assertTrue(nominalMap.containsKey("single-quote"));
    assertTrue(nominalMap.containsKey("no-quote"));

    // check data values
    Iterator<Vector> it = iterable.iterator();
    Vector vector = it.next();
    assertEquals(nominalMap.get("no-quote"), vector.get(3), EPSILON);
    assertEquals(nominalMap.get("single-quote"), it.next().get(3), EPSILON);
    assertEquals(nominalMap.get("double-quote"), it.next().get(3), EPSILON);
  }

  /**
   * Loads a classpath resource as UTF-8 text and wraps it in an {@link ARFFVectorIterable}.
   *
   * @param resourceName name of the ARFF resource to load
   * @param model the model handed to the iterable
   * @return an iterable over the vectors of the resource
   * @throws IOException if the resource cannot be read or the iterable cannot be created
   */
  static ARFFVectorIterable getVectors(String resourceName, ARFFModel model) throws IOException {
    String sample = Resources.toString(Resources.getResource(resourceName), Charsets.UTF_8);
    return new ARFFVectorIterable(sample, model);
  }

  /**
   * Same as {@link #getVectors(String, ARFFModel)} but with a fresh {@link MapBackedARFFModel}.
   *
   * @param resourceName name of the ARFF resource to load
   * @return an iterable over the vectors of the resource
   * @throws IOException if the resource cannot be read or the iterable cannot be created
   */
  private static ARFFVectorIterable readModelFromResource(String resourceName) throws IOException {
    ARFFModel model = new MapBackedARFFModel();
    return getVectors(resourceName, model);
  }

  /**
   * Asserts that the iterator has another non-null vector of the expected type whose first two
   * elements match the given values within {@code EPSILON}.
   */
  private void assertNextVector(Iterator<Vector> iter,
                                Class<? extends Vector> expectedType,
                                double first,
                                double second) {
    assertTrue(iter.hasNext());
    Vector next = iter.next();
    assertNotNull(next);
    assertTrue("Wrong instanceof", expectedType.isInstance(next));
    assertEquals(first, next.get(0), EPSILON);
    assertEquals(second, next.get(1), EPSILON);
  }

  /**
   * Iterates over all vectors, asserting that each is an instance of the expected type.
   *
   * @return the number of vectors seen
   */
  private int countVectorsOfType(Iterable<Vector> vectors,
                                 Class<? extends Vector> expectedType,
                                 String message) {
    int count = 0;
    for (Vector vector : vectors) {
      assertTrue(message, expectedType.isInstance(vector));
      count++;
    }
    return count;
  }

  /**
   * Asserts the metadata expected after reading {@code non-numeric-1.arff}: one nominal
   * attribute ("bar") with five values, five typed attributes, ten words and one date format.
   */
  private void assertNonNumeric1Model(ARFFVectorIterable iterable, MapBackedARFFModel model) {
    Map<String, Map<String, Integer>> nominalMap = iterable.getModel().getNominalMap();
    assertNotNull(nominalMap);
    assertEquals(1, nominalMap.size());

    Map<String, Integer> noms = nominalMap.get("bar");
    assertNotNull("nominals for bar are null", noms);
    assertEquals(5, noms.size());

    Map<Integer, ARFFType> integerARFFTypeMap = model.getTypeMap();
    assertNotNull("Type map null", integerARFFTypeMap);
    assertEquals(5, integerARFFTypeMap.size());

    Map<String, Long> words = model.getWords();
    assertNotNull("words null", words);
    assertEquals(10, words.size());

    Map<Integer, DateFormat> integerDateFormatMap = model.getDateMap();
    assertNotNull("date format null", integerDateFormatMap);
    assertEquals(1, integerDateFormatMap.size());
  }

  /**
   * Parses {@code text} with an English-locale {@link SimpleDateFormat} built from
   * {@code pattern} and returns the resulting time in epoch milliseconds.
   *
   * @throws ParseException if the beginning of {@code text} does not match {@code pattern}
   */
  private static long toMillis(String pattern, String text) throws ParseException {
    DateFormat format = new SimpleDateFormat(pattern, Locale.ENGLISH);
    Date date = format.parse(text);
    return date.getTime();
  }
}