/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.utils.vectors.arff;

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.MahoutTestCase;
import org.junit.Test;

/**
 * Tests for {@link ARFFVectorIterable}: checks the vector types and values produced from
 * in-memory ARFF text and the metadata collected in the backing {@link ARFFModel}.
 */
public final class ARFFVectorIterableTest extends MahoutTestCase {

  /** Mixed dense and sparse data rows, plus relation name and label bindings. */
  @Test
  public void testValues() throws Exception {
    StringBuilder builder = new StringBuilder();
    builder.append("%comments").append('\n')
        .append("@RELATION Mahout").append('\n')
        .append("@ATTRIBUTE foo numeric").append('\n')
        .append("@ATTRIBUTE bar numeric").append('\n')
        .append("@ATTRIBUTE timestamp DATE \"yyyy-MM-dd HH:mm:ss\"").append('\n')
        .append("@ATTRIBUTE junk string").append('\n')
        .append("@ATTRIBUTE theNominal {c,b,a}").append('\n')
        .append("@DATA").append('\n')
        .append("1,2, \"2009-01-01 5:55:55\", foo, c").append('\n')
        .append("2,3").append('\n')
        .append("{0 5,1 23}").append('\n');
    ARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterable iterable = new ARFFVectorIterable(builder.toString(), model);
    assertEquals("Mahout", iterable.getModel().getRelation());
    Map<String,Integer> bindings = iterable.getModel().getLabelBindings();
    assertNotNull(bindings);
    assertEquals(5, bindings.size());

    Iterator<Vector> iter = iterable.iterator();
    assertNextVector(iter, DenseVector.class, 1.0, 2.0);
    assertNextVector(iter, DenseVector.class, 2.0, 3.0);
    assertNextVector(iter, RandomAccessSparseVector.class, 5.0, 23.0);
    assertFalse(iter.hasNext());
  }

  /** Every comma-separated data row must come back as a {@link DenseVector}. */
  @Test
  public void testDense() throws Exception {
    ARFFModel model = new MapBackedARFFModel();
    Iterable<Vector> iterable = new ARFFVectorIterable(SAMPLE_DENSE_ARFF, model);
    int count = 0;
    for (Vector vector : iterable) {
      assertTrue("Vector is not dense", vector instanceof DenseVector);
      count++;
    }
    assertEquals(10, count);
  }

  /** Every brace-delimited data row must come back as a {@link RandomAccessSparseVector}. */
  @Test
  public void testSparse() throws Exception {
    ARFFModel model = new MapBackedARFFModel();
    Iterable<Vector> iterable = new ARFFVectorIterable(SAMPLE_SPARSE_ARFF, model);
    assertEquals(10, countSparseVectors(iterable));
  }

  /** Nominal, string and date attributes are recorded in the model while vectors are read. */
  @Test
  public void testNonNumeric() throws Exception {
    MapBackedARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterable iterable = new ARFFVectorIterable(NON_NUMERIC_ARFF, model);
    int count = countSparseVectors(iterable);

    // Second pass over the same text with the same model instance.
    iterable = new ARFFVectorIterable(NON_NUMERIC_ARFF, model);
    Iterator<Vector> iter = iterable.iterator();
    Vector firstVector = iter.next();
    assertEquals(1.0, firstVector.get(2), 0);
    assertEquals(10, count);

    assertNonNumericModel(iterable, model);
  }

  /**
   * Each date column of the first data row must equal the epoch milliseconds obtained by
   * parsing the same text with the same pattern through {@link SimpleDateFormat}.
   */
  @Test
  public void testDate() throws Exception {
    MapBackedARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterable iterable = new ARFFVectorIterable(DATE_ARFF, model);
    Iterator<Vector> iter = iterable.iterator();
    Vector firstVector = iter.next();

    // date1 is declared without a pattern in DATE_ARFF; this is the pattern the test expects.
    assertEquals(parseMillis("yyyy-MM-dd'T'HH:mm:ss", "2001-07-04T12:08:56"),
        firstVector.get(1), 0);
    assertEquals(parseMillis("yyyy.MM.dd G 'at' HH:mm:ss z", "2001.07.04 AD at 12:08:56 PDT"),
        firstVector.get(2), 0);
    // The text is longer than the pattern; DateFormat.parse(String) may stop before the end.
    assertEquals(parseMillis("EEE, MMM d, ''yy", "Wed, Jul 4, '01,4 0:08 PM, PDT"),
        firstVector.get(3), 0);
    assertEquals(parseMillis("K:mm a, z", "0:08 PM, PDT"),
        firstVector.get(4), 0);
    assertEquals(parseMillis("yyyyy.MMMMM.dd GGG hh:mm aaa", "02001.July.04 AD 12:08 PM"),
        firstVector.get(5), 0);
    assertEquals(parseMillis("EEE, d MMM yyyy HH:mm:ss Z", "Wed, 4 Jul 2001 12:08:56 -0700"),
        firstVector.get(6), 0);
  }

  /**
   * A second model seeded with the words, word count and nominal map of the first one must
   * end up holding the nominals of both files.
   */
  @Test
  public void testMultipleNoms() throws Exception {
    MapBackedARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterable iterable = new ARFFVectorIterable(NON_NUMERIC_ARFF, model);
    assertEquals(10, countSparseVectors(iterable));
    assertNonNumericModel(iterable, model);

    model = new MapBackedARFFModel(model.getWords(), model.getWordCount(), model.getNominalMap());
    iterable = new ARFFVectorIterable(NON_NUMERIC_ARFF2, model);
    // NON_NUMERIC_ARFF2 has ten data rows as well; previously this count was never checked.
    assertEquals(10, countSparseVectors(iterable));

    Map<String,Map<String,Integer>> nominalMap = model.getNominalMap();
    assertNotNull(nominalMap);
    assertEquals(2, nominalMap.size());
    Map<String,Integer> noms = nominalMap.get("test");
    assertNotNull("nominals for test are null", noms);
    assertEquals(2, noms.size());
  }

  /**
   * Pulls the next vector from {@code iter} and checks its concrete type and first two values.
   *
   * @param iter iterator that must still have an element
   * @param expectedType class the next vector must be an instance of
   * @param first expected value at index 0
   * @param second expected value at index 1
   */
  private void assertNextVector(Iterator<Vector> iter,
                                Class<? extends Vector> expectedType,
                                double first,
                                double second) {
    assertTrue(iter.hasNext());
    Vector next = iter.next();
    assertNotNull(next);
    assertTrue("Wrong instanceof", expectedType.isInstance(next));
    assertEquals(first, next.get(0), EPSILON);
    assertEquals(second, next.get(1), EPSILON);
  }

  /**
   * Iterates all vectors, asserting each one is a {@link RandomAccessSparseVector}.
   *
   * @param vectors vectors to walk through
   * @return the number of vectors seen
   */
  private int countSparseVectors(Iterable<Vector> vectors) {
    int count = 0;
    for (Vector vector : vectors) {
      assertTrue("Vector is not sparse", vector instanceof RandomAccessSparseVector);
      count++;
    }
    return count;
  }

  /**
   * Checks the metadata expected after reading {@link #NON_NUMERIC_ARFF}: one nominal
   * attribute ("bar") with two values, five typed attributes, ten words and one date format.
   *
   * @param iterable iterable whose model supplies the nominal map
   * @param model model that was handed to {@code iterable}
   */
  private void assertNonNumericModel(ARFFVectorIterable iterable, MapBackedARFFModel model) {
    Map<String,Map<String,Integer>> nominalMap = iterable.getModel().getNominalMap();
    assertNotNull(nominalMap);
    assertEquals(1, nominalMap.size());
    Map<String,Integer> noms = nominalMap.get("bar");
    assertNotNull("nominals for bar are null", noms);
    assertEquals(2, noms.size());

    Map<Integer,ARFFType> typeMap = model.getTypeMap();
    assertNotNull("Type map null", typeMap);
    assertEquals(5, typeMap.size());

    Map<String,Long> words = model.getWords();
    assertNotNull("words null", words);
    assertEquals(10, words.size());

    Map<Integer,DateFormat> dateFormatMap = model.getDateMap();
    assertNotNull("date format null", dateFormatMap);
    assertEquals(1, dateFormatMap.size());
  }

  /**
   * Parses {@code text} with an English-locale {@link SimpleDateFormat} built from
   * {@code pattern}.
   *
   * @param pattern date pattern
   * @param text text to parse; only a leading portion has to match the pattern
   * @return the parsed instant as epoch milliseconds
   * @throws java.text.ParseException if the beginning of {@code text} cannot be parsed
   */
  private static long parseMillis(String pattern, String text) throws java.text.ParseException {
    DateFormat format = new SimpleDateFormat(pattern, Locale.ENGLISH);
    Date date = format.parse(text);
    return date.getTime();
  }

  /** Four numeric attributes, ten comma-separated (dense) data rows. */
  private static final String SAMPLE_DENSE_ARFF = " % Comments\n"
      + " % \n"
      + " % Comments go here"
      + " % \n"
      + " @RELATION Mahout\n"
      + '\n'
      + " @ATTRIBUTE foo NUMERIC\n"
      + " @ATTRIBUTE bar NUMERIC\n"
      + " @ATTRIBUTE hockey NUMERIC\n"
      + " @ATTRIBUTE football NUMERIC\n"
      + " \n"
      + '\n'
      + '\n'
      + " @DATA\n"
      + " 23.1,3.23,1.2,0.2\n"
      + " 2.9,3.0,1.2,0.2\n"
      + " 2.7,3.2,1.3,0.2\n"
      + " 2.6,3.1,1.23,0.2\n"
      + " 23.0,3.6,1.2,0.2\n"
      + " 23.2,3.9,1.7,0.2\n"
      + " 2.6,3.2,1.2,0.3\n"
      + " 23.0,3.2,1.23,0.2\n"
      + " 2.2,2.9,1.2,0.2\n"
      + " 2.9,3.1,1.23,0.1\n";

  /** Five numeric attributes, ten brace-delimited (sparse) data rows. */
  private static final String SAMPLE_SPARSE_ARFF = " % Comments\n"
      + " % \n"
      + " % Comments go here"
      + " % \n"
      + " @RELATION Mahout\n"
      + '\n'
      + " @ATTRIBUTE foo NUMERIC\n"
      + " @ATTRIBUTE bar NUMERIC\n"
      + " @ATTRIBUTE hockey NUMERIC\n"
      + " @ATTRIBUTE football NUMERIC\n"
      + " @ATTRIBUTE tennis NUMERIC\n"
      + " \n"
      + '\n'
      + '\n'
      + " @DATA\n"
      + " {1 23.1,2 3.23,3 1.2,4 0.2}\n"
      + " {0 2.9}\n"
      + " {0 2.7,2 3.2,3 1.3,4 0.2}\n"
      + " {1 2.6,2 3.1,3 1.23,4 0.2}\n"
      + " {1 23.0,2 3.6,3 1.2,4 0.2}\n"
      + " {0 23.2,1 3.9,3 1.7,4 0.2}\n"
      + " {0 2.6,1 3.2,2 1.2,4 0.3}\n"
      + " {1 23.0,2 3.2,3 1.23}\n"
      + " {1 2.2,2 2.94,3 0.2}\n"
      + " {1 2.9,2 3.1}\n";

  /** Sparse rows mixing numeric, nominal ("bar"), string and date attributes. */
  private static final String NON_NUMERIC_ARFF = " % Comments\n"
      + " % \n"
      + " % Comments go here"
      + " % \n"
      + " @RELATION Mahout\n"
      + '\n'
      + " @ATTRIBUTE junk NUMERIC\n"
      + " @ATTRIBUTE foo NUMERIC\n"
      + " @ATTRIBUTE bar {c,d}\n"
      + " @ATTRIBUTE hockey string\n"
      + " @ATTRIBUTE football date \"yyyy-MM-dd\"\n"
      + " \n"
      + '\n'
      + '\n'
      + " @DATA\n"
      + " {2 c,3 gretzky,4 1973-10-23}\n"
      + " {1 2.9,2 d,3 orr,4 1973-11-23}\n"
      + " {2 c,3 bossy,4 1981-10-23}\n"
      + " {1 2.6,2 c,3 lefleur,4 1989-10-23}\n"
      + " {3 esposito,4 1973-04-23}\n"
      + " {1 23.2,2 d,3 chelios,4 1999-2-23}\n"
      + " {3 richard,4 1973-10-12}\n"
      + " {3 howe,4 1983-06-23}\n"
      + " {0 2.2,2 d,3 messier,4 2008-11-23}\n"
      + " {2 c,3 roy,4 1973-10-13}\n";

  /** Same layout as {@link #NON_NUMERIC_ARFF} but with nominal attribute "test" {f,z}. */
  private static final String NON_NUMERIC_ARFF2 = " % Comments\n"
      + " % \n"
      + " % Comments go here"
      + " % \n"
      + " @RELATION Mahout\n"
      + '\n'
      + " @ATTRIBUTE junk NUMERIC\n"
      + " @ATTRIBUTE foo NUMERIC\n"
      + " @ATTRIBUTE test {f,z}\n"
      + " @ATTRIBUTE hockey string\n"
      + " @ATTRIBUTE football date \"yyyy-MM-dd\"\n"
      + " \n"
      + '\n'
      + '\n'
      + " @DATA\n"
      + " {2 f,3 gretzky,4 1973-10-23}\n"
      + " {1 2.9,2 z,3 orr,4 1973-11-23}\n"
      + " {2 f,3 bossy,4 1981-10-23}\n"
      + " {1 2.6,2 f,3 lefleur,4 1989-10-23}\n"
      + " {3 esposito,4 1973-04-23}\n"
      + " {1 23.2,2 z,3 chelios,4 1999-2-23}\n"
      + " {3 richard,4 1973-10-12}\n"
      + " {3 howe,4 1983-06-23}\n"
      + " {0 2.2,2 f,3 messier,4 2008-11-23}\n"
      + " {2 f,3 roy,4 1973-10-13}\n";

  /**
   * One numeric attribute followed by six date attributes. {@code date1} carries no explicit
   * type keyword or pattern (presumably exercising the model's default date format -- confirm
   * against ARFFModel); the others each declare their own pattern.
   */
  private static final String DATE_ARFF = " % Comments\n"
      + " % \n"
      + " % Comments go here"
      + " % \n"
      + " @RELATION MahoutDateTest\n"
      + '\n'
      + " @ATTRIBUTE junk NUMERIC\n"
      + " @ATTRIBUTE date1 \n"
      + " @ATTRIBUTE date2 date \"yyyy.MM.dd G 'at' HH:mm:ss z\" \n"
      + " @ATTRIBUTE date3 date \"EEE, MMM d, ''yy\" \n"
      + " @ATTRIBUTE date4 date \"K:mm a, z\" \n"
      + " @ATTRIBUTE date5 date \"yyyyy.MMMMM.dd GGG hh:mm aaa\" \n"
      + " @ATTRIBUTE date6 date \"EEE, d MMM yyyy HH:mm:ss Z\" \n"
      + " \n"
      + '\n'
      + '\n'
      + " @DATA\n"
      + " {0 1,1 \"2001-07-04T12:08:56\",2 \"2001.07.04 AD at 12:08:56 PDT\",3 \"Wed, Jul 4, '01,4 0:08 PM, PDT\",4 \"0:08 PM, PDT\", 5 \"02001.July.04 AD 12:08 PM\" ,6 \"Wed, 4 Jul 2001 12:08:56 -0700\" }\n"
      + " {0 2,1 \"2001-08-04T12:09:56\",2 \"2011.07.04 AD at 12:08:56 PDT\",3 \"Mon, Jul 4, '11,4 0:08 PM, PDT\",4 \"0:08 PM, PDT\", 5 \"02001.July.14 AD 12:08 PM\" ,6 \"Mon, 4 Jul 2011 12:08:56 -0700\" }\n";
}