/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.utils.vectors.arff;
import com.google.common.collect.Maps;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
/**
 * An {@link ARFFModel} that keeps all ARFF header information (labels, types, date formats,
 * nominal values and string-to-id mappings) in in-memory {@link Map}s.
 *
 * <p>The class performs no synchronization of its own.
 */
public class MapBackedARFFModel implements ARFFModel {

  private static final Pattern QUOTE_PATTERN = Pattern.compile("\"");

  /** Next id handed out by {@link #processString(String)} for a word that has not been seen yet. */
  private long wordCount;
  private String relation;
  private final Map<String,Integer> labelBindings; // label -> vector index
  private final Map<Integer,String> idxLabel; // vector index -> label (reverse of labelBindings)
  private final Map<Integer,ARFFType> typeMap; // key is the vector index, value is the type
  private final Map<Integer,DateFormat> dateMap; // vector index -> date format for DATE attributes
  private final Map<String,Map<String,Integer>> nominalMap; // label -> (nominal value -> id)
  private final Map<String,Long> words; // string value -> id assigned to it

  /**
   * Creates an empty model whose word ids start at 1.
   */
  public MapBackedARFFModel() {
    this(new HashMap<String,Long>(), 1, new HashMap<String,Map<String,Integer>>());
  }

  /**
   * Creates a model on top of existing word and nominal maps. Both maps are stored by reference
   * (not copied), so later changes made through this model are visible to whoever else holds them.
   *
   * @param words
   *          map of string values to the ids already assigned to them
   * @param wordCount
   *          the id to assign to the next unseen string value
   * @param nominalMap
   *          map of label to (nominal value to id)
   */
  public MapBackedARFFModel(Map<String,Long> words, long wordCount, Map<String,Map<String,Integer>> nominalMap) {
    this.words = words;
    this.wordCount = wordCount;
    labelBindings = Maps.newHashMap();
    idxLabel = Maps.newHashMap();
    typeMap = Maps.newHashMap();
    dateMap = Maps.newHashMap();
    this.nominalMap = nominalMap;
  }

  @Override
  public String getRelation() {
    return relation;
  }

  @Override
  public void setRelation(String relation) {
    this.relation = relation;
  }

  /**
   * Convert a piece of String data at a specific spot into a value. All double-quote characters
   * are removed from the data and the result is trimmed before it is converted according to the
   * type registered for {@code idx}.
   *
   * @param data
   *          The data to convert
   * @param idx
   *          The position in the ARFF data
   * @return A double representing the data
   * @throws IllegalArgumentException
   *           if no type has been registered for {@code idx} via {@link #addType(Integer, ARFFType)}
   */
  @Override
  public double getValue(String data, int idx) {
    ARFFType type = typeMap.get(idx);
    if (type == null) {
      // switching on a null enum reference would otherwise fail with a bare NullPointerException
      throw new IllegalArgumentException("Attribute type cannot be NULL, attribute index was: " + idx);
    }
    String cleaned = QUOTE_PATTERN.matcher(data).replaceAll("").trim();
    switch (type) {
      case NUMERIC:
        return processNumeric(cleaned);
      case DATE:
        return processDate(cleaned, idx);
      case STRING:
        // may have quotes
        return processString(cleaned);
      case NOMINAL:
        return processNominal(idxLabel.get(idx), cleaned);
      default:
        throw new IllegalStateException("Unknown type: " + type);
    }
  }

  /**
   * Looks up the id registered for a nominal value.
   *
   * @param label
   *          the attribute label the nominal belongs to
   * @param data
   *          the nominal value
   * @return the registered id, widened to a double
   * @throws IllegalArgumentException
   *           if no nominals are registered for {@code label}
   * @throws IllegalStateException
   *           if {@code data} is not a registered nominal of {@code label}
   */
  protected double processNominal(String label, String data) {
    Map<String,Integer> classes = nominalMap.get(label);
    if (classes == null) {
      throw new IllegalArgumentException("Invalid nominal label: " + label + " Data: " + data);
    }
    Integer ord = classes.get(data);
    if (ord == null) {
      throw new IllegalStateException("Invalid nominal: " + data + " for label: " + label);
    }
    return ord;
  }

  /**
   * Maps a string value to a numeric id, assigning the next free id (and advancing the word
   * count) the first time a value is seen. Double quotes are removed before the lookup.
   *
   * @param data
   *          the string value
   * @return the id for the value, widened to a double
   */
  // Not sure how scalable this is going to be
  protected double processString(String data) {
    // kept even though getValue() already strips quotes: this method can also be called directly
    String word = QUOTE_PATTERN.matcher(data).replaceAll("");
    // map it to a long
    Long id = words.get(word);
    if (id == null) {
      id = wordCount++;
      words.put(word, id);
    }
    return id;
  }

  /**
   * Parses the data as a double.
   *
   * @param data
   *          the text to parse
   * @return the parsed value
   * @throws NumberFormatException
   *           if {@code data} is not a parsable double
   */
  protected static double processNumeric(String data) {
    return Double.parseDouble(data);
  }

  /**
   * Parses the data as a date and returns its epoch time in milliseconds. Uses the format
   * registered for {@code idx}, or {@code yyyy-MM-dd'T'HH:mm:ss} (English locale) when none is
   * registered.
   *
   * @param data
   *          the text to parse
   * @param idx
   *          the position in the ARFF data, used to look up the date format
   * @return the value of {@link Date#getTime()} for the parsed date, widened to a double
   * @throws IllegalArgumentException
   *           wrapping the {@link ParseException} if the data does not match the format
   */
  protected double processDate(String data, int idx) {
    DateFormat format = dateMap.get(idx);
    if (format == null) {
      // a fresh instance per call: SimpleDateFormat is not safe to share between threads
      format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
    }
    try {
      Date date = format.parse(data);
      // long -> double is exact as long as the magnitude stays below 2^53 milliseconds
      return date.getTime();
    } catch (ParseException e) {
      throw new IllegalArgumentException(e);
    }
  }

  /**
   * The vector attributes (labels in Mahout speak), unmodifiable
   *
   * @return an unmodifiable view of the label to index map
   */
  @Override
  public Map<String,Integer> getLabelBindings() {
    return Collections.unmodifiableMap(labelBindings);
  }

  /**
   * The map of types encountered
   *
   * @return an unmodifiable view of the index to type map
   */
  public Map<Integer,ARFFType> getTypeMap() {
    return Collections.unmodifiableMap(typeMap);
  }

  /**
   * Map of Date formatters used
   *
   * @return an unmodifiable view of the index to date format map
   */
  public Map<Integer,DateFormat> getDateMap() {
    return Collections.unmodifiableMap(dateMap);
  }

  /**
   * Map nominals to ids. Should only be modified by calling {@link ARFFModel#addNominal(String, String, int)}
   *
   * @return the backing map itself (not a copy and not an unmodifiable view)
   */
  @Override
  public Map<String,Map<String,Integer>> getNominalMap() {
    return nominalMap;
  }

  /**
   * Map of words to the long id used for those words. This is the backing map itself, not an
   * immutable copy: {@link #processString(String)} adds entries to it.
   *
   * @return The map
   */
  @Override
  public Map<String,Long> getWords() {
    return words;
  }

  /**
   * Looks up the id of a nominal value without throwing.
   *
   * @param label
   *          the attribute label
   * @param nominal
   *          the nominal value
   * @return the registered id, or {@code null} if the label or the nominal is unknown
   */
  @Override
  public Integer getNominalValue(String label, String nominal) {
    Map<String,Integer> noms = nominalMap.get(label);
    return noms == null ? null : noms.get(nominal);
  }

  /**
   * Registers a nominal value for a label, creating the label's map on first use.
   *
   * @param label
   *          the attribute label
   * @param nominal
   *          the nominal value
   * @param idx
   *          the id to associate with the nominal value
   */
  @Override
  public void addNominal(String label, String nominal, int idx) {
    Map<String,Integer> noms = nominalMap.get(label);
    if (noms == null) {
      noms = Maps.newHashMap();
      nominalMap.put(label, noms);
    }
    noms.put(nominal, idx);
  }

  /**
   * @return the date format registered for {@code idx}, or {@code null} if there is none
   */
  @Override
  public DateFormat getDateFormat(Integer idx) {
    return dateMap.get(idx);
  }

  /**
   * Registers the date format to use for the attribute at {@code idx}.
   */
  @Override
  public void addDateFormat(Integer idx, DateFormat format) {
    dateMap.put(idx, format);
  }

  /**
   * @return the index bound to {@code label}, or {@code null} if the label is unknown
   */
  @Override
  public Integer getLabelIndex(String label) {
    return labelBindings.get(label);
  }

  /**
   * Binds a label to an index and records the reverse (index to label) mapping as well.
   */
  @Override
  public void addLabel(String label, Integer idx) {
    labelBindings.put(label, idx);
    idxLabel.put(idx, label);
  }

  /**
   * @return the type registered for {@code idx}, or {@code null} if there is none
   */
  @Override
  public ARFFType getARFFType(Integer idx) {
    return typeMap.get(idx);
  }

  /**
   * Registers the type of the attribute at {@code idx}.
   */
  @Override
  public void addType(Integer idx, ARFFType type) {
    typeMap.put(idx, type);
  }

  /**
   * The current word counter: the id that will be assigned to the next unseen string value.
   *
   * @return the count
   */
  @Override
  public long getWordCount() {
    return wordCount;
  }

  /**
   * @return the number of labels bound so far
   */
  @Override
  public int getLabelSize() {
    return labelBindings.size();
  }
}