/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.vectorizer.encoders;
import com.google.common.base.Charsets;
import java.util.Collections;
import java.util.Map;
/**
* Encodes a categorical values with an unbounded vocabulary. Values are encoding by incrementing a
* few locations in the output vector with a weight that is either defaulted to 1 or that is looked
* up in a weight dictionary. By default, only one probe is used which should be fine but could
* cause a decrease in the speed of learning because more features will be non-zero. If a large
* feature vector is used so that the probability of feature collisions is suitably small, then this
* can be decreased to 1. If a very small feature vector is used, the number of probes should
* probably be increased to 3.
*/
public class StaticWordValueEncoder extends WordValueEncoder {
private Map<String, Double> dictionary;
private double missingValueWeight = 1;
private final byte[] nameBytes;
public StaticWordValueEncoder(String name) {
super(name);
nameBytes = bytesForString(name);
}
@Override
protected int hashForProbe(byte[] originalForm, int dataSize, String name, int probe) {
return hash(nameBytes, originalForm, WORD_LIKE_VALUE_HASH_SEED + probe, dataSize);
}
/**
* Sets the weighting dictionary to be used by this encoder. Also sets the missing value weight
* to be half the smallest weight in the dictionary.
*
* @param dictionary The dictionary to use to look up weights.
*/
public void setDictionary(Map<String, Double> dictionary) {
this.dictionary = dictionary;
missingValueWeight = Collections.min(dictionary.values()) / 2;
}
/**
* Sets the weight that is to be used for values that do not appear in the dictionary.
*
* @param missingValueWeight The default weight for missing values.
*/
public void setMissingValueWeight(double missingValueWeight) {
this.missingValueWeight = missingValueWeight;
}
@Override
protected double weight(byte[] originalForm) {
double weight = missingValueWeight;
if (dictionary != null) {
String s = new String(originalForm, Charsets.UTF_8);
if (dictionary.containsKey(s)) {
weight = dictionary.get(s);
}
}
return weight;
}
}