/*
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dkpro.core.api.embeddings;
import java.io.IOException;
/**
* An interface for vectorizers mapping tokens to embedding vectors.
*
* @see org.dkpro.core.api.embeddings.binary.BinaryVectorizer
* @see org.dkpro.core.api.embeddings.text.TextFormatVectorizer
*/
public interface Vectorizer
{
/**
* Get the vector for a token. If the token is unknown, implementing classes should return the
* {@link #unknownVector()}.
*
* @param token
* a token String
* @return a float array
* @throws IOException
* if there was an error accessing the vector file.
*/
float[] vectorize(String token)
throws IOException;
/**
* True if the token is known by the vectorizer.
*
* @param token a token String
* @return true if the token is known
*/
boolean contains(String token);
/**
* The vector for unknown tokens.
*
* @return a float array
*/
float[] unknownVector();
/**
* The dimensionality of the embeddings
*
* @return an int
*/
int dimensions();
/**
* The total number of known tokens.
*
* @return an int
*/
int size();
boolean isCaseless();
}