package edu.uncc.cs.watsonsim.nlp;
import java.util.List;
import java.util.Optional;
import java.util.stream.Stream;
import edu.uncc.cs.watsonsim.KV;
public class DenseVectors {
public static final int N = 300;
private static final KV kv = new KV();
/**
* Possibly get a vector context for a word (otherwise an empty Optional)
* @param word The word in question
* @return A Optional<float[]> for that word, or Optional.empty()
*/
public static Optional<float[]> vectorFor(String word) {
if (word == null || word.isEmpty()) {
return Optional.empty();
} else {
return kv.get("big-glove", word).map(KV::asVector);
}
}
/**
* Find the cosine similarity of two vectors, which may or may not exist.
* This is pessimistic, saying that if we have never seen a word before, it
* is probably unrelated to everyone
* @return
*/
public static double sim(float[] left, float[] right) {
/*
* A.T * B
* -----------------------
* sqrt(A.T*A) sqrt(B.T*B)
*/
assert left.length == N;
assert right.length == N;
double ab = 0.0, aa = 0.0, bb = 0.0;
for (int i=0; i<Math.min(left.length, right.length); i++) {
ab += left [i] * right[i];
aa += left [i] * left [i];
bb += right[i] * right[i];
}
if (aa == 0.0 || bb == 0.0) return 0;
else return ab / (Math.sqrt(aa) * Math.sqrt(bb));
}
/**
* Tiny wrapper around sim(float[], float[]) for optional-word situations
*/
public static double sim(Optional<float[]> left, Optional<float[]> right) {
if (left.isPresent() && right.isPresent())
return sim(left.get(), right.get());
else
return 0.0;
}
/**
* Average some vectors, as a multi-word model. This is not very meaningful
* and may do strange things for the semantics. (e.g. we plan to do better)
*/
public static float[] mean(List<float[]> vecs) {
float[] mean = new float[N];
int count = 0;
for (float[] vec: vecs) {
for (int i=0; i<N; i++) mean[i] += vec[i];
count++;
}
if (count>0) for (int i=0; i<N; i++) mean[i] /= count;
return mean;
}
/**
* Multiply many vectors, as a multi-word model. It can be better than mean
* but it's still not a syntactic parse.
*/
public static float[] logproduct(List<float[]> vecs) {
float[] logprod = new float[N];
int count = 0;
for (float[] vec: vecs) {
for (int i=0; i<N; i++) logprod[i] += Math.log(Math.abs(vec[i]));
count++;
}
for (int i=0; i<N; i++) logprod[i] /= count;
return logprod;
}
}