/**
* Copyright (c) 2011, the SemanticVectors AUTHORS.
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* Neither the name of the University of Pittsburgh nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
package di.uniba.it.tri.vectors;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Random;
import java.util.logging.Logger;
/**
* Real number implementation of Vector.
*
* <p>
* Supports both sparse and dense formats. Some methods automatically transform
* from sparse to dense format. (Method documentation should cover this when
* there are performance consequences.
*
* @author Dominic Widdows
*/
public class RealVector implements Vector {
/**
* Enumeration of binding operation options. Change at compile-time to
* experiment.
*/
public enum RealBindMethod {
/**
* Uses permutation operations, as in Sahlgren et al. Fast, exact
* inverse, but lossy memory of slot-filling.
*/
PERMUTATION,
/**
* Uses convolution operations, as in Mewhort and Jones, BEAGLE. Slower,
* but optimized using Fast Fourier Transforms. Approximate inverse, but
* keeps memory of slot-filling.
*/
CONVOLUTION
}
/**
*
*/
public static RealBindMethod BIND_METHOD = RealBindMethod.CONVOLUTION;
/**
*
* @param bindMethod
*/
public static void setBindType(RealBindMethod bindMethod) {
logger.info("Globally setting real vector BIND_METHOD to: '" + bindMethod + "'");
BIND_METHOD = bindMethod;
}
/**
*
*/
public static final Logger logger = Logger.getLogger(RealVector.class.getCanonicalName());
/**
* Returns {@link VectorType#REAL}
* @return
*/
public VectorType getVectorType() {
return VectorType.REAL;
}
private final int dimension;
/**
* Dense representation. Coordinates can be anything expressed by floats.
*/
private float[] coordinates;
/**
* Sparse representation. Coordinates can only be +/-1. Array of short
* signed integers, indices to the array locations where a +/-1 entry is
* located. See also {@link generateRandomVector}.
*/
private short[] sparseOffsets;
private boolean isSparse;
/**
*
* @param dimension
*/
protected RealVector(int dimension) {
this.dimension = dimension;
this.sparseOffsets = new short[0];
this.isSparse = true;
}
/**
* Returns a new copy of this vector, in dense format.
* @return
*/
public RealVector copy() {
if (isSparse) {
RealVector copy = new RealVector(dimension);
copy.sparseOffsets = new short[sparseOffsets.length];
for (int i = 0; i < sparseOffsets.length; ++i) {
copy.sparseOffsets[i] = sparseOffsets[i];
}
return copy;
} else {
float[] coordinatesCopy = new float[dimension];
for (int i = 0; i < dimension; ++i) {
coordinatesCopy[i] = coordinates[i];
}
return new RealVector(coordinatesCopy);
}
}
public String toString() {
StringBuilder debugString = new StringBuilder("RealVector.");
// TODO(widdows): Add heap location?
if (isSparse) {
debugString.append(" Sparse. Offsets are:\n");
for (short sparseOffset : sparseOffsets) {
debugString.append(sparseOffset + " ");
}
debugString.append("\n");
} else {
debugString.append(" Dense. Coordinates are:\n");
for (float coordinate : coordinates) {
debugString.append(coordinate + " ");
}
debugString.append("\n");
}
return debugString.toString();
}
@Override
public int getDimension() {
return dimension;
}
/**
*
* @param dimension
* @return
*/
public RealVector createZeroVector(int dimension) {
return new RealVector(dimension);
}
@Override
public boolean isZeroVector() {
if (isSparse) {
return sparseOffsets.length == 0;
} else {
for (float coordinate : coordinates) {
if (coordinate != 0) {
return false;
}
}
return true;
}
}
/**
* Generates a basic sparse vector with mainly zeros and some 1 and -1
* entries (seedLength/2 of each) each vector is an array of length
* seedLength containing 1+ the index of a non-zero value, signed according
* to whether this is a + or -1.
* <br>
* e.g. +20 indicates a +1 in position 19, +1 would indicate a +1 in
* position 0. -20 indicates a -1 in position 19, -1 would indicate a -1 in
* position 0.
* <br>
* The extra offset of +1 is because position 0 would be unsigned, and would
* therefore be wasted. Consequently we've chosen to make the code slightly
* more complicated to make the implementation slightly more space
* efficient.
*
* If seedlength == dimension, a dense real vector is generated instead,
* with each dimension initialized to a real value between -1 and 1
*
* @param seedLength
* @return Sparse representation of basic ternary vector.
*/
public RealVector generateRandomVector(int dimension, int seedLength, Random random) {
RealVector randomVector = new RealVector(dimension);
//allow for dense random vectors, with each value initalized at random between -1 and 1
if (seedLength == dimension) {
return generateDenseRandomVector(dimension, seedLength, random);
}
boolean[] occupiedPositions = new boolean[dimension];
randomVector.sparseOffsets = new short[seedLength];
int testPlace, entryCount = 0;
// Put in +1 entries.
while (entryCount < seedLength / 2) {
testPlace = random.nextInt(dimension);
if (!occupiedPositions[testPlace]) {
occupiedPositions[testPlace] = true;
randomVector.sparseOffsets[entryCount]
= new Integer(testPlace + 1).shortValue();
entryCount++;
}
}
// Put in -1 entries.
while (entryCount < seedLength) {
testPlace = random.nextInt(dimension);
if (!occupiedPositions[testPlace]) {
occupiedPositions[testPlace] = true;
randomVector.sparseOffsets[entryCount]
= new Integer((1 + testPlace) * -1).shortValue();
entryCount++;
}
}
return randomVector;
}
/**
* Generates a basic dense vector with values assigned at random to a real
* value between -1 and 1
*
* @param dimension
* @param seedLength
* @param random
* @return Dense representation of basic real vector.
*/
public RealVector generateDenseRandomVector(int dimension, int seedLength, Random random) {
RealVector randomVector = new RealVector(dimension);
randomVector.sparseToDense();
for (int q = 0; q < dimension; q++) {
randomVector.coordinates[q] = (float) random.nextDouble();
}
for (int q = 0; q < dimension; q++) {
if (random.nextBoolean()) {
randomVector.coordinates[q] *= -1;
}
}
randomVector.normalize();
return randomVector;
}
@Override
/**
* Measures overlap of two vectors using cosine similarity.
*
* Causes this and other vector to be converted to dense representation.
*/
public double measureOverlap(Vector other) {
IncompatibleVectorsException.checkVectorsCompatible(this, other);
if (isZeroVector()) {
return 0;
}
RealVector realOther = (RealVector) other;
if (realOther.isZeroVector()) {
return 0;
}
if (isSparse) {
sparseToDense();
}
if (realOther.isSparse) {
realOther.sparseToDense();
}
double result = 0;
double norm1 = 0;
double norm2 = 0;
for (int i = 0; i < dimension; ++i) {
result += coordinates[i] * realOther.coordinates[i];
norm1 += coordinates[i] * coordinates[i];
norm2 += realOther.coordinates[i] * realOther.coordinates[i];
}
return result / Math.sqrt(norm1 * norm2);
}
@Override
/**
* Adds the other vector to this one. This vector is cast to dense format;
* other vector is left in sparse format if originally sparse.
*/
public void superpose(Vector other, double weight, int[] permutation) {
IncompatibleVectorsException.checkVectorsCompatible(this, other);
RealVector realOther = (RealVector) other;
if (isSparse) {
sparseToDense();
}
if (realOther.isSparse) {
for (int i = 0; i < realOther.sparseOffsets.length; ++i) {
int entry = Integer.signum(realOther.sparseOffsets[i]);
int positionToAdd = Math.abs(realOther.sparseOffsets[i]) - 1;
if (permutation != null) {
positionToAdd = permutation[positionToAdd];
}
coordinates[positionToAdd] += entry * weight;
}
} else {
for (int i = 0; i < dimension; ++i) {
int positionToAdd = i;
if (permutation != null) {
positionToAdd = permutation[positionToAdd];
}
coordinates[positionToAdd] += realOther.coordinates[i] * weight;
}
}
}
@Override
/**
* Implements binding depending on {@link #BIND_TYPE}
*/
public void bind(Vector other) {
IncompatibleVectorsException.checkVectorsCompatible(this, other);
RealVector realOther = (RealVector) other;
if (isSparse) {
sparseToDense();
}
switch (BIND_METHOD) {
case PERMUTATION:
bindWithPermutation(realOther);
return;
case CONVOLUTION:
bindWithConvolution(realOther);
return;
}
}
@Override
/**
* Implements release depending on {@link #BIND_TYPE}
*/
public void release(Vector other) {
IncompatibleVectorsException.checkVectorsCompatible(this, other);
RealVector realOther = (RealVector) other;
if (isSparse) {
sparseToDense();
}
switch (BIND_METHOD) {
case PERMUTATION:
releaseWithPermutation(realOther);
return;
case CONVOLUTION:
releaseWithConvolution(realOther);
return;
}
}
/**
*
* @param realOther
*/
public void bindWithConvolution(RealVector realOther) {
RealVector result = RealVectorUtils.fftConvolution(this, realOther);
this.coordinates = result.coordinates;
}
/**
* Implements release using {@link RealVectorUtils#fftApproxInvConvolution}
* @param other
*/
public void releaseWithConvolution(RealVector other) {
RealVector result = RealVectorUtils.fftApproxInvConvolution(other, this);
this.coordinates = result.coordinates;
}
/**
* Implements binding as a single-shift permutation. Currently wasteful;
* allocates the permutation array each time.
* @param other
*/
public void bindWithPermutation(RealVector other) {
RealVector result = createZeroVector(dimension);
result.superpose(
other, 1, PermutationUtils.getShiftPermutation(VectorType.REAL, dimension, 1));
result.superpose(
this, 1, PermutationUtils.getShiftPermutation(VectorType.REAL, dimension, -1));
this.coordinates = result.coordinates;
}
/**
* Implements release using the {@link #bindWithPermutation}.
* @param other
*/
public void releaseWithPermutation(RealVector other) {
RealVector result = createZeroVector(dimension);
this.superpose(
other, -1, PermutationUtils.getShiftPermutation(VectorType.REAL, dimension, 1));
result.superpose(
this, 1, PermutationUtils.getShiftPermutation(VectorType.REAL, dimension, 1));
this.coordinates = result.coordinates;
}
@Override
/**
* Normalizes the vector, converting sparse to dense representations in the
* process.
*/
public void normalize() {
if (this.isSparse) {
this.sparseToDense();
}
double normSq = 0;
for (int i = 0; i < dimension; ++i) {
normSq += coordinates[i] * coordinates[i];
}
float norm = (float) Math.sqrt(normSq);
for (int i = 0; i < dimension; ++i) {
coordinates[i] = coordinates[i] / norm;
}
}
@Override
/**
* Writes vector out in dense format. If vector is originally sparse, writes
* out a copy so that vector remains sparse.
*/
public void writeToStream(DataOutputStream outputStream) {
float[] coordsToWrite;
if (isSparse) {
RealVector copy = copy();
copy.sparseToDense();
coordsToWrite = copy.coordinates;
} else {
coordsToWrite = coordinates;
}
for (int i = 0; i < dimension; ++i) {
try {
outputStream.writeInt(Float.floatToIntBits(coordsToWrite[i]));
} catch (IOException e) {
e.printStackTrace();
}
}
}
@Override
/**
* Reads a (dense) version of a vector from a Lucene input stream.
*/
public void readFromStream(DataInputStream inputStream) {
if (isSparse) {
coordinates = new float[dimension];
sparseOffsets = null;
isSparse = false;
}
for (int i = 0; i < dimension; ++i) {
try {
coordinates[i] = Float.intBitsToFloat(inputStream.readInt());
} catch (IOException e) {
logger.severe("Failed to parse vector from Lucene stream. This signifies a "
+ "programming or runtime error, e.g., a dimension mismatch.");
e.printStackTrace();
}
}
}
@Override
/**
* Writes vector to a string of the form x1|x2|x3| ... where the x's are the
* coordinates.
*
* No terminating newline or | symbol.
*/
public String writeToString() {
StringBuilder builder = new StringBuilder();
float[] denseCoordinates = this.getCoordinates();
for (int i = 0; i < dimension; ++i) {
builder.append(Float.toString(denseCoordinates[i]));
if (i != dimension - 1) {
builder.append("|");
}
}
return builder.toString();
}
@Override
/**
* Writes vector from a string of the form x1|x2|x3| ... where the x's are
* the coordinates.
*/
public void readFromString(String input) {
String[] entries = input.split("\\|");
if (entries.length != dimension) {
throw new IllegalArgumentException("Found " + (entries.length) + " possible coordinates: "
+ "expected " + dimension);
}
if (isSparse) {
coordinates = new float[dimension];
sparseOffsets = null;
isSparse = false;
}
for (int i = 0; i < dimension; ++i) {
coordinates[i] = Float.parseFloat(entries[i]);
}
}
/**
* Automatically translate sparse format (listing of offsets) into full
* float vector.
*
* The sparse vector is in condensed (signed index + 1) representation, and
* is converted to a full float vector by adding -1 or +1 to the location
* (index - 1) according to the sign of the index. (The -1 and +1 are
* necessary because there is no signed version of 0, so we'd have no way of
* telling that that zeroth position in the array should be plus or minus
* 1.)
*/
protected void sparseToDense() {
if (!isSparse) {
return;
}
coordinates = new float[dimension];
for (int i = 0; i < dimension; ++i) {
coordinates[i] = 0;
}
for (int i = 0; i < sparseOffsets.length; ++i) {
coordinates[Math.abs(sparseOffsets[i]) - 1] = Math.signum(sparseOffsets[i]);
}
isSparse = false;
}
/**
* Available to support access to coordinates for legacy operations. Try not
* to use in new code!
* @return
*/
public float[] getCoordinates() {
if (isSparse) {
RealVector copy = this.copy();
copy.sparseToDense();
return copy.coordinates;
} else {
return coordinates;
}
}
/**
* Available for testing and copying. Try not to use in new code!
* @param coordinates
*/
public RealVector(float[] coordinates) {
this.dimension = coordinates.length;
this.coordinates = coordinates;
}
/**
* Available for testing and copying. Try not to use in new code!
* @param dimension
* @param sparseOffsets
*/
public RealVector(int dimension, short[] sparseOffsets) {
this.isSparse = true;
this.dimension = dimension;
for (Short offset : sparseOffsets) {
if ((offset == 0) || (offset > dimension) || (offset < -1 * dimension)) {
throw new IllegalArgumentException("Offsets too large for dimension!");
}
}
this.sparseOffsets = sparseOffsets;
}
}