// ConcatVector.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.loglinear.model; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.loglinear.model.proto.ConcatVectorProto;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.function.Function;

/**
 * Implements a concat vector using an array of arrays, with all its attendant resizing efficiencies and
 * double-pointer inefficiencies. Benchmarking from MinimalML (where I adapted this design from) shows that this is
 * the most efficient of several strategies that can be used to implement this.
 * <p>
 * What is a ConcatVector? Why do I need it?
 * <p>
 * In short, you want this for online learning, where you may not know all your sparse features' sizes at
 * initialization. A concat vector is a vector that behaves like a concatenation of smaller component vectors when you
 * want a dot product. However, it never physically concatenates anything; it just takes the dot product of each
 * component and sums the results. That way, if you need to expand a component during online learning, it's no
 * problem. As an auxiliary benefit, you can specify sparse and dense components, greatly speeding up dot product
 * calculation when you have lots of sparse features.
 * <p>
 * Created on 12/7/14.
 *
 * @author keenon
 */
public class ConcatVector  {

  /** A logger for this class. Currently only referenced by the disabled native-loading block near the bottom. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ConcatVector.class);

  /**
   * One array per component. A null entry is an absent (all-zero) component. A dense component stores its values
   * directly; a sparse (one-hot) component is a two-element array holding {index, value}.
   */
  double[][] pointers;

  /** sparse[i] is true when pointers[i] uses the two-element {index, value} encoding. */
  boolean[] sparse;

  /**
   * copyOnWrite[i] is true when pointers[i] may also be referenced by someone else (another ConcatVector or the
   * caller), so it must be cloned before this vector writes into it.
   */
  boolean[] copyOnWrite;

  /**
   * Constructor that initializes space for this concat vector. Don't worry, it can resize individual elements as
   * necessary but it's most efficient if you get this right at construction.
   *
   * @param numComponents The number of components (usually number of features) to allocate for.
   */
  public ConcatVector(int numComponents) {
    pointers = new double[numComponents][];
    sparse = new boolean[numComponents];
    copyOnWrite = new boolean[numComponents];
  }

  /**
   * Clone a concat vector constructor. Shares every non-null component array with the source and marks the component
   * as copyOnWrite on both vectors, but makes no immediate copies.
   *
   * @param clone the concat vector to clone. Its copyOnWrite flags are updated as a side effect.
   */
  private ConcatVector(ConcatVector clone) {
    pointers = new double[clone.pointers.length][];
    copyOnWrite = new boolean[clone.pointers.length];
    for (int i = 0; i < clone.pointers.length; i++) {
      if (clone.pointers[i] == null) continue;
      pointers[i] = clone.pointers[i];
      // Both owners now reference the same array, so whichever writes first has to copy it
      copyOnWrite[i] = true;
      clone.copyOnWrite[i] = true;
    }
    sparse = new boolean[clone.pointers.length];
    if (clone.pointers.length > 0) {
      System.arraycopy(clone.sparse, 0, sparse, 0, clone.pointers.length);
    }
  }

  /**
   * Creates a ConcatVector whose dimensions are the same as this one for all dense components, but is otherwise
   * completely empty. This is useful to prevent resizing during optimizations where we're adding lots of sparse
   * vectors.
   *
   * @return an empty vector suitable for use as a gradient
   */
  public ConcatVector newEmptyClone() {
    ConcatVector clone = new ConcatVector(getNumberOfComponents());
    for (int i = 0; i < pointers.length; i++) {
      // Sparse and absent components stay null in the clone; only dense ones get pre-sized zero arrays
      if (pointers[i] != null && !sparse[i]) {
        clone.pointers[i] = new double[pointers[i].length];
        clone.sparse[i] = false;
      }
    }
    return clone;
  }

  /**
   * Sets a single component of the concat vector value as a dense vector, growing the vector if the component index
   * is beyond the current size.
   * <p>
   * The array is stored by reference and flagged copy-on-write, not copied eagerly: this vector will clone it before
   * writing to it, so your array is never modified by this vector, but changes you make to the array afterwards remain
   * visible through this vector until it performs such a write. Pass a copy if you intend to keep mutating the array.
   *
   * @param component the index of the component to set
   * @param values    the array of dense values to put into the component
   */
  public void setDenseComponent(int component, double[] values) {
    if (component >= pointers.length) {
      increaseSizeTo(component + 1);
    }
    pointers[component] = values;
    sparse[component] = false;
    // The caller still holds a reference to values, so we must not write into it directly
    copyOnWrite[component] = true;
  }

  /**
   * Sets a single component of the concat vector value as a sparse, one hot value, growing the vector if the
   * component index is beyond the current size.
   *
   * @param component the index of the component to set
   * @param index     the index of the vector to one-hot
   * @param value     the value of that index
   */
  public void setSparseComponent(int component, int index, double value) {
    if (component >= pointers.length) {
      increaseSizeTo(component + 1);
    }
    double[] sparseInfo = new double[2];
    sparseInfo[0] = index;
    sparseInfo[1] = value;
    pointers[component] = sparseInfo;
    sparse[component] = true;
    // Freshly allocated here, so nobody else can be holding this array
    copyOnWrite[component] = false;
  }

  /**
   * This function assumes both vectors are infinitely padded with 0s, so it won't complain if there's a dim mismatch.
   * There are no side effects.
   *
   * @param other the MV to dot product with
   * @return the dot product of this and other
   */
  public double dotProduct(ConcatVector other) {
    if (loadedNative) {
      return dotProductNative(other);
    }

    double sum = 0.0;
    // Components past the end of the shorter vector are multiplied by padding zeros, so they contribute nothing
    int sharedComponents = Math.min(pointers.length, other.pointers.length);
    for (int i = 0; i < sharedComponents; i++) {
      if (pointers[i] == null || other.pointers[i] == null) continue;
      if (sparse[i] && other.sparse[i]) {
        // Two one-hot vectors only overlap when they are hot at the same index
        if ((int) pointers[i][0] == (int) other.pointers[i][0]) {
          sum += pointers[i][1] * other.pointers[i][1];
        }
      } else if (sparse[i] && !other.sparse[i]) {
        int sparseIndex = (int) pointers[i][0];
        if (sparseIndex >= 0 && sparseIndex < other.pointers[i].length) {
          sum += other.pointers[i][sparseIndex] * pointers[i][1];
        }
      } else if (!sparse[i] && other.sparse[i]) {
        int sparseIndex = (int) other.pointers[i][0];
        if (sparseIndex >= 0 && sparseIndex < pointers[i].length) {
          sum += pointers[i][sparseIndex] * other.pointers[i][1];
        }
      } else {
        int sharedLength = Math.min(pointers[i].length, other.pointers[i].length);
        for (int j = 0; j < sharedLength; j++) {
          sum += pointers[i][j] * other.pointers[i][j];
        }
      }
    }
    return sum;
  }

  /**
   * Returns an independent-behaving copy of this vector. The component arrays are initially shared between the two
   * vectors and flagged copy-on-write on both, so an array is only physically copied when one of the vectors first
   * writes to it.
   *
   * @return a clone of this concat vector
   */
  public ConcatVector deepClone() {
    return new ConcatVector(this);
  }

  /**
   * This will add the vector "other" to this vector, scaling other by multiple. In algebra,
   * <p>
   * this = this + (other * multiple)
   * <p>
   * The function assumes that both vectors are padded infinitely with 0s, so will scale this vector by adding components
   * and changing component sizes (dense to bigger dense) and shapes (sparse to dense) in order to accommodate the result.
   * <p>
   * When multiple is exactly 1.0 and this vector has nothing in a component, the component array of other is shared
   * rather than copied, and the component is flagged copy-on-write on both vectors (so other's flags may change).
   *
   * @param other    the vector to add to this one
   * @param multiple the multiple to use
   */
  public void addVectorInPlace(ConcatVector other, double multiple) {
    // Resize if necessary
    if (pointers == null) {
      pointers = new double[other.pointers.length][];
      sparse = new boolean[other.pointers.length];
      copyOnWrite = new boolean[other.pointers.length];
    } else if (pointers.length < other.pointers.length) {
      increaseSizeTo(other.pointers.length);
    }

    // Do the addition piece by piece

    for (int i = 0; i < other.pointers.length; i++) {
      // If the other vector has no segment here, then skip
      if (other.pointers[i] == null) continue;
      // If we previously had no element here, fill it in accordingly
      if (pointers[i] == null || pointers[i].length == 0) {
        sparse[i] = other.sparse[i];
        // If the multiple is one, just follow the copying procedure
        if (multiple == 1.0) {
          pointers[i] = other.pointers[i];
          copyOnWrite[i] = true;
          other.copyOnWrite[i] = true;
        }
        // Otherwise do the standard thing
        else {
          if (other.sparse[i]) {
            pointers[i] = new double[2];
            copyOnWrite[i] = false;
            pointers[i][0] = other.pointers[i][0];
            pointers[i][1] = other.pointers[i][1] * multiple;
          } else {
            pointers[i] = new double[other.pointers[i].length];
            copyOnWrite[i] = false;
            for (int j = 0; j < other.pointers[i].length; j++) {
              pointers[i][j] = other.pointers[i][j] * multiple;
            }
          }
        }
      }
      // Handle rescaling on a component-by-component basis
      else if (sparse[i] && !other.sparse[i]) {
        // sparse + dense: expand ourselves into a dense array big enough for both, then add
        int sparseIndex = (int) pointers[i][0];
        double sparseValue = pointers[i][1];
        sparse[i] = false;
        pointers[i] = new double[Math.max(sparseIndex + 1, other.pointers[i].length)];
        copyOnWrite[i] = false;
        if (sparseIndex >= 0) {
          pointers[i][sparseIndex] = sparseValue;
        }
        for (int j = 0; j < other.pointers[i].length; j++) {
          pointers[i][j] += other.pointers[i][j] * multiple;
        }
      } else if (sparse[i] && other.sparse[i]) {
        int mySparseIndex = (int) pointers[i][0];
        int otherSparseIndex = (int) other.pointers[i][0];
        if (mySparseIndex == otherSparseIndex) {
          // Same hot index: the result is still one-hot, so we can stay sparse
          if (copyOnWrite[i]) {
            pointers[i] = pointers[i].clone();
            copyOnWrite[i] = false;
          }
          pointers[i][1] += other.pointers[i][1] * multiple;
        } else {
          // Different hot indices: the result has two non-zeros, which only a dense array can hold
          sparse[i] = false;
          double mySparseValue = pointers[i][1];
          // Clamp at 0 so that two negative ("nowhere") indices cannot request a negative array length
          pointers[i] = new double[Math.max(0, Math.max(mySparseIndex + 1, otherSparseIndex + 1))];
          copyOnWrite[i] = false;
          if (mySparseIndex >= 0) {
            pointers[i][mySparseIndex] = mySparseValue;
          }
          if (otherSparseIndex >= 0) {
            pointers[i][otherSparseIndex] = other.pointers[i][1] * multiple;
          }
        }
      } else if (!sparse[i] && other.sparse[i]) {
        int sparseIndex = (int) other.pointers[i][0];
        if (sparseIndex >= pointers[i].length) {
          // Grow by doubling; pointers[i].length is at least 1 here because empty arrays took the first branch
          int newSize = pointers[i].length;
          while (newSize <= sparseIndex) newSize *= 2;
          double[] denseBuf = new double[newSize];
          System.arraycopy(pointers[i], 0, denseBuf, 0, pointers[i].length);
          // denseBuf is private to us, so the copy-on-write obligation is already satisfied
          copyOnWrite[i] = false;
          pointers[i] = denseBuf;
        }
        if (sparseIndex >= 0) {
          if (copyOnWrite[i]) {
            pointers[i] = pointers[i].clone();
            copyOnWrite[i] = false;
          }
          pointers[i][sparseIndex] += other.pointers[i][1] * multiple;
        }
      } else {
        assert (!sparse[i] && !other.sparse[i]);
        if (pointers[i].length < other.pointers[i].length) {
          double[] denseBuf = new double[other.pointers[i].length];
          System.arraycopy(pointers[i], 0, denseBuf, 0, pointers[i].length);
          copyOnWrite[i] = false;
          pointers[i] = denseBuf;
        }
        if (copyOnWrite[i]) {
          pointers[i] = pointers[i].clone();
          copyOnWrite[i] = false;
        }
        for (int j = 0; j < other.pointers[i].length; j++) {
          pointers[i][j] += other.pointers[i][j] * multiple;
        }
      }
    }
  }

  /**
   * This will multiply the vector "other" to this vector. It's the equivalent of the Matlab
   * <p>
   * this = this .* other
   * <p>
   * The function assumes that both vectors are padded infinitely with 0s, so will result in lots of 0s in this
   * vector if it is longer than 'other'.
   *
   * @param other the vector to multiply into this one
   */
  public void elementwiseProductInPlace(ConcatVector other) {
    for (int i = 0; i < pointers.length; i++) {
      if (pointers[i] == null) continue;

      // Every branch below writes to (or replaces) this component, so detach from any shared array first
      if (copyOnWrite[i]) {
        copyOnWrite[i] = false;
        pointers[i] = pointers[i].clone();
      }

      if (i >= other.pointers.length) {
        // other is all padding here: zero the values but keep our shape
        if (sparse[i]) {
          pointers[i][1] = 0;
        } else {
          for (int j = 0; j < pointers[i].length; j++) {
            pointers[i][j] = 0;
          }
        }
      } else if (other.pointers[i] == null) {
        // An absent component in other is all zeros, so the product is absent too
        pointers[i] = null;
      } else if (sparse[i] && other.sparse[i]) {
        if ((int) pointers[i][0] == (int) other.pointers[i][0]) {
          pointers[i][1] *= other.pointers[i][1];
        } else {
          pointers[i][1] = 0.0;
        }
      } else if (sparse[i] && !other.sparse[i]) {
        int sparseIndex = (int) pointers[i][0];
        if (sparseIndex >= 0 && sparseIndex < other.pointers[i].length) {
          pointers[i][1] *= other.pointers[i][sparseIndex];
        } else {
          pointers[i][1] = 0.0;
        }
      } else if (!sparse[i] && other.sparse[i]) {
        // dense .* one-hot has at most one non-zero, so the result collapses to a sparse component
        int sparseIndex = (int) other.pointers[i][0];
        double sparseValue = 0.0;
        if (sparseIndex >= 0 && sparseIndex < pointers[i].length) {
          sparseValue = pointers[i][sparseIndex] * other.pointers[i][1];
        }
        sparse[i] = true;
        pointers[i] = new double[]{
            sparseIndex,
            sparseValue
        };
      } else {
        for (int j = 0; j < Math.min(pointers[i].length, other.pointers[i].length); j++) {
          pointers[i][j] *= other.pointers[i][j];
        }
        // Anything of ours beyond other's length is multiplied by padding zeros
        for (int j = other.pointers[i].length; j < pointers[i].length; j++) {
          pointers[i][j] = 0.0;
        }
      }
    }
  }

  /**
   * Apply a function to every stored element of every component of this vector, and replace with the result. For a
   * sparse component only the single stored value is mapped; absent (null) components are skipped.
   *
   * @param fn the function to apply to every element of every component.
   */
  public void mapInPlace(Function<Double, Double> fn) {
    for (int i = 0; i < pointers.length; i++) {
      if (pointers[i] == null) continue;

      if (copyOnWrite[i]) {
        copyOnWrite[i] = false;
        pointers[i] = pointers[i].clone();
      }

      if (sparse[i]) {
        pointers[i][1] = fn.apply(pointers[i][1]);
      } else {
        for (int j = 0; j < pointers[i].length; j++) {
          pointers[i][j] = fn.apply(pointers[i][j]);
        }
      }
    }
  }

  /**
   * @return the number of concatenated vectors that compose this ConcatVector
   */
  public int getNumberOfComponents() {
    return pointers.length;
  }

  /**
   * @param i the index of the component to check
   * @return whether component i is sparse or not
   */
  public boolean isComponentSparse(int i) {
    return sparse[i];
  }

  /**
   * This function will throw an assert if the component you're requesting isn't dense.
   * <p>
   * The returned array is this vector's internal storage, not a copy, and it may be shared with clones of this
   * vector, so treat it as read-only.
   *
   * @param i the index of the component to look at
   * @return the dense array composing that component, or an empty array if the component is absent
   */
  public double[] getDenseComponent(int i) {
    assert (!sparse[i]);
    // This will save the special case code down the line, so is worth the tiny object creation
    if (pointers[i] == null) return new double[0];
    return pointers[i];
  }

  /**
   * This assumes infinite padding with 0s. It will return you 0 if you're OOB (use getSegmentSizes() to check, if
   * that's undesirable behavior), including for negative component or offset values. Otherwise it will return you
   * the correct value.
   *
   * @param component the index of the component to retrieve a value from
   * @param offset    the offset within that component
   * @return the value retrieved, or 0 if OOB
   */
  public double getValueAt(int component, int offset) {
    // Negative coordinates lie outside every component, so they read as padding like any other OOB access
    if (component < 0 || offset < 0 || component >= pointers.length) return 0;

    double[] data = pointers[component];
    if (data == null) return 0;

    if (sparse[component]) {
      // One-hot: only the stored index carries a value
      return ((int) data[0] == offset) ? data[1] : 0;
    }
    return (offset < data.length) ? data[offset] : 0;
  }

  /**
   * Gets you the index of one hot in a component, assuming it is sparse. Throws an assert if it isn't.
   *
   * @param component the index of the sparse component.
   * @return the index of the one-hot value within that sparse component.
   */
  public int getSparseIndex(int component) {
    assert (sparse[component]);
    return (int) pointers[component][0];
  }

  /**
   * Writes the protobuf version of this vector to a stream. reversible with readFromStream().
   *
   * @param stream the output stream to write to
   * @throws IOException passed through from the stream
   */
  public void writeToStream(OutputStream stream) throws IOException {
    getProtoBuilder().build().writeDelimitedTo(stream);
  }

  /**
   * Static function to deserialize a concat vector from an input stream.
   *
   * @param stream the stream to read from, assuming protobuf encoding
   * @return a new concat vector
   * @throws IOException passed through from the stream
   */
  public static ConcatVector readFromStream(InputStream stream) throws IOException {
    return readFromProto(ConcatVectorProto.ConcatVector.parseDelimitedFrom(stream));
  }

  /**
   * Builds the proto form of this vector: one Component per component, carrying its sparse flag and the raw contents
   * of its backing array (no data at all for an absent component).
   *
   * @return a Builder for proto serialization
   */
  public ConcatVectorProto.ConcatVector.Builder getProtoBuilder() {
    ConcatVectorProto.ConcatVector.Builder m = ConcatVectorProto.ConcatVector.newBuilder();
    for (int i = 0; i < pointers.length; i++) {
      ConcatVectorProto.ConcatVector.Component.Builder c = ConcatVectorProto.ConcatVector.Component.newBuilder();
      c.setSparse(sparse[i]);
      // We want to keep the data array size 0 if the pointers for this component is null
      if (pointers[i] != null) {
        for (int j = 0; j < pointers[i].length; j++) {
          c.addData(pointers[i][j]);
        }
      }
      m.addComponent(c);
    }
    return m;
  }

  /**
   * Recreates an in-memory concat vector object from a Proto serialization. The result owns all of its arrays and is
   * fully initialized, so it can be mutated like any other ConcatVector.
   *
   * @param m the concat vector proto
   * @return an in-memory concat vector object
   */
  public static ConcatVector readFromProto(ConcatVectorProto.ConcatVector m) {
    int components = m.getComponentCount();

    // Use the sizing constructor so that copyOnWrite is allocated alongside pointers and sparse; leaving it null
    // would make every mutating method throw a NullPointerException on a deserialized vector.
    ConcatVector vec = new ConcatVector(components);
    for (int i = 0; i < components; i++) {
      ConcatVectorProto.ConcatVector.Component c = m.getComponent(i);
      vec.sparse[i] = c.getSparse();
      int dataSize = c.getDataCount();
      // A sparse component needs both {index, value}. One written without data was an absent (null) component at
      // serialization time, so restore it as absent instead of as an unreadable zero-length sparse array.
      if (vec.sparse[i] && dataSize < 2) continue;
      vec.pointers[i] = new double[dataSize];
      for (int j = 0; j < dataSize; j++) {
        vec.pointers[i][j] = c.getData(j);
      }
    }

    return vec;
  }

  /**
   * Compares two concat vectors by value. This means that we're 0 padding, so a dense and sparse component might
   * both be considered the same, if the dense array reflects the same value as the sparse array. This is pretty much
   * only useful for testing. Since it's primarily for testing, we went with the slower, more obviously correct design.
   *
   * @param other     the vector we're comparing to
   * @param tolerance the amount any pair of values can differ before we say the two vectors are different.
   * @return whether the two vectors are the same
   */
  public boolean valueEquals(ConcatVector other, double tolerance) {
    for (int i = 0; i < Math.max(pointers.length, other.pointers.length); i++) {
      // size is how many leading offsets of component i could hold a non-zero in either vector
      int size = 0;
      // Find the maximum non-zero element in this component
      if (i < pointers.length && i < other.pointers.length && pointers[i] == null && other.pointers[i] == null) {
        size = 0;
      } else if (i >= pointers.length || (i < pointers.length && pointers[i] == null)) {
        // We have nothing here, so only other's extent matters
        if (i >= other.pointers.length) {
          size = 0;
        } else if (other.sparse[i]) {
          size = other.getSparseIndex(i) + 1;
        } else {
          size = other.pointers[i].length;
        }
      } else if (i >= other.pointers.length || (i < other.pointers.length && other.pointers[i] == null)) {
        // other has nothing here, so only our extent matters
        if (i >= pointers.length) {
          size = 0;
        } else if (sparse[i]) {
          size = getSparseIndex(i) + 1;
        } else {
          size = pointers[i].length;
        }
      } else {
        // Both present: take the larger of the two extents
        if (sparse[i] && getSparseIndex(i) >= size) size = getSparseIndex(i) + 1;
        else if (!sparse[i] && pointers[i].length > size) size = pointers[i].length;
        if (other.sparse[i] && other.getSparseIndex(i) >= size) size = other.getSparseIndex(i) + 1;
        else if (!other.sparse[i] && other.pointers[i].length > size) size = other.pointers[i].length;
      }

      for (int j = 0; j < size; j++) {
        if (Math.abs(getValueAt(i, j) - other.getValueAt(i, j)) > tolerance) return false;
      }
    }
    return true;
  }

  /**
   * Renders every component between ".." markers: absent components as "0=0.0", sparse components as "index=value",
   * and dense components as their space-separated values.
   *
   * @return a human-readable dump of this vector
   */
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("[");
    for (int i = 0; i < pointers.length; i++) {
      sb.append(" ..");
      if (pointers[i] == null) {
        sb.append("0=0.0");
      } else if (sparse[i]) {
        sb.append((int) pointers[i][0]).append("=").append(pointers[i][1]);
      } else {
        for (int j = 0; j < pointers[i].length; j++) {
          sb.append(pointers[i][j]);
          if (j != pointers[i].length - 1) sb.append(" ");
        }
      }
      sb.append("..");
    }
    sb.append(" ]");
    return sb.toString();
  }

  ////////////////////////////////////////////////////////////////////////////
  // PRIVATE IMPLEMENTATION
  ////////////////////////////////////////////////////////////////////////////

  /**
   * This increases the length of the vector, while preserving its contents. The three parallel arrays (pointers,
   * sparse, copyOnWrite) are always grown together so they stay the same length.
   *
   * @param newSize the new size to increase to. Must be larger than the current size
   */
  private void increaseSizeTo(int newSize) {
    assert (newSize > pointers.length);
    double[][] pointersBuf = new double[newSize][];
    boolean[] sparseBuf = new boolean[newSize];
    boolean[] copyOnWriteBuf = new boolean[newSize];
    System.arraycopy(pointers, 0, pointersBuf, 0, pointers.length);
    System.arraycopy(sparse, 0, sparseBuf, 0, pointers.length);
    System.arraycopy(copyOnWrite, 0, copyOnWriteBuf, 0, pointers.length);
    pointers = pointersBuf;
    sparse = sparseBuf;
    copyOnWrite = copyOnWriteBuf;
  }

  /** Whether dotProduct() should delegate to the native implementation. Never set to true by the code in this file. */
  static boolean loadedNative = false;

  // Right now I'm not loading the native library even if it's available, since the dot product "speedup" is actually
  // 10x slower. First need to diagnose if a speedup is possible by going through the JNI, which is unlikely.

    /*
    static {
        try {
            System.load(System.getProperty("user.dir")+"/src/main/c/libconcatvec.so");
            loadedNative = true;
        }
        catch (UnsatisfiedLinkError e) {
            log.info("Couldn't find the native acceleration library for ConcatVector");
        }
    }
    */

  /** Native dot product; only invoked from dotProduct() when loadedNative is true. */
  private native double dotProductNative(ConcatVector other);

  /**
   * DO NOT USE. FOR SERIALIZERS ONLY. Leaves pointers, sparse and copyOnWrite unallocated (null), so whoever uses
   * this must populate all three before the vector is usable.
   */
  private ConcatVector() {
  }
}