MahoutVectorConverter.java example

Explorer
sensim-master
- src
  - main
    - java
      - dima
        CosineDistancePigFunction.java
        MahoutVectorConverter.java
        MatrixMarketStorageWithCounts.java
        UIMAConverterHelper.java
        UIMAXMLConverterHelper.java
      - sensim
        CoreNLPAnnotator.java
        FeatureExtractor.java
        IndexToIndexMapper.java
        JsonLoader.java
        NounPairLabeler.java
        PairSentenceSelector.java
        PukwacReader.java
package dima;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

import java.io.IOException;
import java.util.List;

// This code is taken from the Elephant Bird toolset; it was originally the class
// com/twitter/elephantbird/pig/mahout/VectorWritableConverter.java


/**
 * Supports conversion between Pig {@link Tuple} and Mahout {@link Vector}. See
 * {@link #MahoutVectorConverter(String[])} for recognized options.
 *
 * <p>
 * This class was adapted from Elephant Bird's {@code VectorWritableConverter}; the Pig examples
 * below are retained from that class and therefore still refer to the Elephant Bird class names.
 *
 * <p>
 * The Pig Tuple schema used to represent vector data is determined by the specified options. For
 * instance, if options {@code -dense} and {@code -cardinality 2} are specified, the following
 * schema is assumed:
 *
 * <pre>
 * (double, double)
 * </pre>
 *
 * If option {@code -sparse} is specified without option {@code -cardinality}, the following schema
 * is assumed:
 *
 * <pre>
 * (cardinality: int, entries: {entry: (index: int, value: double)})
 * </pre>
 *
 * If options {@code -sparse} and {@code -cardinality} are both specified, the following schema is
 * assumed:
 *
 * <pre>
 * (entries: {entry: (index: int, value: double)})
 * </pre>
 *
 * Otherwise, no schema is assumed, and organization of vector Tuple data is inspected at runtime.
 *
 * <p>
 * Example usage:
 *
 * <pre>
 * %declare SEQFILE_LOADER 'com.twitter.elephantbird.pig.load.SequenceFileLoader';
 * %declare SEQFILE_STORAGE 'com.twitter.elephantbird.pig.store.SequenceFileStorage';
 * %declare INT_CONVERTER 'com.twitter.elephantbird.pig.util.IntWritableConverter';
 * %declare VECTOR_CONVERTER 'com.twitter.elephantbird.pig.mahout.VectorWritableConverter';
 *
 * -- store DenseVector
 * pair = LOAD '$data' AS (key: int, val: (v1: double, v2: double));
 * STORE pair INTO '$output' USING $SEQFILE_STORAGE (
 *   '-c $INT_CONVERTER', '-c $VECTOR_CONVERTER'
 * );
 *
 * -- store DenseVector with floats
 * pair = LOAD '$data' AS (key: int, val: (v1: float, v2: float));
 * STORE pair INTO '$output' USING $SEQFILE_STORAGE (
 *   '-c $INT_CONVERTER', '-c $VECTOR_CONVERTER -- -floatPrecision'
 * );
 *
 * -- store RandomAccessSparseVector data
 * pair = LOAD '$data' AS (key: int, val: (cardinality: int, entries: {entry: (index: int, value: double)}));
 * STORE pair INTO '$output' USING $SEQFILE_STORAGE (
 *   '-c $INT_CONVERTER', '-c $VECTOR_CONVERTER'
 * );
 *
 * -- store SequentialAccessSparseVector data with the -sequential flag
 * pair = LOAD '$data' AS (key: int, val: (cardinality: int, entries: {entry: (index: int, value: double)}));
 * STORE pair INTO '$output' USING $SEQFILE_STORAGE (
 *   '-c $INT_CONVERTER', '-c $VECTOR_CONVERTER -- -sequential'
 * );
 *
 * -- load DenseVector data, specifying schema manually
 * pair = LOAD '$data' USING $SEQFILE_LOADER (
 *   '-c $INT_CONVERTER', '-c $VECTOR_CONVERTER'
 * ) AS (key: int, val: (f1: double, f2: double, f3: double));
 *
 * -- load DenseVector data with known cardinality; Schema defined by SequenceFileLoader
 * pair = LOAD '$data' USING $SEQFILE_LOADER (
 *   '-c $INT_CONVERTER', '-c $VECTOR_CONVERTER -- -dense -cardinality 2'
 * );
 *
 * -- load *SparseVector data; Schema defined by SequenceFileLoader
 * pair = LOAD '$data' USING $SEQFILE_LOADER (
 *   '-c $INT_CONVERTER', '-c $VECTOR_CONVERTER -- -sparse'
 * );
 *
 * -- load *SparseVector data with known cardinality; Schema defined by SequenceFileLoader
 * pair = LOAD '$data' USING $SEQFILE_LOADER (
 *   '-c $INT_CONVERTER', '-c $VECTOR_CONVERTER -- -sparse -cardinality 2'
 * );
 * </pre>
 *
 * Note that conversion of vector data is also supported in cases where it may be desirable to
 * process data in sparse Pig vector format, but input Mahout vector data is actually dense:
 *
 * <pre>
 * pair = LOAD '$dense_vectors' USING $SEQFILE_LOADER (
 *   '-c $INT_CONVERTER', '-c $VECTOR_CONVERTER -- -sparse'
 * );
 * </pre>
 *
 * @author Andy Schlaikjer
 */
public class MahoutVectorConverter {

    // Long option names recognized on construction (passed as e.g. "-cardinality 2").
    private static final String CARDINALITY_PARAM = "cardinality";
    private static final String DENSE_PARAM = "dense";
    private static final String SPARSE_PARAM = "sparse";
    private static final String SEQUENTIAL_PARAM = "sequential";
    private static final String FLOAT_PRECISION_PARAM = "floatPrecision";

    private final TupleFactory tupleFactory = TupleFactory.getInstance();
    private final BagFactory bagFactory = BagFactory.getInstance();

    /** True when the dense tuple/vector representation was requested. */
    private final boolean dense;
    /** True when the sparse representation was requested, either directly or via -sequential. */
    private final boolean sparse;
    /** Expected vector cardinality, or {@code null} when none was specified. */
    private final Integer cardinality;
    /** True when sparse vectors should be produced as {@link SequentialAccessSparseVector}. */
    private final boolean sequential;
    /** True when tuple values should be emitted as floats rather than doubles. */
    private final boolean floatPrecision;

    /**
     * Creates a converter with default options (no arguments).
     *
     * @throws ParseException if option parsing fails
     */
    public MahoutVectorConverter() throws ParseException {
        this(new String[] {});
    }

    /**
     * The following options are recognized:
     *
     * <dl>
     * <dt>{@code -cardinality n}</dt>
     * <dd>Vectors are expected to have cardinality {@code n}.</dd>
     * <dt>{@code -dense}</dt>
     * <dd>Vectors are expected to be dense. This option and options {@code -sparse},
     * {@code -sequential} are mutually exclusive.</dd>
     * <dt>{@code -sparse}</dt>
     * <dd>Vectors are expected to be sparse. This option and option {@code -dense} are mutually
     * exclusive.</dd>
     * <dt>{@code -sequential}</dt>
     * <dd>Sparse vector data should be stored using {@link SequentialAccessSparseVector}. This option
     * and option {@code -dense} are mutually exclusive.</dd>
     * <dt>{@code -floatPrecision}</dt>
     * <dd>Vector data should be loaded/stored using float precision.</dd>
     * </dl>
     *
     * @param args options, e.g. as passed in from {@code SequenceFileLoader}; must not be null.
     * @throws ParseException if the arguments cannot be parsed
     * @throws NumberFormatException if the cardinality value is not an integer
     * @throws IllegalStateException if mutually exclusive options are combined or the cardinality
     *         is negative
     */
    public MahoutVectorConverter(String[] args) throws ParseException {
        Preconditions.checkNotNull(args);
        CommandLine cmdline = parseArguments(args);

        cardinality = cmdline.hasOption(CARDINALITY_PARAM)
                ? Integer.valueOf(cmdline.getOptionValue(CARDINALITY_PARAM))
                : null;
        // A negative cardinality can never match a vector size and would only fail later when
        // the vector is allocated, so reject it here.
        Preconditions.checkState(cardinality == null || cardinality >= 0,
                "Option '-cardinality' must be non-negative but was %s", cardinality);

        dense = cmdline.hasOption(DENSE_PARAM);
        sequential = cmdline.hasOption(SEQUENTIAL_PARAM);
        Preconditions.checkState(!(dense && sequential),
                "Options '-dense' and '-sequential' are mutually exclusive");
        // -sequential implies the sparse representation
        sparse = cmdline.hasOption(SPARSE_PARAM) || sequential;
        Preconditions.checkState(!(dense && sparse),
                "Options '-dense' and '-sparse' are mutually exclusive");
        floatPrecision = cmdline.hasOption(FLOAT_PRECISION_PARAM);
    }

    /** Parses {@code args} against the options returned by {@link #getOptions()}. */
    private CommandLine parseArguments(String[] args) throws ParseException {
        return new GnuParser().parse(getOptions(), args);
    }

    /**
     * @return Options to parse from {@code String[] args} on construction.
     */
    @SuppressWarnings("static-access")
    protected Options getOptions() {
        Options options = new Options();
        options.addOption(OptionBuilder
                .withLongOpt(CARDINALITY_PARAM)
                .hasArg()
                .withArgName("n")
                .withDescription("Expected cardinality of vector data.")
                .create());
        options.addOption(OptionBuilder
                .withLongOpt(DENSE_PARAM)
                .withDescription(
                        "If specified along with cardinality, reported LOAD schema will be dense.")
                .create());
        options.addOption(OptionBuilder
                .withLongOpt(SPARSE_PARAM)
                .withDescription("If specified, reported LOAD schema will be sparse.")
                .create());
        options.addOption(OptionBuilder
                .withLongOpt(SEQUENTIAL_PARAM)
                .withDescription(
                        "If specified, Pig vector data will be converted to"
                                + " SequentialAccessSparseVector data on STORE."
                                + " Otherwise, RandomAccessSparseVector is used.")
                .create());
        options.addOption(OptionBuilder
                .withLongOpt(FLOAT_PRECISION_PARAM)
                .withDescription("If specified, float precision will be used when writing output data.")
                .create());
        return options;
    }

    /**
     * Converts a Mahout vector to its Pig tuple representation.
     *
     * <p>
     * The representation follows the vector's own density unless the opposite representation was
     * requested on construction ({@code -sparse} for a dense vector, {@code -dense} for a sparse
     * one).
     *
     * @param v vector to convert; must not be null.
     * @param schema not used by this implementation.
     * @return dense or sparse tuple representation of {@code v}.
     * @throws IllegalStateException if a cardinality was configured and differs from the size of
     *         {@code v}.
     * @throws IOException declared for callers; not thrown directly by this method.
     */
    protected Tuple toTuple(Vector v, ResourceFieldSchema schema) throws IOException {
        Preconditions.checkNotNull(v, "Vector is null");

        // check cardinality
        int size = v.size();
        if (cardinality != null) {
            Preconditions.checkState(cardinality == size,
                    "Expecting cardinality %s but found cardinality %s", cardinality, size);
        }

        // keep the vector's own density unless the client requested the other representation
        boolean denseOutput = v.isDense() ? !sparse : dense;
        return denseOutput ? toDenseVectorTuple(v) : toSparseVectorTuple(v);
    }

    /**
     * Builds a dense tuple with one numeric field per vector element, in element order.
     *
     * @param v vector to convert.
     * @return tuple of {@code Float} fields when float precision is enabled, otherwise of
     *         {@code Double} fields.
     */
    protected Tuple toDenseVectorTuple(Vector v) {
        List<Number> values = Lists.newArrayListWithCapacity(v.size());
        for (Element e : v.all()) {
            values.add(boxValue(e.get()));
        }
        return tupleFactory.newTupleNoCopy(values);
    }

    /**
     * Builds a sparse tuple from the non-zero elements of the vector.
     *
     * <p>
     * Each bag entry is an {@code (index, value)} tuple. When a cardinality was configured the
     * result is {@code (entries)}; otherwise it is {@code (cardinality, entries)} using the size
     * of {@code v}. The outer tuple is backed by an immutable list.
     *
     * @param v vector to convert.
     * @return sparse tuple representation of {@code v}.
     */
    protected Tuple toSparseVectorTuple(Vector v) {
        DataBag bag = bagFactory.newDefaultBag();
        for (Element e : v.nonZeroes()) {
            bag.add(tupleFactory.newTupleNoCopy(
                    Lists.<Number> newArrayList(Integer.valueOf(e.index()), boxValue(e.get()))));
        }
        if (cardinality != null) {
            return tupleFactory.newTupleNoCopy(ImmutableList.<Object> of(bag));
        }
        return tupleFactory.newTupleNoCopy(
                ImmutableList.<Object> of(Integer.valueOf(v.size()), bag));
    }

    /**
     * Boxes a vector value as {@code Float} when float precision is enabled, else as
     * {@code Double}.
     *
     * <p>
     * This is deliberately not written as {@code floatPrecision ? (float) x : x}: that conditional
     * promotes the float operand back to double, so the result would always be boxed as
     * {@code Double}.
     */
    private Number boxValue(double value) {
        if (floatPrecision) {
            return Float.valueOf((float) value);
        }
        return Double.valueOf(value);
    }

    /**
     * Converts a Pig tuple to a Mahout vector.
     *
     * <p>
     * Tuples shaped {@code (bag)} or {@code (int, bag)} are treated as sparse vector data; any
     * other tuple is treated as dense vector data and must contain only numeric fields.
     *
     * @param value tuple to convert; must not be null.
     * @return the converted vector.
     * @throws IOException if the tuple data fails validation.
     */
    protected Vector toVector(Tuple value) throws IOException {
        Preconditions.checkNotNull(value, "Tuple is null");
        if (isSparseVectorData(value)) {
            return convertSparseVectorDataToVector(value);
        }
        validateDenseVectorData(value);
        return convertDenseVectorDataToVector(value);
    }

    /**
     * @return true if the tuple is shaped {@code (bag)} or {@code (int, bag)}.
     */
    private static boolean isSparseVectorData(Tuple value) throws IOException {
        assertNotNull(value, "Tuple is null");
        int fields = value.size();
        if (fields == 1) {
            return DataType.BAG == value.getType(0);
        }
        if (fields == 2) {
            return DataType.INTEGER == value.getType(0) && DataType.BAG == value.getType(1);
        }
        return false;
    }

    /**
     * Builds a vector from sparse tuple data shaped {@code (entries)} or
     * {@code (cardinality, entries)}.
     *
     * <p>
     * A cardinality configured on construction takes precedence over the one in the tuple.
     * Entries whose index falls outside {@code [0, size)} are skipped silently.
     */
    private Vector convertSparseVectorDataToVector(Tuple value) throws IOException {
        // determine output vector size and fetch bag containing entries from input
        int size;
        DataBag entries;
        if (value.size() == 2) {
            // cardinality defined by input
            size = (Integer) value.get(0);
            if (cardinality != null) {
                // cardinality defined by this converter instance takes precedence
                size = cardinality;
            }
            entries = (DataBag) value.get(1);
        } else {
            Preconditions.checkNotNull(cardinality, "Cardinality is undefined");
            size = cardinality;
            entries = (DataBag) value.get(0);
        }

        // create vector, allowing conversion of sparse input vector data to dense output vector
        Vector v;
        if (dense) {
            // TODO(Andy Schlaikjer): Test for OOM before it happens
            v = new DenseVector(size);
        } else {
            // more efficient to build sparse vector with this impl
            v = new RandomAccessSparseVector(size);
        }

        // populate vector
        for (Tuple entry : entries) {
            validateSparseVectorEntryData(entry);
            int index = (Integer) entry.get(0);
            // skip entries whose index is out of bounds for the output vector
            if (index < 0 || index >= size) {
                continue;
            }
            v.setQuick(index, ((Number) entry.get(1)).doubleValue());
        }

        // convert to (sparse) sequential vector if requested
        if (sequential) {
            v = new SequentialAccessSparseVector(v);
        }
        return v;
    }

    /** Checks that a sparse entry is a non-null {@code (int, numeric)} tuple. */
    private static void validateSparseVectorEntryData(Tuple value) throws IOException {
        assertNotNull(value, "Tuple is null");
        assertTupleLength(2, value.size(), "tuple");
        assertFieldTypeEquals(DataType.INTEGER, value.getType(0), "tuple[0]");
        assertFieldTypeIsNumeric(value.getType(1), "tuple[1]");
    }

    /** Checks that every field of a dense vector tuple has a numeric type. */
    private static void validateDenseVectorData(Tuple value) throws IOException {
        assertNotNull(value, "Tuple is null");
        for (int i = 0; i < value.size(); ++i) {
            assertFieldTypeIsNumeric(value.getType(i), "tuple[" + i + "]");
        }
    }

    /**
     * Builds a vector from dense tuple data (one numeric field per element).
     *
     * <p>
     * When a cardinality was configured it defines the vector size: extra tuple fields are
     * dropped and missing positions are left at zero.
     */
    private Vector convertDenseVectorDataToVector(Tuple value) throws IOException {
        // cardinality specified on construction overrides the tuple's own length
        int fieldCount = value.size();
        int size = cardinality != null ? cardinality : fieldCount;
        int copyCount = Math.min(fieldCount, size);

        // allow conversion of dense vector data to sparse vector
        if (sparse) {
            // this ctor used to pre-alloc space for entries
            Vector v = new RandomAccessSparseVector(size, size);
            for (int i = 0; i < copyCount; ++i) {
                v.setQuick(i, ((Number) value.get(i)).doubleValue());
            }
            return v;
        }

        double[] values = new double[size];
        for (int i = 0; i < copyCount; ++i) {
            values[i] = ((Number) value.get(i)).doubleValue();
        }
        // this ctor uses values directly, no copying performed
        return new DenseVector(values, true);
    }

    /** Throws an {@link IOException} with the formatted message if {@code value} is null. */
    private static void assertNotNull(Object value, String msg, Object... values) throws IOException {
        if (value == null) {
            throw new IOException(String.format(msg, values));
        }
    }

    /** Throws an {@link IOException} if the observed Pig type differs from the expected one. */
    private static void assertFieldTypeEquals(byte expected, byte observed, String fieldName)
            throws IOException {
        if (expected != observed) {
            throw new IOException(String.format("Expected %s of type '%s' but found type '%s'",
                    fieldName, DataType.findTypeName(expected), DataType.findTypeName(observed)));
        }
    }

    /** Throws an {@link IOException} unless the observed Pig type is int, long, float or double. */
    private static void assertFieldTypeIsNumeric(byte observed, String fieldName) throws IOException {
        switch (observed) {
            case DataType.INTEGER:
            case DataType.LONG:
            case DataType.FLOAT:
            case DataType.DOUBLE:
                break;
            default:
                throw new IOException(String.format("Expected %s of numeric type but found type '%s'",
                        fieldName, DataType.findTypeName(observed)));
        }
    }

    /** Throws an {@link IOException} if the observed length differs from the expected one. */
    private static void assertTupleLength(int expected, int observed, String fieldName)
            throws IOException {
        if (expected != observed) {
            throw new IOException(String.format("Expected %s of length %s but found length %s",
                    fieldName, expected, observed));
        }
    }
}