ValenceSpreader.java example

Explorer
Foundry-master
- Components

package gov.sandia.cognition.text.algorithm;

import gov.sandia.cognition.learning.algorithm.minimization.matrix.ConjugateGradientMatrixSolver;
import gov.sandia.cognition.learning.algorithm.semisupervised.valence.MultipartiteValenceMatrix;
import gov.sandia.cognition.learning.data.DefaultInputOutputPair;
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.util.Pair;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * This class serves as a wrapper for the MultipartiteValenceMatrix class to
 * simplify the interface for the most common valence task: Ranking a set of
 * documents based on a small set of scored documents and/or a set of scored
 * terms.
 *
 * This algorithm only works when there are some negative scores and some
 * positive scores. However, some datasets (such as ANEW) score from [0 ... 10]
 * or similar. If your labels are like ANEW (with non-balanced scores on a
 * positive/negative scale), you can call centerWeightsRange to make sure there
 * are some negative and some positive scores.
 *
 * Note that this class also serves as an example of how to call
 * MultipartiteValenceMatrix for if you have a different application and just
 * want to see how it's done.
 *
 * @author jdwendt
 */
public class ValenceSpreader<TermType extends Comparable<TermType>, DocIdType extends Comparable<DocIdType>>
{

    /**
     * The (possibly empty) set of terms and their scores passed in by the
     * calling system. Note that either this or weightedDocuments should be
     * non-empty before calling solve.
     */
    private Map<TermType, Pair<Double, Double>> weightedTerms;

    /**
     * The (possibly empty) set of documents and their scores passed in by the
     * calling system. Note that either this or weightedTerms should be
     * non-empty before calling solve.
     */
    private Map<DocIdType, Pair<Double, Double>> weightedDocuments;

    /**
     * The set of documents to rank. The key is the document id. The value is a
     * map with each term in the document is the key and the score is the value
     * (can be passed in as a binary 1/0, TF, TF-IDF, etc.).
     */
    private Map<DocIdType, Map<TermType, Double>> documents;

    /**
     * The accuracy required before declaring that the iterative solver has
     * found the solution.
     */
    private double tolerance;

    /**
     * The number of threads to do for the matrix/vector multiply in the
     * iterative solver. Note that more threads is not necessarily better. In
     * some small problems a single thread or two threads are far better than
     * four threads.
     */
    private int numThreads;

    /**
     * Creates an empty valence spreader. After initialization, documents and
     * some set of scores must be passed in.
     */
    public ValenceSpreader()
    {
        weightedTerms = new HashMap<TermType, Pair<Double, Double>>();
        weightedDocuments = new HashMap<DocIdType, Pair<Double, Double>>();
        documents = new HashMap<DocIdType, Map<TermType, Double>>();
        tolerance = 1e-5;
        numThreads = 2;
    }

    /**
     * Specifies how many threads to use in the matrix/vector multiplies in the
     * iterative solver. Note that more threads is not necessarily better. On
     * many small tests (<100 documents) a single thread has been best. We've
     * run up to several million entries in the matrix (including documents and
     * terms) with only 10-ish threads.
     *
     * Note that you don't need to call this method before solving as it's
     * initialized to a reasonable number of threads (2).
     *
     * @param numThreads The number of threads to use
     */
    public void setNumThreads(int numThreads)
    {
        if (numThreads <= 0)
        {
            throw new IllegalArgumentException("Unable to set the number of "
                + "threads to less than 1");
        }
        this.numThreads = numThreads;
    }

    /**
     * The tolerance that between-iteration error must be below before
     * considering the iterative solver "done". This essentially maps to the L-2
     * error of the result and inversely correlates with how long it takes for
     * the solver to complete. We initialize this to 1e-5, but you can alter
     * that.
     *
     * @param tolerance The error must go below this before the solver completes
     */
    public void setIterativeSolverTolerance(double tolerance)
    {
        if (tolerance <= 0)
        {
            throw new IllegalArgumentException("Unable to set the tolerance "
                + "to a value less than or equal to zero.");
        }
        this.tolerance = tolerance;
    }

    /**
     * Adds the input term with its associated score. Note that this term/score
     * pair will only be used when solving for the system if some document uses
     * that term at least once.
     *
     * @param term The term with the associated score
     * @param score The score for the input term
     */
    public void addWeightedTerm(TermType term,
        double score)
    {
        // This just gives all terms a default trust of 1
        // Note that since trust only matters relative to other trusts (and 
        // that it be positive/non-zero), this just says "trust all scores the 
        // same".
        addWeightedTerm(term, score, 1);
    }

    /**
     * Adds the input term with its associated score and trust level. Note that
     * this term/score/trust tuple will only be used when solving for if some
     * document uses that term at least once.
     *
     * @param term The term with its associated values
     * @param score The score for the input term
     * @param trust The amount to trust the input score. Should be greater than
     * 0. The importance here is how this score ranks relative to the other
     * scores input.
     */
    public void addWeightedTerm(TermType term,
        double score,
        double trust)
    {
        if (trust <= 0)
        {
            throw new IllegalArgumentException("Trust must be greater than 0.  "
                + "Input: " + trust);
        }
        weightedTerms.put(term,
            new DefaultInputOutputPair<Double, Double>(score, trust));
    }

    /**
     * Adds the input documentId with its associated score. Note that this
     * documentId/score will only be used when solving if a document was added
     * with this ID.
     *
     * @param documentId The document id that refers to a document added via one
     * of the addDocumentTerm* methods.
     * @param score The score for the input document
     */
    public void addWeightedDocument(DocIdType documentId,
        double score)
    {
        // This just gives all documents a default trust of 1
        // Note that since trust only matters relative to other trusts (and 
        // that it be positive/non-zero), this just says "trust all scores the 
        // same".
        addWeightedDocument(documentId, score, 1);
    }

    /**
     * Adds the input documentId with its associated score/trust. Note that this
     * will only be used when solving if a document was added with the input ID.
     *
     * @param documentId The document id that refers to a document added via one
     * of the addDocumentTerm* methods.
     * @param score The score for the input document
     * @param trust The amount to trust the input score (should be greater than
     * 0). This only matters in relation to other trust scores -- higher scores
     * are trusted more.
     */
    public void addWeightedDocument(DocIdType documentId,
        double score,
        double trust)
    {
        if (trust <= 0)
        {
            throw new IllegalArgumentException("Trust must be greater than 0.  "
                + "Input: " + trust);
        }
        weightedDocuments.put(documentId,
            new DefaultInputOutputPair<Double, Double>(score, trust));
    }

    /**
     * Adds the input document with all of the input terms in the data. Note
     * that this method and addDocumentTermWeights should be mutually exclusive
     * methods: It doesn't make sense to add one document via this method and
     * another via the other.
     *
     * @param documentId The unique ID for this document. If the same id is used
     * more than once, the earlier data will be replaced with the new data.
     * @param terms The set of terms that occur in the document
     */
    public void addDocumentTermOccurrences(DocIdType documentId,
        Set<TermType> terms)
    {
        Map<TermType, Double> document = new HashMap<TermType, Double>(
            terms.size());
        for (TermType term : terms)
        {
            document.put(term, 1.0);
        }
        documents.put(documentId, document);
    }

    /**
     * Adds the input document with all of the input terms with their input
     * scores (should be greater than 0) to the data. Note that this method and
     * addDocumentTermOccurrences should be mutually exclusive methods: It
     * doesn't make sense to add one document via this method and another via
     * the other.
     *
     * @param documentId The unique ID for this document. If the same id is used
     * more than once, the earlier data will be replaced with the new data.
     * @param terms The set of terms and their associated scores from this
     * document (score can be TF, TF-IDF, etc.)
     */
    public void addDocumentTermWeights(DocIdType documentId,
        Map<TermType, Double> terms)
    {
        documents.put(documentId, new HashMap<TermType, Double>(terms));
    }

    /**
     * Simple helper method that takes an input map of scores and centers the
     * map's values' first elements around zero. The centering is done by
     * remapping current min to -1 and current max to +1 (versus, centering so
     * that the mean is 0).
     *
     * @param m The map to recenter
     */
    private static <Type> void centerMap(Map<Type, Pair<Double, Double>> m)
    {
        double min, max;
        min = Double.MAX_VALUE;
        max = Double.MIN_VALUE;
        for (Pair<Double, Double> p : m.values())
        {
            min = Math.min(p.getFirst(), min);
            max = Math.max(p.getFirst(), max);
        }
        double mult = 2.0 / (max - min);
        Set<Map.Entry<Type, Pair<Double, Double>>> entries = m.entrySet();
        for (Map.Entry<Type, Pair<Double, Double>> e : entries)
        {
            m.put(e.getKey(), new DefaultInputOutputPair<Double, Double>(
                (e.getValue().getFirst() - min) * mult - 1,
                e.getValue().getSecond()));
        }
    }

    /**
     * This algorithm only works when there are some negative scores and some
     * positive scores. However, some datasets (such as ANEW) score from [0 ...
     * 10] or similar. This recenters both the term scores and document scores
     * to go from -1 to 1. Note that the two sets of scores are centered
     * independently, so if you want to have only positive term scores and only
     * negative document scores, don't call this method.
     */
    public void centerWeightsRange()
    {
        centerMap(weightedTerms);
        centerMap(weightedDocuments);
    }

    /**
     * This method solves the system of equations to determine the valence for
     * all documents input and for all terms in those documents. Before callig
     * this method, you should call an addDocumentTerm* method multiple times
     * for all of the documents and call addWeighted* with some positive and
     * negative values passed in. Optionally (if your positive and negative
     * values are all numerically positive) you should call centerWeightsRange
     * also before calling this method.
     *
     * This version uses the default power of 10. This has generally worked well
     * in previous experiments.
     *
     * @return The results of spreading the valence -- The term weights can be
     * used in the future as a classifier; the document weights can be used
     * independently to identify which documents are most extreme on either end.
     */
    public Result<TermType, DocIdType> spreadValence()
    {
        // 10 has been shown to be a good power for most of the text/valence spreading we've done thus far
        return spreadValence(10);
    }

    /**
     * This method solves the system of equations to determine the valence for
     * all documents input and for all terms in those documents. Before callig
     * this method, you should call an addDocumentTerm* method multiple times
     * for all of the documents and call addWeighted* with some positive and
     * negative values passed in. Optionally (if your positive and negative
     * values are all numerically positive) you should call centerWeightsRange
     * also before calling this method.
     *
     * @param power This correlates with how far to spread the influence of the
     * scored values. A power of 0 (not permitted) won't spread at all. A power
     * of 1 will only spread scores from a document to their terms or from terms
     * to their documents. It correlates with the distance of the spread, but
     * does not match it perfectly. In our experience, 10 has been a rather good
     * number for this parameter.
     * @return The results of spreading the valence -- The term weights can be
     * used in the future as a classifier; the document weights can be used
     * independently to identify which documents are most extreme on either end.
     */
    public Result<TermType, DocIdType> spreadValence(int power)
    {
        if (power <= 0)
        {
            throw new IllegalArgumentException("Unable to work with "
                + "non-positive power: " + power);
        }

        int numDocs = documents.size();
        // First get all of the terms in all of the documents
        Set<TermType> allTerms = new HashSet<TermType>();
        for (Map<TermType, Double> document : documents.values())
        {
            allTerms.addAll(document.keySet());
        }
        int numTerms = allTerms.size();
        // Now, put them in some deterministic order
        List<TermType> orderedTerms = new ArrayList<TermType>(allTerms);
        // (I use alphabetical ordering because it's convenient)
        Collections.sort(orderedTerms);
        // The list serves as a forward map (position to term), but I need both possibilities
        Map<TermType, Integer> reverseLookupTerms = new HashMap<TermType, Integer>(
            numTerms);
        for (int i = 0; i < numTerms; ++i)
        {
            reverseLookupTerms.put(orderedTerms.get(i), i);
        }
        // Now, I need an ordered list for the document ids
        List<DocIdType> orderedDocumentIds = new ArrayList<DocIdType>(
            documents.keySet());
        Collections.sort(orderedDocumentIds);
        // And a reverse map
        Map<DocIdType, Integer> reverseLookupDocuments =
            new HashMap<DocIdType, Integer>(numDocs);
        for (int i = 0; i < numDocs; ++i)
        {
            reverseLookupDocuments.put(orderedDocumentIds.get(i), i);
        }

        // Now, I can start putting things in the valence spreading algorithm
        List<Integer> sizes = new ArrayList<Integer>(2);
        sizes.add(numTerms);
        sizes.add(numDocs);
        MultipartiteValenceMatrix mvm = new MultipartiteValenceMatrix(sizes,
            power, numThreads);
        // For all documents...
        for (int i = 0; i < numDocs; ++i)
        {
            // Add all terms that document uses (w/ their scores)
            for (Map.Entry<TermType, Double> term : documents.get(
                orderedDocumentIds.get(i)).entrySet())
            {
                mvm.addRelationship(0, reverseLookupTerms.get(term.getKey()), 1,
                    i, term.getValue());
            }
        }

        // Now, set the initial scores for all of the scores passed in
        for (Map.Entry<TermType, Pair<Double, Double>> e
            : weightedTerms.entrySet())
        {
            Integer idx = reverseLookupTerms.get(e.getKey());
            if (idx != null)
            {
                mvm.setElementsScore(0, idx.intValue(), e.getValue().getSecond(),
                    e.getValue().getFirst());
            }
        }
        for (Map.Entry<DocIdType, Pair<Double, Double>> e
            : weightedDocuments.entrySet())
        {
            Integer idx = reverseLookupDocuments.get(e.getKey());
            mvm.setElementsScore(1, idx.intValue(), e.getValue().getSecond(),
                e.getValue().getFirst());
        }

        // Now, solve the stupid thing!
        Vector rhs = mvm.init();
        ConjugateGradientMatrixSolver s = new ConjugateGradientMatrixSolver(rhs,
            rhs, tolerance);
        Vector result = s.learn(mvm).getOutput();

        // Now pull out all of the scores into my return type
        Result<TermType, DocIdType> r = new Result<TermType, DocIdType>();
        r.termWeights = new HashMap<TermType, Double>(numTerms);
        r.documentWeights = new HashMap<DocIdType, Double>(numDocs);
        for (int i = 0; i < numTerms; ++i)
        {
            r.termWeights.put(orderedTerms.get(i), result.getElement(i));
        }
        for (int i = 0; i < numDocs; ++i)
        {
            r.documentWeights.put(orderedDocumentIds.get(i), result.getElement(
                numTerms + i));
        }

        return r;
    }

    /**
     * The return type from running the spreadValence methods. This reports the
     * weights assigned to all of the input documents and all of the terms that
     * existed in all of the documents.
     */
    public static class Result<TermType, DocIdType>
    {

        /**
         * The weights assigned to all of the terms in all of the input
         * documents.
         */
        public Map<TermType, Double> termWeights;

        /**
         * The weights assigned to all of the input documents.
         */
        public Map<DocIdType, Double> documentWeights;

    }

}