package com.facebook.hive.udf; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import java.util.ArrayList; /** * Performs latent Dirichlet allocation (LDA) inference. Note that this is * *NOT* linear discriminant analysis; this is a mixed-membership model of * discrete data. For details on the model see Latent Dirichlet Allocation by * Blei et al. (2003). The procedure assumes fixed topic multinomials; only * the parameters for a single document (per row) are * inferred. * * Some of the parameters are stored as flattened matrices. A flattened matrix * is a vector representation of a matrix. If the matrix has N rows and M * columns, then the entry in position (i,j) of the original matrix is stored in * entry i + N * j of the flattened matrix. * * The arguments of the UDF are: * * words - an ARRAY of length W containing the words of the document. Each * word is represented by an INT 0-index into the columns of topics. * That is, a value of i in 'words' represents the word indexed by the * ith column of topics. * topics - a flattened matrix of dimension KxV, where K represents the number * of topics and V the size of the vocabulary. The value in the * (i,j)th entry should be proportional to the probability of seeing * word j in topic i. * initial - an initial guess for the document-topic distribution. This should * be an array of DOUBLEs of length K which sums to one. * alpha - Dirichlet hyperparameter for the prior on the document-topic * distribution. * num_iterations - the number of iterations of inference to perform. * * Inference is performed using the CVB0 technique. * * If any of the arguments are NULL then NULL is returned. * * The return value is a vector of length K representing the inferred * topic loadings for that document. It is an array of DOUBLEs whose * entries sum to one. 
*/ @Description(name = "lda", value = "_FUNC_(words, topics, initial, alpha, num_iterations) -" + " Perform LDA inference on a document given by 0-indexed" + " words using the topics (which should be properly " + "normalized and smoothed. Returns the topic proportions.") public class UDFLDA extends UDF { public ArrayList<Double> evaluate( ArrayList<Integer> words, ArrayList<Double> topics, ArrayList<Double> initial, Double alpha, Integer num_iterations) { if (words == null || topics == null || initial == null || alpha == null || num_iterations == null) { return null; } int K = initial.size(); int Nw = words.size(); if (K == 0 || num_iterations <= 0) { return null; } double[] document_sum = new double[K]; double[] assignments = new double[K * Nw]; // Initialize document_sum for (int kk = 0; kk < K; ++kk) { document_sum[kk] = initial.get(kk) * Nw; for (int ww = 0; ww < Nw; ++ww) { assignments[kk + ww * K] = initial.get(kk); } } for (int ii = 0; ii < num_iterations; ++ii) { for (int ww = 0; ww < Nw; ++ww) { int word = words.get(ww); double w_sum = 0.0; for (int kk = 0; kk < K; ++kk) { document_sum[kk] -= assignments[kk + ww * K]; } for (int kk = 0; kk < K; ++kk) { assignments[kk + ww * K] = (document_sum[kk] + alpha) * topics.get(kk + word * K); w_sum += assignments[kk + ww * K]; } for (int kk = 0; kk < K; ++kk) { assignments[kk + ww * K] /= w_sum; document_sum[kk] += assignments[kk + ww * K]; } } } // Normalize document_sum double sum = 0.0; for (int kk = 0; kk < K; ++kk) { sum += document_sum[kk]; } ArrayList<Double> result = new ArrayList<Double>(K); for (int kk = 0; kk < K; ++kk) { if (sum == 0.0) { result.add(1.0 / K); } else { result.add(document_sum[kk] / sum); } } return result; } }