/* * DrakkarKeel - An Enterprise Collaborative Search Platform * * The contents of this file are subject under the terms described in the * DRAKKARKEEL_LICENSE file included in this distribution; you may not use this * file except in compliance with the License. * * 2013-2014 DrakkarKeel Platform. */ package drakkar.mast.recommender; import Jama.Matrix; import Jama.SingularValueDecomposition; import drakkar.oar.DocSuggest; import drakkar.oar.TermSuggest; import drakkar.oar.util.OutputMonitor; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import org.tartarus.snowball.SnowballStemmer; public class LSIManager { private double[][] valsMatrix, termVectors, docVectors; private double[] scales; private int singularValue; private String[] terms, termsStemmer; private DocInfo[] docs; private Class stemClass; private org.tartarus.snowball.SnowballStemmer stemmer; private String language; /** * */ public LSIManager() { } /** * * @param c */ public LSIManager(CollectionInfo c) { valsMatrix = AWEntropy.assignmentOfWeights(c); this.language = "english"; // Matrix a = new Matrix(valsMatrix); // a.print(5, 2); terms = c.getTerms().toArray(new String[0]); singularValue = c.getSingularValue(); docs = c.getDocs().toArray(new DocInfo[0]); } /** * * @param valsMatrix * @param singularValue * @param terms * @param docs */ public LSIManager(double[][] valsMatrix, int singularValue, String[] terms, DocInfo[] docs) { this.valsMatrix = valsMatrix; this.singularValue = singularValue; this.terms = terms; this.docs = docs; this.language = "english"; } /** * Inicializa la matriz de descomposición de valores singulares */ public void initSVDMatrix() { try { Matrix A = new Matrix(valsMatrix); SingularValueDecomposition s = A.svd(); Matrix U = s.getU(); // matriz de términos termVectors = U.getArrayCopy(); double[] singularValues = s.getSingularValues(); // valores sigulares scales = new double[singularValue]; System.arraycopy(singularValues, 0, scales, 0, singularValue); Matrix V = s.getV(); // matriz de documentos docVectors = V.getArrayCopy(); stemClass = Class.forName("org.tartarus.snowball.ext." + language + "Stemmer"); // Class stemClass = Class.forName("org.tartarus.snowball.ext.englishStemmer"); stemmer = (SnowballStemmer) stemClass.newInstance(); this.termsStemmer = stemmerTerms(terms); } catch (InstantiationException ex) { OutputMonitor.printStream("", ex); } catch (IllegalAccessException ex) { OutputMonitor.printStream("", ex); } catch (ClassNotFoundException ex) { OutputMonitor.printStream("", ex); } OutputMonitor.printLine("[Class] LSIManager [Method] initSVDMatrix.", OutputMonitor.TRACE_MESSAGE); } /** * * @param m * @param r * @param c * @return */ public double[][] trunk(double[][] m, int r, int c) { double[][] subm = new double[r][c]; for (int i = 0; i < r; i++) { System.arraycopy(m[i], 0, subm[i], 0, c); } return subm; } /** * * @param m * @param maxFactors * @return */ public double[][] trunk(double[][] m, int maxFactors) { double[][] subm = new double[m.length][maxFactors]; for (int i = 0; i < m.length; i++) { System.arraycopy(m[i], 0, subm[i], 0, maxFactors); } return subm; } /** * Devuelve una lista de sugerencias de términos de consulta, apartir * de la consulta especificada. * * @param query términos de la consulta de búsqueda * * @return términos sugeridos */ public List<TermSuggest> getTermsSuggest(String query) { List<TermSuggest> termsSuggest = new ArrayList<TermSuggest>(); List<String> querys = Arrays.asList(query.split(" |,")); double[] queryVector = new double[singularValue]; Arrays.fill(queryVector, 0.0); for (String term : querys) { addTermVector(term, termVectors, queryVector); } OutputMonitor.printLine("Query=" + querys, OutputMonitor.INFORMATION_MESSAGE); OutputMonitor.printLine("", OutputMonitor.INFORMATION_MESSAGE); System.out.print("Query Vector=("); for (int k = 0; k < queryVector.length; ++k) { if (k > 0) { System.out.print(", "); } System.out.printf("% 5.2f", queryVector[k]); } System.out.println(" )"); // System.out.println("\nTERM SCORES VS. QUERY"); for (int i = 0; i < termVectors.length; ++i) { // double score = dotProduct(queryVector, termVectors[i], scales); double score = cosine(queryVector, termVectors[i], scales); // System.out.printf(" %d: % 5.2f %s\n", i, score, terms[i]); if (score > 0 && !querys.contains(terms[i])) { termsSuggest.add(new TermSuggest(terms[i], score)); } } List<TermSuggest> list = getNewTermsSuggestList(termsSuggest); return list; } /** * Devuelve una lista de sugerencias de documentos, apartir de la consulta especificada. * * @param query términos de la consulta de búsqueda * * @return documentos sugeridos */ public List<DocSuggest> getDocsSuggest(String query) { List<DocSuggest> docsSuggest = new ArrayList<DocSuggest>(); String[] queryTerms = query.split(" |,"); // space or comma separated double[] queryVector = new double[singularValue]; Arrays.fill(queryVector, 0.0); for (String term : queryTerms) { addTermVector(term, termVectors, queryVector); } System.out.println("\nQuery=" + Arrays.asList(queryTerms)); System.out.print("Query Vector=("); for (int k = 0; k < queryVector.length; ++k) { if (k > 0) { System.out.print(", "); } System.out.printf("% 5.2f", queryVector[k]); } System.out.println(" )"); // System.out.println("\nDOCUMENT SCORES VS. QUERY"); for (int j = 0; j < docVectors.length; ++j) { // double score = dotProduct(queryVector, docVectors[j], scales); double score = cosine(queryVector, docVectors[j], scales); // System.out.printf(" %d: % 5.2f %s\n", j, score, docs[j]); if (score > 0) { docsSuggest.add(new DocSuggest(docs[j].getName(), docs[j].getFilePath(), score)); } } List<DocSuggest> list = getNewDocsSuggestList(docsSuggest); OutputMonitor.printLine("[Class] LSIManager [Method] getDocsSuggest.", OutputMonitor.TRACE_MESSAGE); return list; } /** * Devuelve una lista de sugerencias de términos de consulta y documentos, apartir * de la consulta especificada. * * @param query términos de la consulta de búsqueda * * @return términos y documentos sugeridos */ public TermDocSuggest getTermsDocsSuggest(String query) { List<DocSuggest> docsSuggest = new ArrayList<DocSuggest>(); List<TermSuggest> termsSuggest = new ArrayList<TermSuggest>(); // String[] queryTerms = query.split(" |,"); // space or comma separated List<String> querys = Arrays.asList(query.split(" |,")); double[] queryVector = new double[singularValue]; Arrays.fill(queryVector, 0.0); for (String term : querys) { addTermVector(term, termVectors, queryVector); } System.out.println("\nQuery=" + querys); System.out.print("Query Vector=("); for (int k = 0; k < queryVector.length; ++k) { if (k > 0) { System.out.print(", "); } System.out.printf("% 5.2f", queryVector[k]); } System.out.println(" )"); // System.out.println("\nDOCUMENT SCORES VS. QUERY"); for (int j = 0; j < docVectors.length; ++j) { // double score = dotProduct(queryVector, docVectors[j], scales); double score = cosine(queryVector, docVectors[j], scales); // System.out.printf(" %d: % 5.2f %s\n", j, score, docs[j]); if (score > 0) { docsSuggest.add(new DocSuggest(docs[j].getName(), docs[j].getFilePath(), score)); } } // System.out.println("\nTERM SCORES VS. QUERY"); for (int i = 0; i < termVectors.length; ++i) { // double score = dotProduct(queryVector, termVectors[i], scales); double score = cosine(queryVector, termVectors[i], scales); // System.out.printf(" %d: % 5.2f %s\n", i, score, terms[i]); if (score > 0 && !querys.contains(terms[i])) { termsSuggest.add(new TermSuggest(terms[i], score)); } } List<DocSuggest> docsList = getNewDocsSuggestList(docsSuggest); List<TermSuggest> termsList = getNewTermsSuggestList(termsSuggest); TermDocSuggest termsDocsSuggest = new TermDocSuggest(termsList, docsList); OutputMonitor.printLine("[Class] LSIManager [Method] getTermsDocsSuggest.", OutputMonitor.TRACE_MESSAGE); return termsDocsSuggest; } /** * * @param term * @param termVectors * @param queryVector */ public void addTermVector(String term, double[][] termVectors, double[] queryVector) { for (int i = 0; i < termsStemmer.length; ++i) { if (termsStemmer[i].equalsIgnoreCase(stemmerTerm(term))) { // if (terms[i].equals(term)) { for (int j = 0; j < singularValue; ++j) { queryVector[j] += termVectors[i][j]; } return; } } } /** * * @param xs * @param ys * @param scales * @return */ public double dotProduct(double[] xs, double[] ys, double[] scales) { double sum = 0.0; for (int k = 0; k < xs.length; ++k) { sum += xs[k] * ys[k] * scales[k]; } return sum; } /** * * @param xs * @param ys * @param scales * @return */ public double cosine(double[] xs, double[] ys, double[] scales) { double product = 0.0; double xsLengthSquared = 0.0; double ysLengthSquared = 0.0; for (int k = 0; k < xs.length; ++k) { double sqrtScale = Math.sqrt(scales[k]); double scaledXs = sqrtScale * xs[k]; double scaledYs = sqrtScale * ys[k]; xsLengthSquared += scaledXs * scaledXs; ysLengthSquared += scaledYs * scaledYs; product += scaledXs * scaledYs; } return product / Math.sqrt(xsLengthSquared * ysLengthSquared); } // este método devuelve la lista final de términos a sugerir al cliente private List<TermSuggest> getNewTermsSuggestList(List<TermSuggest> termsSuggest) { List<TermSuggest> list; int size = termsSuggest.size(); if (size != 0) { Collections.sort(termsSuggest); if (size > singularValue) { int sizeSuggest = size / (singularValue * 3); if (sizeSuggest > 0) { list = new ArrayList<TermSuggest>(termsSuggest.subList(0, sizeSuggest)); } else { list = new ArrayList<TermSuggest>(termsSuggest.subList(0, singularValue)); } } else { list = termsSuggest; } } else { list = new ArrayList<TermSuggest>(0); } return list; } //TODO arreglar el valor de selection de la lista // este método devuelve la lista final de términos a sugerir al cliente private List<DocSuggest> getNewDocsSuggestList(List<DocSuggest> docsSuggest) { List<DocSuggest> list; int size = docsSuggest.size(); if (size != 0) { Collections.sort(docsSuggest); if (size > singularValue) { int sizeSuggest = size / singularValue; list = docsSuggest.subList(0, sizeSuggest); } else { list = docsSuggest; } } else { list = new ArrayList<DocSuggest>(0); } return list; } private String[] stemmerTerms(String[] terms) { String[] finalTerms = new String[terms.length]; for (int i = 0; i < terms.length; i++) { finalTerms[i] = stemmerTerm(terms[i]); } OutputMonitor.printLine("[Class] LSIManager [Method] stemmerTerms.", OutputMonitor.TRACE_MESSAGE); return finalTerms; } private String stemmerTerm(String term) { stemmer.setCurrent(term); stemmer.stem(); return stemmer.getCurrent(); } /** * * @return */ public int getSingularValue() { return singularValue; } /** * * @param singularValue */ public void setSingularValue(int singularValue) { this.singularValue = singularValue; } /** * * @return */ public double[][] getValsMatrix() { return valsMatrix; } /** * * @param valsMatrix */ public void setValsMatrix(double[][] valsMatrix) { this.valsMatrix = valsMatrix; } /** * * @return */ public String getLanguage() { return language; } /** * * @param language */ public void setLanguage(String language) { this.language = language; } /** * * @return */ public DocInfo[] getDocs() { return docs; } /** * * @param docs */ public void setDocs(DocInfo[] docs) { this.docs = docs; } /** * * @return */ public String[] getTerms() { return terms; } /** * * @param terms */ public void setTerms(String[] terms) { this.terms = terms; } /** * * @param c */ public void setInitValues(CollectionInfo c) { valsMatrix = AWEntropy.assignmentOfWeights(c); language = "english"; terms = c.getTerms().toArray(new String[0]); singularValue = c.getSingularValue(); docs = c.getDocs().toArray(new DocInfo[0]); initSVDMatrix(); } }