/*******************************************************************************
* Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package tml.utils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import tml.Configuration;
import tml.corpus.Corpus;
import Jama.Matrix;
public class LanczosSVDLIBCUtils {
private static Logger logger = Logger.getLogger(LanczosSVDLIBCUtils.class);
private File tmpFolder = null;
private int numterms = 0;
private int numdocs = 0;
private int singularvalues = -1;
private Matrix u = null;
private Matrix v = null;
private Matrix s = null;
private int rank;
private long lanczosSteps;
private double kappa;
private String osfolder = "windows";
private static final String WIN32_EXEC = "svd.exe";
private static final String LINUX_EXEC = "svd";
private String executable = WIN32_EXEC;
private static final String WIN32_NEWLINE = "\r\n";
private static final String LINUX_NEWLINE = "\n";
private String newLine = WIN32_NEWLINE;
private static final String TMP_FOLDER = "tmp";
private String baseFolder = "lanczos";
/**
* @return the kappa
*/
public double getKappa() {
return kappa;
}
/**
* @return the lanczosSteps
*/
public long getLanczosSteps() {
return lanczosSteps;
}
/**
* @return the numdocs
*/
public int getNumdocs() {
return numdocs;
}
/**
* @return the numterms
*/
public int getNumterms() {
return numterms;
}
/**
* @return the rank
*/
public int getRank() {
return rank;
}
/**
* @return the s
*/
public Matrix getS() {
return s;
}
/**
* @return the singularvalues
*/
public int getSingularvalues() {
return singularvalues;
}
/**
* @return the u
*/
public Matrix getU() {
return u;
}
/**
* @return the v
*/
public Matrix getV() {
return v;
}
private Matrix readDenseMatrix(File file, Corpus corpus) throws IOException {
Matrix m = null;
BufferedReader reader = new BufferedReader(new FileReader(file));
String line = reader.readLine();
String[] parts = line.split("\\s+");
int rows;
int columns;
boolean matrixS = false;
if(parts.length == 1) {
rows = Integer.parseInt(parts[0]);
if(rows != corpus.getSemanticSpace().getDimensionsKept()) {
logger.debug("Found less singular values than solicited. " + rows + " out of " + corpus.getSemanticSpace().getDimensionsKept());
rows = corpus.getSemanticSpace().getDimensionsKept();
}
columns = rows;
matrixS = true;
} else if(parts.length == 2) {
rows = Integer.parseInt(parts[0]);
columns = Integer.parseInt(parts[1]);
} else {
reader.close();
logger.error("Invalid format of first line!");
return null;
}
m = new Matrix(rows, columns);
int lineNumber = 0;
while((line = reader.readLine()) != null) {
String[] lineparts = line.split("\\s+");
if((!matrixS && lineparts.length != columns)
|| (matrixS && lineparts.length != 1)){
logger.error("Invalid matrix file! " + line);
reader.close();
throw new IOException("Invalid file");
}
if(matrixS)
for(int col=0;col<lineparts.length;col++) {
m.set(lineNumber, lineNumber, Double.parseDouble(lineparts[col]));
}
else
for(int col=0;col<lineparts.length;col++) {
m.set(lineNumber, col, Double.parseDouble(lineparts[col]));
}
if(lineNumber > rows)
logger.error("Longer file! Extra line:" + line);
lineNumber++;
}
reader.close();
logger.debug(file.getName() + " done! " + lineNumber + " lines processed");
return m;
}
public LanczosSVDLIBCUtils() throws IOException {
baseFolder = Configuration.getTmlFolder() + "/lanczos";
if(System.getProperty("os.name").startsWith("Windows")) {
this.osfolder = "windows";
this.executable = baseFolder + "/" + this.osfolder + "/" + WIN32_EXEC;
this.newLine = WIN32_NEWLINE;
}
else {
this.osfolder = "linux";
this.executable = baseFolder + "/" + this.osfolder + "/" + LINUX_EXEC;
this.newLine = LINUX_NEWLINE;
}
}
public void runLanczos(Corpus corpus, String svdFilename) throws Exception {
this.tmpFolder = new File(baseFolder + "/" + TMP_FOLDER);
// Delete output and matrix files
for(File f : (new File(baseFolder + "/" + TMP_FOLDER)).listFiles()) {
if(f.getName().equals(svdFilename + ".matrix")
|| f.getName().equals(svdFilename + "-Ut")
|| f.getName().equals(svdFilename + "-Vt")
|| f.getName().equals(svdFilename + "-S")) {
f.delete();
}
}
writeCorpusInHBFormat(corpus, this.tmpFolder.getAbsolutePath() + "/" + svdFilename + ".matrix");
File lanczosExec = new File(this.executable);
String ls_str;
long time = System.nanoTime();
String matrixFile = this.tmpFolder.getAbsolutePath() + "/" + svdFilename + ".matrix";
if(this.osfolder.equals("windows"))
matrixFile = "\"" + matrixFile + "\"";
String outFolder = this.tmpFolder.getAbsolutePath() + "/" + svdFilename;
if(this.osfolder.equals("windows"))
outFolder = "\"" + outFolder + "\"";
String linexec = lanczosExec.getAbsolutePath()
+ " -d " + corpus.getSemanticSpace().getDimensionsKept()
+ " -o " + outFolder
+ " -r sth "
+ " -w dt " + matrixFile;
logger.debug("Executing: " + linexec);
Process ls_proc = Runtime.getRuntime().exec(linexec);
// get its output (your input) stream
BufferedReader readerInput = new BufferedReader(new InputStreamReader(ls_proc.getInputStream()));
BufferedReader readerErr = new BufferedReader(new InputStreamReader(ls_proc.getErrorStream()));
BufferedReader reader = null;
while(!readerInput.ready() && !readerErr.ready() && (System.nanoTime() - time) <= 10E13);
if((System.nanoTime() - time) > 10E13) {
logger.error("Timeout trying to execute Lanczos");
throw new Exception("Timeout trying to execute Lanczos");
}
if(readerInput.ready()) {
reader = readerInput;
readerErr.close();
} else {
reader = readerErr;
readerInput.close();
}
while (reader.ready() && (ls_str = reader.readLine()) != null) {
logger.debug(ls_str);
Pattern pattern = Pattern.compile("^\\s*SINGULAR VALUES FOUND\\s+=\\s*(\\d+)\\s*$");
Matcher matcher = pattern.matcher(ls_str);
if(matcher.matches())
singularvalues = Integer.parseInt(matcher.group(1));
}
ls_proc.waitFor();
time = System.nanoTime() - time;
logger.debug("Lanczos took " + (time * 10E-9) + " millis");
u = readDenseMatrix(new File(baseFolder + "/" + TMP_FOLDER + "/" + svdFilename + "-Ut"), corpus).transpose();
s = readDenseMatrix(new File(baseFolder + "/" + TMP_FOLDER + "/" + svdFilename + "-S"), corpus);
v = readDenseMatrix(new File(baseFolder + "/" + TMP_FOLDER + "/" + svdFilename + "-Vt"), corpus).transpose();
// Delete output and matrix files
for(File f : (new File(baseFolder + "/" + TMP_FOLDER)).listFiles()) {
if(f.getName().equals(svdFilename + ".matrix")
|| f.getName().equals(svdFilename + "-Ut")
|| f.getName().equals(svdFilename + "-Vt")
|| f.getName().equals(svdFilename + "-S")) {
f.delete();
}
}
}
private void writeCorpusInHBFormat(Corpus corpus, String filename) throws Exception {
FileWriter writer = new FileWriter(new File(filename));
writer.append("Learning Systems Group University of Sydney matrix" + this.newLine);
writer.append("#" + this.newLine);
String rowsAndColsline = "rra ";
rowsAndColsline += corpus.getTerms().length + " ";
rowsAndColsline += corpus.getPassages().length + " ";
rowsAndColsline += corpus.getNonzeros() + " ";
while(rowsAndColsline.length() < 79)
rowsAndColsline += " ";
rowsAndColsline += "0" + this.newLine;
writer.append(rowsAndColsline);
writer.append(" (10i8) (10i8) (8f10.3) (8f10.3)" + this.newLine);
List<Integer> indices = new ArrayList<Integer>();
List<Integer> termindices = new ArrayList<Integer>();
List<Double> values = new ArrayList<Double>();
Matrix m = corpus.getTermDocMatrix();
if(m.get(0, 0) > 0)
indices.add(1);
int acumnonzeros = 1;
for(int doc = 0; doc<m.getColumnDimension(); doc++) {
int nonzeros = 0;
for(int term = 0; term<m.getRowDimension(); term++) {
if(m.get(term, doc) != 0) {
termindices.add(term + 1);
values.add(new Double(m.get(term, doc)));
nonzeros++;
}
}
acumnonzeros += nonzeros;
indices.add(acumnonzeros);
}
String indicesLine = " ";
for(int i = 0; i<indices.size(); i++) {
int ind = indices.get(i);
indicesLine += Integer.toString(ind) + " ";
if(indicesLine.length() > 75 || i == indices.size()-1) {
indicesLine += this.newLine;
writer.append(indicesLine);
indicesLine = "";
}
}
String termIndicesLine = " ";
for(int i=0; i<termindices.size(); i++) {
termIndicesLine += termindices.get(i) + " ";
if(termIndicesLine.length() > 75 || i == termindices.size()-1) {
termIndicesLine += this.newLine;
writer.append(termIndicesLine);
termIndicesLine = " ";
}
}
String valuesLine = " ";
for(int i=0; i<values.size(); i++) {
valuesLine += (new DecimalFormat("0.000")).format(values.get(i)) + " ";
if(valuesLine.length() > 75 || i == values.size()-1) {
valuesLine += this.newLine;
writer.append(valuesLine);
valuesLine = " ";
}
}
writer.close();
logger.debug("Matrix file written. " + indices.size() + " indices "
+ termindices.size() + " term indices " + values.size() + " values");
}
}