/*******************************************************************************
* Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package tml.utils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import tml.corpus.Corpus;
import Jama.Matrix;
/**
 * Runs the external <code>las2.exe</code> program on a corpus and loads its results.
 *
 * <p>{@link #runLanczos(Corpus)} writes the corpus term/document matrix and a
 * parameter file to the working directory, launches the executable, moves the
 * produced files into {@code lanczos/tmp} and parses them: the text file
 * <code>lao2</code> (matrix dimensions and singular values) and the binary file
 * <code>lav2</code> (header plus vector data). The parsed values are exposed
 * through the getters of this class.
 */
public class LanczosSVDPACKCUtils {

    private static final Logger logger = Logger.getLogger(LanczosSVDPACKCUtils.class);

    /** Location of the executable that is copied into the temporary folder before running. */
    private static final String LAS2_EXECUTABLE = "lanczos/windows/las2.exe";

    /** Size in bytes of the lav file header decoded by {@link #readLav(File)}. */
    private static final int LAV_HEADER_SIZE = 24;

    // Line detectors and extractors for the lao text file. Compiled once because
    // readLao tests every line of the file against them.
    private static final Pattern SVALUES_MARKER = Pattern.compile(".*COMPUTED S-VALUES.*");
    private static final Pattern NSIG_MARKER = Pattern.compile(".*NSIG.*");
    private static final Pattern TERMS_MARKER = Pattern.compile(".*NO\\. OF TERMS\\s+\\(ROWS\\).*");
    private static final Pattern DOCS_MARKER = Pattern.compile(".*NO\\. OF DOCUMENTS\\s+\\(COLS\\).*");
    private static final Pattern SVALUE_ROW = Pattern.compile("^ \\.{6}\\s+\\d+\\s+(\\d+\\.\\d+E[\\+\\-]\\d+)\\s+.*$");
    private static final Pattern NSIG_ROW = Pattern.compile("^ \\.{6}\\s+NSIG\\s+=\\s*(\\d+)\\s*$");
    private static final Pattern TERMS_ROW = Pattern.compile("^ \\.{3}\\s+NO\\. OF TERMS\\s+\\(ROWS\\)\\s+=\\s*(\\d+)\\s*$");
    private static final Pattern DOCS_ROW = Pattern.compile("^ \\.{3}\\s+NO\\. OF DOCUMENTS\\s+\\(COLS\\)\\s+=\\s*(\\d+)\\s*$");

    /** Folder that receives the executable copy and the files it produces. */
    private final File tmpFolder = new File("lanczos/tmp");

    /** Number of rows reported by the lao file (0 until parsed). */
    private int numterms = 0;
    /** Number of columns reported by the lao file (0 until parsed). */
    private int numdocs = 0;
    /** NSIG value reported by the lao file (-1 until parsed). */
    private int singularvalues = -1;
    /** numterms x singularvalues matrix filled from the lav file. */
    private Matrix u = null;
    /** Filled as numdocs x singularvalues from the lav file, then stored transposed. */
    private Matrix v = null;
    /** Square matrix holding the parsed singular values on its diagonal. */
    private Matrix s = null;
    /** First value of the lav header. */
    private int rank;
    /** Second value of the lav header. */
    private long lanczosSteps;
    /** Third value of the lav header. */
    private double kappa;

    /**
     * Assembles <code>len</code> bytes, least significant byte first, into a long.
     *
     * @param arr   source bytes
     * @param start index of the least significant byte
     * @param len   number of bytes to combine (at most 8)
     * @return the combined value; bytes are treated as unsigned
     */
    private static long littleEndian(byte[] arr, int start, int len) {
        long accum = 0;
        for (int i = 0; i < len; i++) {
            accum |= ((long) (arr[start + i] & 0xff)) << (8 * i);
        }
        return accum;
    }

    /**
     * Decodes 8 bytes, least significant byte first, as the bit pattern of a double.
     *
     * @param arr   source bytes
     * @param start index of the first of the 8 bytes
     * @return the decoded double
     */
    private double arr2double(byte[] arr, int start) {
        return Double.longBitsToDouble(littleEndian(arr, start, 8));
    }

    /**
     * Decodes 4 bytes, least significant byte first, as an unsigned 32 bit value.
     *
     * @param arr   source bytes
     * @param start index of the first of the 4 bytes
     * @return the decoded value, always non-negative
     */
    private long arr2long(byte[] arr, int start) {
        return littleEndian(arr, start, 4);
    }

    /**
     * Reads from <code>in</code> until <code>buffer</code> is full or the stream ends.
     * A single <code>read</code> call is allowed to return fewer bytes than requested,
     * so the call is repeated.
     *
     * @param in     stream to read from
     * @param buffer destination, filled from index 0
     * @return the number of bytes stored, or -1 if the stream was already at its end
     * @throws IOException if reading fails
     */
    private static int readBlock(InputStream in, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int count = in.read(buffer, total, buffer.length - total);
            if (count < 0) {
                break;
            }
            total += count;
        }
        return total == 0 ? -1 : total;
    }

    /**
     * Copies the content of one file into another, overwriting the destination.
     *
     * @param src file to read
     * @param dst file to write
     * @throws IOException if either file cannot be opened, read or written
     */
    private void copy(File src, File dst) throws IOException {
        InputStream in = new FileInputStream(src);
        try {
            OutputStream out = new FileOutputStream(dst);
            try {
                byte[] buf = new byte[1024];
                int len;
                while ((len = in.read(buf)) > 0) {
                    out.write(buf, 0, len);
                }
            } finally {
                out.close();
            }
        } finally {
            // Closed even when opening or writing the destination fails.
            in.close();
        }
    }

    /**
     * @return the kappa value read from the lav file header
     */
    public double getKappa() {
        return kappa;
    }

    /**
     * @return the lanczosSteps value read from the lav file header
     */
    public long getLanczosSteps() {
        return lanczosSteps;
    }

    /**
     * @return the number of columns reported by the lao file
     */
    public int getNumdocs() {
        return numdocs;
    }

    /**
     * @return the number of rows reported by the lao file
     */
    public int getNumterms() {
        return numterms;
    }

    /**
     * @return the rank value read from the lav file header
     */
    public int getRank() {
        return rank;
    }

    /**
     * @return the diagonal matrix of singular values, or null before a successful run
     */
    public Matrix getS() {
        return s;
    }

    /**
     * @return the NSIG value reported by the lao file, or -1 before a successful parse
     */
    public int getSingularvalues() {
        return singularvalues;
    }

    /**
     * @return the u matrix, or null before a successful run
     */
    public Matrix getU() {
        return u;
    }

    /**
     * @return the v matrix, or null before a successful run
     */
    public Matrix getV() {
        return v;
    }

    /**
     * Extracts the integer captured by group 1 of <code>row</code>.
     *
     * @param row          pattern with one capturing group of digits
     * @param line         line to parse
     * @param errorMessage message of the exception thrown when the line does not match
     * @return the parsed integer
     * @throws Exception if the line does not match the pattern
     */
    private static int parseCount(Pattern row, String line, String errorMessage) throws Exception {
        Matcher matcher = row.matcher(line);
        if (!matcher.matches()) {
            throw new Exception(errorMessage);
        }
        return Integer.parseInt(matcher.group(1));
    }

    /**
     * Parses the lao text file: stores the NSIG, TERMS and DOCS counts in the
     * corresponding fields and collects the singular value rows that follow the
     * "COMPUTED S-VALUES" line.
     *
     * @param lao the lao file
     * @return the singular values stored in reverse file order, or null when the
     *         file contains no NSIG line
     * @throws Exception if a recognised line is malformed, if the value rows start
     *                   before all three counts were found, or if there are more
     *                   value rows than NSIG announced
     */
    private double[] readLao(File lao) throws Exception {
        singularvalues = -1;
        numterms = 0;
        numdocs = 0;
        double[] values = null;
        int current = 0;
        boolean started = false;
        BufferedReader reader = new BufferedReader(new FileReader(lao));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                if (started) {
                    if (singularvalues < 0) {
                        throw new Exception("Corrupt lao file! Couldn't find NSIG line before the singular values!");
                    }
                    if (numterms <= 0) {
                        throw new Exception("Corrupt lao file! Couldn't find TERMS line before the singular values!");
                    }
                    if (numdocs <= 0) {
                        throw new Exception("Corrupt lao file! Couldn't find DOCS line before the singular values!");
                    }
                    Matcher matcher = SVALUE_ROW.matcher(line);
                    if (!matcher.matches()) {
                        throw new Exception("Corrupt lao file!");
                    }
                    if (current >= values.length) {
                        // Previously surfaced as a bare ArrayIndexOutOfBoundsException.
                        throw new Exception("Corrupt lao file! Found more singular values than the NSIG line announced!");
                    }
                    // Rows are stored back to front.
                    values[values.length - current - 1] = Double.parseDouble(matcher.group(1));
                    current++;
                }
                if (SVALUES_MARKER.matcher(line).matches()) {
                    started = true;
                    // The line right after the marker is skipped.
                    reader.readLine();
                } else if (NSIG_MARKER.matcher(line).matches()) {
                    singularvalues = parseCount(NSIG_ROW, line, "Corrupt lao file! NSIG");
                    values = new double[singularvalues];
                    logger.debug("Total singular values:" + singularvalues);
                } else if (TERMS_MARKER.matcher(line).matches()) {
                    numterms = parseCount(TERMS_ROW, line, "Corrupt lao file! NO. OF TERMS");
                    logger.debug("Total rows:" + numterms);
                } else if (DOCS_MARKER.matcher(line).matches()) {
                    numdocs = parseCount(DOCS_ROW, line, "Corrupt lao file! NO. OF COLS");
                    logger.debug("Total columns:" + numdocs);
                }
            }
        } finally {
            reader.close();
        }
        return values;
    }

    /**
     * Returns the decimal exponent that <code>format</code> prints for a number.
     *
     * @param format a scientific-notation format using "E" as exponent separator
     * @param num    the number to inspect
     * @return the printed exponent, or 0 when the formatted text has no exponent part
     */
    private static double decimalExponent(DecimalFormat format, double num) {
        String[] parts = format.format(num).split("E");
        return parts.length > 1 ? Double.parseDouble(parts[1]) : 0;
    }

    /**
     * Parses the binary lav file: decodes the 24 byte header into rank,
     * lanczosSteps and kappa, fills the u and v matrices from the doubles that
     * follow and finally replaces v by its transpose. The counts parsed by
     * {@link #readLao(File)} and the u and v matrices must be set beforehand.
     *
     * @param lav the lav file
     * @throws Exception if the lao counts are missing, the header is truncated or
     *                   reading fails
     */
    private void readLav(File lav) throws Exception {
        if (singularvalues < 0) {
            throw new Exception("Corrupt lao file! Couldn't find NSIG line before the singular values!");
        }
        if (numterms <= 0) {
            throw new Exception("Corrupt lao file! Couldn't find TERMS line before the singular values!");
        }
        if (numdocs <= 0) {
            throw new Exception("Corrupt lao file! Couldn't find DOCS line before the singular values!");
        }
        FileInputStream reader = new FileInputStream(lav);
        try {
            byte[] headerbuff = new byte[LAV_HEADER_SIZE];
            if (readBlock(reader, headerbuff) != headerbuff.length) {
                throw new Exception("Corrupt lav file! Header is shorter than " + LAV_HEADER_SIZE + " bytes");
            }
            // NOTE(review): only bytes 0-3 and 8-11 are decoded for the two integers;
            // bytes 4-7 and 12-15 are ignored - confirm against the program writing the file.
            rank = (int) arr2long(headerbuff, 0);
            lanczosSteps = arr2long(headerbuff, 8);
            kappa = arr2double(headerbuff, 16);
            int vVectors = readVectors(reader);
            v = v.transpose();
            logger.debug("Vectors:" + vVectors);
        } finally {
            reader.close();
        }
    }

    /**
     * Reads consecutive 8 byte doubles and stores them first into v (until numdocs
     * groups of singularvalues numbers were read) and then into u (groups of
     * numterms numbers). Rows of v and columns of u are filled in reverse order.
     *
     * @param reader stream positioned right after the header
     * @return the number of complete groups stored into v
     * @throws IOException if reading fails
     */
    private int readVectors(InputStream reader) throws IOException {
        int vVectors = 0;
        if (singularvalues == 0) {
            // Nothing can be stored, and the index arithmetic below would divide by zero.
            logger.debug("No singular values reported, skipping vector data");
            return vVectors;
        }
        int uVectors = 0;
        int currentData = 0;
        int blocksize;
        // Symbols are pinned so the exponent separator is "E" whatever the default locale is.
        DecimalFormat exponentFormat = new DecimalFormat("0.############E0", new DecimalFormatSymbols(Locale.US));
        byte[] buff = new byte[8];
        while ((blocksize = readBlock(reader, buff)) != -1) {
            double num = arr2double(buff, 0);
            // HACK! No idea why bytes 24 (D0) and 46 (181) shouldn't be there...
            // A number whose decimal exponent is beyond +-10 is taken as a sign of a
            // stray byte: the window is shifted by one byte and decoded again.
            if (Math.abs(decimalExponent(exponentFormat, num)) > 10) {
                logger.debug(currentData + " Jumping: " + num);
                System.arraycopy(buff, 1, buff, 0, 7);
                buff[7] = (byte) reader.read();
                num = arr2double(buff, 0);
            }
            int index = currentData % singularvalues;
            int indexU = currentData % numterms;
            if (vVectors < numdocs) {
                v.set(numdocs - vVectors - 1, index, num);
            } else if (uVectors < singularvalues) {
                u.set(indexU, singularvalues - uVectors - 1, num);
            }
            if (blocksize != 8) {
                logger.debug("Some size lost " + blocksize);
            }
            currentData++;
            if (vVectors < numdocs) {
                if (currentData % singularvalues == 0) {
                    vVectors++;
                    if (vVectors == numdocs) {
                        // Restart the counter for the u data that follows.
                        currentData = 0;
                    }
                }
            } else if (currentData % numterms == 0) {
                uVectors++;
            }
        }
        return vVectors;
    }

    /**
     * Makes sure the temporary folder exists and deletes the plain files in it.
     *
     * @throws IOException if the folder cannot be created or listed
     */
    private void prepareTmpFolder() throws IOException {
        if (!tmpFolder.isDirectory() && !tmpFolder.mkdirs()) {
            throw new IOException("Could not create temporary folder " + tmpFolder.getAbsolutePath());
        }
        // listFiles returns null instead of throwing when the folder cannot be read.
        File[] files = tmpFolder.listFiles();
        if (files == null) {
            throw new IOException("Could not list temporary folder " + tmpFolder.getAbsolutePath());
        }
        for (File f : files) {
            if (!f.isDirectory()) {
                f.delete();
            }
        }
    }

    /**
     * Moves a file from the working directory into the temporary folder.
     * A failed move is only logged; the caller fails later when it tries to
     * read a file that is not there.
     *
     * @param name file name relative to the working directory
     * @return the location of the file inside the temporary folder
     */
    private File moveToTmpFolder(String name) {
        File source = new File(name);
        File target = new File(tmpFolder.getAbsolutePath() + "/" + name);
        if (!source.renameTo(target)) {
            logger.warn("Could not move " + source.getAbsolutePath() + " to " + target.getAbsolutePath());
        }
        return target;
    }

    /**
     * Launches the executable, logs every line it prints and waits for it to end.
     * The working directory is inherited from the current process.
     *
     * @param exec the executable to start
     * @throws IOException          if the process cannot be started or its output read
     * @throws InterruptedException if the wait for the process is interrupted
     */
    private void execute(File exec) throws IOException, InterruptedException {
        // A single-element command keeps paths containing spaces intact, and merging
        // stderr into stdout prevents the child from blocking on an undrained pipe.
        ProcessBuilder builder = new ProcessBuilder(exec.getAbsolutePath());
        builder.redirectErrorStream(true);
        Process process = builder.start();
        BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
        try {
            String outputLine;
            while ((outputLine = reader.readLine()) != null) {
                logger.debug(outputLine);
            }
        } finally {
            reader.close();
        }
        int exitCode = process.waitFor();
        // Only logged: whether the executable returns a meaningful code is not known here.
        logger.debug("Lanczos process exit code: " + exitCode);
    }

    /**
     * Runs the external program for the given corpus and loads the results into
     * this object (see the getters).
     *
     * @param corpus the corpus providing the term/document matrix and the number
     *               of dimensions to keep
     * @throws Exception if a file cannot be written or read, the process cannot be
     *                   run, or the produced files are missing or corrupt
     */
    public void runLanczos(Corpus corpus) throws Exception {
        prepareTmpFolder();
        writeCorpusParametersForLanczos(corpus, "lap2", corpus.getSemanticSpace().getDimensionsKept());
        writeCorpusInHBFormat(corpus, "matrix");
        File exec = new File(tmpFolder.getAbsolutePath() + "/las2.exe");
        copy(new File(LAS2_EXECUTABLE), exec);

        long time = System.nanoTime();
        execute(exec);
        time = System.nanoTime() - time;
        // nanoTime yields nanoseconds; 1e6 of them make one millisecond.
        logger.debug("Lanczos took " + (time / 1.0e6) + " millis");

        moveToTmpFolder("matrix");
        moveToTmpFolder("lap2");
        File lao2 = moveToTmpFolder("lao2");
        File lav2 = moveToTmpFolder("lav2");

        double[] singulars = readLao(lao2);
        if (singulars == null) {
            throw new Exception("Lanczos failed execution, please check the logs");
        }
        s = new Matrix(singulars.length, singulars.length);
        for (int i = 0; i < s.getColumnDimension(); i++) {
            s.set(i, i, singulars[i]);
        }
        u = new Matrix(numterms, singularvalues);
        v = new Matrix(numdocs, singularvalues);
        readLav(lav2);
    }

    /**
     * Writes tokens separated by single spaces, ending a line once it is longer
     * than 75 characters or the last token was written. The first line starts
     * with one space.
     *
     * @param writer             destination
     * @param tokens             texts to write, in order
     * @param continuationPrefix text every line after the first one starts with
     * @throws IOException if writing fails
     */
    private static void writeWrapped(FileWriter writer, List<String> tokens, String continuationPrefix)
            throws IOException {
        StringBuilder line = new StringBuilder(" ");
        int last = tokens.size() - 1;
        for (int i = 0; i <= last; i++) {
            line.append(tokens.get(i)).append(" ");
            if (line.length() > 75 || i == last) {
                line.append("\n");
                writer.append(line.toString());
                line.setLength(0);
                line.append(continuationPrefix);
            }
        }
    }

    /**
     * Writes the term/document matrix of the corpus column by column: a fixed
     * title, a line with the counts of terms, passages and non-zeros, a format
     * line, the 1-based cumulative start of each column, the 1-based row index of
     * every non-zero cell and finally the non-zero values with three decimals.
     *
     * @param corpus   the corpus whose matrix is written
     * @param filename the file to create
     * @throws Exception if the file cannot be written
     */
    private void writeCorpusInHBFormat(Corpus corpus, String filename) throws Exception {
        // Symbols are pinned so the decimal separator is always "." whatever the default locale is.
        DecimalFormat valueFormat = new DecimalFormat("0.000", new DecimalFormatSymbols(Locale.US));
        FileWriter writer = new FileWriter(new File(filename));
        try {
            writer.append("Learning Systems Group University of Sydney matrix\n");
            writer.append("#\n");
            String rowsAndColsline = "rra ";
            rowsAndColsline += corpus.getTerms().length + " ";
            rowsAndColsline += corpus.getPassages().length + " ";
            rowsAndColsline += corpus.getNonzeros() + " ";
            StringBuilder padded = new StringBuilder(rowsAndColsline);
            while (padded.length() < 79) {
                padded.append(" ");
            }
            padded.append("0\n");
            writer.append(padded.toString());
            writer.append(" (10i8) (10i8) (8f10.3) (8f10.3)\n");

            List<String> indices = new ArrayList<String>();
            List<String> termindices = new ArrayList<String>();
            List<String> values = new ArrayList<String>();
            Matrix m = corpus.getTermDocMatrix();
            int acumnonzeros = 1;
            indices.add(Integer.toString(acumnonzeros));
            for (int doc = 0; doc < m.getColumnDimension(); doc++) {
                for (int term = 0; term < m.getRowDimension(); term++) {
                    double cell = m.get(term, doc);
                    if (cell != 0) {
                        termindices.add(Integer.toString(term + 1));
                        values.add(valueFormat.format(cell));
                        acumnonzeros++;
                    }
                }
                indices.add(Integer.toString(acumnonzeros));
            }

            // NOTE(review): unlike the two sections below, continuation lines of the
            // column starts begin without a leading space. Kept as found because the
            // tolerance of the program reading this file is not visible from here.
            writeWrapped(writer, indices, "");
            writeWrapped(writer, termindices, " ");
            writeWrapped(writer, values, " ");
        } finally {
            writer.close();
        }
    }

    /**
     * Writes the one-line parameter file read by the external program.
     *
     * @param corpus            the corpus; the smaller of its passage and term
     *                          counts is written as the second field
     * @param filename          the file to create
     * @param maxDimensionality value written as the third field
     * @throws Exception if the file cannot be written
     */
    private void writeCorpusParametersForLanczos(Corpus corpus, String filename, int maxDimensionality) throws Exception {
        // Named differently from the "rank" field, which holds the value read back from the lav file.
        int smallestDimension = Math.min(corpus.getPassages().length, corpus.getTerms().length);
        FileWriter writer = new FileWriter(new File(filename));
        try {
            writer.append("'matrix' " + smallestDimension + " " + maxDimensionality + " -1.0e-30 1.0e-30 TRUE 1.0e-6 0");
        } finally {
            writer.close();
        }
    }
}