/*
* Copyright 2014 Radialpoint SafeCare Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.radialpoint.word2vec;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
/**
* This program takes vectors produced by the C program word2vec and transforms them into a Java binary file to be
* read by the Vectors class.
*/
public class ConvertVectors {

    /**
     * Reads a binary vectors file, normalizes every vector to unit length and writes the result through
     * {@code Vectors.writeTo}.
     *
     * <p>
     * Expected input layout (the binary output of word2vec.c): an ASCII header line
     * {@code "<words> <size>\n"}, followed for each word by the word's bytes, a single space, {@code size}
     * 4-byte little-endian IEEE-754 floats and a newline.
     *
     * @param args
     *            the input C vectors file, output Java vectors file
     * @throws VectorsException
     *             if an argument is missing, the input file does not exist, or the header or a word is
     *             malformed or truncated
     * @throws IOException
     *             if reading or writing fails; a file truncated inside a vector raises
     *             {@link java.io.EOFException}
     */
    public static void main(String[] args) throws VectorsException, IOException {
        if (args.length < 2)
            throw new VectorsException("Usage: ConvertVectors <input C vectors file> <output Java vectors file>");

        File vectorFile = new File(args[0]);
        File outputFile = new File(args[1]);

        if (!vectorFile.exists())
            throw new VectorsException("Vectors file not found");

        float[][] vectors;
        String[] vocabVects;

        // Buffered: the file is consumed byte by byte and can be very large.
        DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(vectorFile)));
        try {
            String[] parts = readHeaderLine(in).trim().split("\\s+");
            if (parts.length < 2)
                throw new VectorsException("Malformed vectors file header");

            int words = toArrayLength(parts[0]);
            int size = toArrayLength(parts[1]);

            vectors = new float[words][];
            vocabVects = new String[words];
            System.out.println("" + words + " words with size " + size + " per vector.");

            // Report progress roughly every 10%; never 0, so the modulo below is safe for small vocabularies.
            int progressStep = Math.max(1, words / 10);

            for (int w = 0; w < words; w++) {
                if (w % progressStep == 0) {
                    System.out.println("Read " + w + " words");
                }
                vocabVects[w] = readWord(in);
                float[] m = readVector(in, size);
                normalize(m);
                vectors[w] = m;
            }
        } finally {
            in.close();
        }

        // Build the model before opening the output so a failure here does not leave an empty output file.
        Vectors instance = new Vectors(vectors, vocabVects);
        FileOutputStream fos = new FileOutputStream(outputFile);
        try {
            instance.writeTo(fos);
        } finally {
            fos.close();
        }
    }

    /** Reads bytes up to (and consuming) the first newline and returns them as the header line. */
    private static String readHeaderLine(DataInputStream in) throws VectorsException, IOException {
        StringBuilder sb = new StringBuilder();
        int b = in.read();
        while (b != '\n') {
            if (b < 0)
                throw new VectorsException("Unexpected end of file while reading the header");
            sb.append((char) b);
            b = in.read();
        }
        return sb.toString();
    }

    /** Parses a header field and checks that it is usable as an array length. */
    private static int toArrayLength(String field) throws VectorsException {
        long value = Long.parseLong(field);
        if (value < 0 || value > Integer.MAX_VALUE)
            throw new VectorsException("Header value out of range: " + field);
        return (int) value;
    }

    /**
     * Reads one vocabulary word: everything up to (and consuming) the next space. Newline bytes are dropped, which
     * skips the newline that terminates the previous word's vector.
     */
    private static String readWord(DataInputStream in) throws VectorsException, IOException {
        StringBuilder sb = new StringBuilder();
        int b = in.read();
        while (b != ' ') {
            if (b < 0)
                throw new VectorsException("Unexpected end of file while reading a word");
            // NOTE: each byte becomes one char (ISO-8859-1 style), so multi-byte UTF-8 words are not decoded.
            if (b != '\n')
                sb.append((char) b);
            b = in.read();
        }
        return sb.toString();
    }

    /** Reads {@code size} little-endian 4-byte floats. */
    private static float[] readVector(DataInputStream in, int size) throws IOException {
        float[] m = new float[size];
        for (int i = 0; i < size; i++) {
            // readInt() assembles the four bytes big-endian; reversing them yields the little-endian bit pattern
            // written by the C program (assumes it ran on a little-endian machine such as x86/amd64).
            m[i] = Float.intBitsToFloat(Integer.reverseBytes(in.readInt()));
        }
        return m;
    }

    /** Scales the vector in place to unit Euclidean length; an all-zero vector is left untouched. */
    private static void normalize(float[] m) {
        double len = 0;
        for (int i = 0; i < m.length; i++)
            len += m[i] * m[i];
        // the norm is rounded to single precision before dividing
        len = (float) Math.sqrt(len);
        if (len == 0)
            return; // dividing would turn every component into NaN
        for (int i = 0; i < m.length; i++)
            m[i] /= len;
    }
}