package com.cyc.tool.distributedrepresentations;
/*
* #%L
* DistributedRepresentations
* %%
* Copyright (C) 2015 Cycorp, Inc
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import org.mapdb.DBMaker;
/**
 * The word2vec space produced by BioASQ by training on pubmed.
 *
 * <p>
 * Vectors are kept in a MapDB file-backed tree map. When that map is empty it is filled from two
 * parallel text files (one label per line in the label file, one whitespace-separated vector per
 * line in the vector file); when it already has content, that content is reused as-is.
 *
 * <p>
 * See:
 * http://bioasq.org/news/bioasq-releases-continuous-space-word-vectors-obtained-applying-word2vec-pubmed-abstracts
 */
public class BiologyW2VSpace extends Word2VecSpace {

  private static final Logger LOGGER = Logger.getLogger(BiologyW2VSpace.class.getName());

  /** Directory holding the BioASQ word2vec text files. */
  private static final String FILE_BASE
          = "/cyc/projects/kbTaxonomy/ConceptFinder/BioASQ/word2vecTools/";
  /** One term label per line; line n labels line n of {@link #W2V_VECTOR_FILE}. */
  private static final String W2V_LABEL_FILE = FILE_BASE + "types.txt";
  /** One whitespace-separated vector per line. */
  private static final String W2V_VECTOR_FILE = FILE_BASE + "vectors.txt";

  /** Number of entries between intermediate commits / progress messages. */
  private static final int COMMIT_INTERVAL = 100000;

  /** Lazily created by {@link #get()}. */
  private static BiologyW2VSpace singleton;

  /**
   * Opens the backing database and, if its vector map is empty, loads it from the label and
   * vector text files.
   *
   * @throws IOException if the text files cannot be read, if the vector file has fewer lines
   * than the label file, or if a vector's length differs from the established size
   */
  private BiologyW2VSpace() throws IOException {
    db = DBMaker.newFileDB(new File(Config.getW2vDBFile()))
            .closeOnJvmShutdown()
            .make();
    vectors = db.getTreeMap(getWord2VecVectorsMapName());

    // NOTE(review): any non-empty map is treated as fully loaded. Because the load below
    // commits in batches, an interrupted load may leave a partial map that is then reused;
    // the assertion on a known term is the only guard, and only when assertions are enabled.
    if (!vectors.isEmpty()) {
      assert (getVector("anti-mib-1") != null);
      // Take the dimensionality from a term expected to be present in the stored space.
      setSize(getVector("hgh-b").length);
      return;
    }

    int i = 0;
    try (BufferedReader labelReader = new BufferedReader(new FileReader(W2V_LABEL_FILE));
            BufferedReader vectorReader = new BufferedReader(new FileReader(W2V_VECTOR_FILE))) {
      for (String label; (label = labelReader.readLine()) != null;) {
        String vec = vectorReader.readLine();
        if (vec == null) {
          // The two files must stay in step; fail with context instead of an NPE.
          throw new IOException("Vector file " + W2V_VECTOR_FILE + " ended at line " + (i + 1)
                  + " but label file " + W2V_LABEL_FILE + " still has label '" + label + "'");
        }
        // trim() so that leading whitespace does not yield an empty first token.
        float[] d = normVector(
                Arrays.stream(vec.trim().split("\\s+"))
                        .map(Float::valueOf)
                        .collect(Collectors.toList()));
        if (getSize() != 0) {
          // Checked explicitly rather than with assert, which is off unless -ea is given.
          if (d.length != getSize()) {
            throw new IOException("Line without " + getSize() + " floats: line " + (i + 1)
                    + " ('" + label + "') has " + d.length);
          }
        } else {
          // The first vector read fixes the dimensionality of the space.
          setSize(d.length);
        }
        // The post-increment means the commit happens before entries 0, 100000, ... are
        // stored, and the printed counter is one-based.
        if (i++ % COMMIT_INTERVAL == 0) {
          db.commit();
          System.out.println(i + ": " + label);
        }
        vectors.put(label, d);
      }
    }
    System.out.println("Read " + i + " term positions for " + BiologyW2VSpace.class.getSimpleName());
    db.commit();
    db.compact();
  }

  /**
   * Factory get method for BiologyW2VSpace.
   *
   * <p>
   * The instance is created on the first call and returned on every later call. The lazy
   * initialisation is not synchronized.
   *
   * @return a BiologyW2VSpace
   * @throws RuntimeException wrapping the {@link IOException} if the space cannot be created
   */
  public static BiologyW2VSpace get() {
    if (singleton == null) {
      try {
        singleton = new BiologyW2VSpace();
      } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
        // Keep the IOException as the cause so callers see the original stack trace.
        throw new RuntimeException("Can't create the Biology W2VSpace object\n " + ex, ex);
      }
    }
    return singleton;
  }

  /**
   * Name of the tree map that stores this space's vectors in the database.
   *
   * @return the canonical name of this class
   */
  private static String getWord2VecVectorsMapName() {
    return BiologyW2VSpace.class.getCanonicalName();
  }
}