/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package act.installer.pubchem;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.FlushOptions;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksIterator;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileAttribute;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
/**
 * Tests for {@link PubchemTTLMerger} that run against a real RocksDB instance created in a per-test temporary
 * directory.  The fixtures are the files found in the {@code rdf_synonyms} resource directory next to this class.
 */
public class PubchemTTLMergerTest {
  private static final Logger LOGGER = LogManager.getFormatterLogger(PubchemTTLMergerTest.class);

  private static final String TEST_RDF_PATH = "rdf_synonyms";

  // Hash keys the assertions below look up in the hash-keyed column families.
  private static final String MD51 = "MD5_00000000000000000000000000000001";
  private static final String MD52 = "MD5_00000000000000000000000000000002";
  private static final String MD53 = "MD5_00000000000000000000000000000003";

  private Path tempDirPath;

  /**
   * Creates a fresh temporary directory where the RocksDB for the current test will live.
   */
  @Before
  public void setUp() throws Exception {
    tempDirPath = Files.createTempDirectory(PubchemTTLMergerTest.class.getName());
  }

  /**
   * Recursively removes the temporary directory created by {@link #setUp()}.
   *
   * With help from:
   * http://stackoverflow.com/questions/779519/delete-directories-recursively-in-java/27917071#27917071
   */
  @After
  public void tearDown() throws Exception {
    // TODO: use mocks instead maybe? But testing RocksDB helps too...
    if (tempDirPath == null) {
      // setUp failed before the directory was created, so there is nothing to clean up.
      return;
    }

    Files.walkFileTree(tempDirPath, new FileVisitor<Path>() {
      @Override
      public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException {
        return FileVisitResult.CONTINUE;
      }

      @Override
      public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
        Files.delete(file);
        return FileVisitResult.CONTINUE;
      }

      @Override
      public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
        throw exc;
      }

      @Override
      public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
        if (exc != null) {
          // Surface the listing failure itself instead of a follow-on error from deleting a non-empty directory.
          throw exc;
        }
        Files.delete(dir);
        return FileVisitResult.CONTINUE;
      }
    });

    // One last check to make sure the top level directory is removed.
    Files.deleteIfExists(tempDirPath);
  }

  /**
   * Lists the fixture files under the {@code rdf_synonyms} resource directory, sorted by path.
   *
   * Alas, we can't swap this with a JAR-safe stream as we must list the files.
   *
   * @return the sorted fixture files
   */
  private List<File> loadTestFiles() {
    assertNotNull(String.format("Resource directory %s should be on the classpath", TEST_RDF_PATH),
        this.getClass().getResource(TEST_RDF_PATH));
    File testSynonymFileDir = new File(this.getClass().getResource(TEST_RDF_PATH).getFile());

    File[] children = testSynonymFileDir.listFiles();
    assertNotNull(String.format("Resource %s should be a listable directory", testSynonymFileDir), children);

    List<File> testFiles = Arrays.asList(children);
    Collections.sort(testFiles);
    return testFiles;
  }

  /**
   * Reads the raw bytes stored under {@code key} in the given column family, failing the test with a descriptive
   * message (rather than a bare NullPointerException) when the handle or the value is missing.
   *
   * @param dbAndHandles the DB and its column family handles
   * @param columnFamily the column family to read from
   * @param key the key to look up; encoded as UTF-8 (every key used by these tests is plain ASCII)
   * @return the stored bytes, never null
   */
  private static byte[] readRawValue(
      Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles,
      PubchemTTLMerger.COLUMN_FAMILIES columnFamily,
      String key
  ) throws Exception {
    ColumnFamilyHandle cfh = dbAndHandles.getRight().get(columnFamily);
    assertNotNull(String.format("Column family handle for %s should exist", columnFamily), cfh);

    byte[] valBytes = dbAndHandles.getLeft().get(cfh, key.getBytes(StandardCharsets.UTF_8));
    assertNotNull(String.format("Key %s should have a value in column family %s", key, columnFamily), valBytes);
    return valBytes;
  }

  /**
   * Fetches and Java-deserializes the list of strings stored under {@code key} in {@code columnFamily}.
   *
   * @param dbAndHandles the DB and its column family handles
   * @param columnFamily the column family to read from
   * @param key the key to look up
   * @return the deserialized list of strings
   */
  public List<String> getValForKey(
      Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles,
      PubchemTTLMerger.COLUMN_FAMILIES columnFamily,
      String key
  ) throws Exception {
    byte[] valBytes = readRawValue(dbAndHandles, columnFamily, key);
    try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(valBytes))) {
      // The element type cannot be checked at runtime; callers only use this on families they expect to hold
      // lists of strings, and a wrong element type would surface in the assertions that follow.
      @SuppressWarnings("unchecked")
      List<String> values = (List<String>) ois.readObject();
      return values;
    }
  }

  /**
   * Fetches and Java-deserializes the {@link PubchemSynonyms} stored under {@code key} in the CID_TO_SYNONYMS
   * column family.
   *
   * @param dbAndHandles the DB and its column family handles
   * @param key the key to look up
   * @return the deserialized synonyms object
   */
  public PubchemSynonyms getPCSyonymsForKey(
      Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles,
      String key
  ) throws Exception {
    byte[] valBytes = readRawValue(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_SYNONYMS, key);
    try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(valBytes))) {
      return (PubchemSynonyms) ois.readObject();
    }
  }

  /**
   * Builds the index from the fixture files, checks the three intermediate column families, then merges and checks
   * the resulting CID-to-synonyms entries.
   */
  @Test
  public void testIndexConstructionAndMerge() throws Exception {
    List<File> testFiles = loadTestFiles();

    PubchemTTLMerger merger = new PubchemTTLMerger();
    Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles =
        PubchemTTLMerger.createNewRocksDB(tempDirPath.toFile());

    // Close the DB even when an assertion fails so that tearDown never deletes files under an open handle.
    try {
      Set<String> expectedValues, actualValues;

      merger.buildIndex(dbAndHandles, testFiles);
      dbAndHandles.getLeft().flush(new FlushOptions());

      // Check the hash-to-synonym index.
      expectedValues = new HashSet<>(Arrays.asList("test1"));
      actualValues =
          new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_SYNONYMS, MD51));
      assertEquals("First hash-to-synonyms returns expected value(s)", expectedValues, actualValues);

      expectedValues = new HashSet<>(Arrays.asList("test2"));
      actualValues =
          new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_SYNONYMS, MD52));
      assertEquals("Second hash-to-synonyms returns expected value(s)", expectedValues, actualValues);

      expectedValues = new HashSet<>(Arrays.asList("TEST3", "test3"));
      actualValues =
          new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_SYNONYMS, MD53));
      assertEquals("Third hash-to-synonyms returns expected value(s)", expectedValues, actualValues);

      // Now check the MESH index.
      expectedValues = new HashSet<>(Arrays.asList("M01"));
      actualValues = new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_MESH, MD51));
      assertEquals("First hash-to-MeSH returns expected value(s)", expectedValues, actualValues);

      expectedValues = new HashSet<>(Arrays.asList("M02"));
      actualValues = new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_MESH, MD52));
      assertEquals("Second hash-to-MeSH returns expected value(s)", expectedValues, actualValues);

      // Finally (before merging) check the CID to hash index
      expectedValues = new HashSet<>(Arrays.asList(MD51));
      actualValues =
          new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_HASHES, "CID01"));
      assertEquals("First CID-to-hashes returns expected value(s)", expectedValues, actualValues);

      expectedValues = new HashSet<>(Arrays.asList(MD52, MD53));
      actualValues =
          new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_HASHES, "CID02"));
      assertEquals("Second CID-to-hashes returns expected value(s)", expectedValues, actualValues);

      expectedValues = new HashSet<>(Arrays.asList(MD53));
      actualValues =
          new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_HASHES, "CID03"));
      assertEquals("Third CID-to-hashes returns expected value(s)", expectedValues, actualValues);

      merger.merge(dbAndHandles);

      PubchemSynonyms expectedSynonyms, actualSynonyms;

      expectedSynonyms = new PubchemSynonyms("CID01");
      expectedSynonyms.addMeSHId("M01");
      expectedSynonyms.addSynonym(PubchemTTLMerger.PC_SYNONYM_TYPES.TRIVIAL_NAME, "test1");
      actualSynonyms = getPCSyonymsForKey(dbAndHandles, "CID01");
      assertEquals("First CID-to-synonyms entry has expected PubchemSynonyms value",
          expectedSynonyms, actualSynonyms);

      expectedSynonyms = new PubchemSynonyms("CID02");
      expectedSynonyms.addMeSHId("M02");
      expectedSynonyms.addSynonyms(PubchemTTLMerger.PC_SYNONYM_TYPES.UNKNOWN, new HashSet<>(Arrays.asList("test2")));
      expectedSynonyms.addSynonyms(PubchemTTLMerger.PC_SYNONYM_TYPES.INTL_NONPROPRIETARY_NAME,
          new HashSet<>(Arrays.asList("test3", "TEST3")));
      actualSynonyms = getPCSyonymsForKey(dbAndHandles, "CID02");
      assertEquals("Second CID-to-synonyms entry has expected PubchemSynonyms value",
          expectedSynonyms, actualSynonyms);

      expectedSynonyms = new PubchemSynonyms("CID03");
      expectedSynonyms.addSynonyms(PubchemTTLMerger.PC_SYNONYM_TYPES.INTL_NONPROPRIETARY_NAME,
          new HashSet<>(Arrays.asList("test3", "TEST3")));
      actualSynonyms = getPCSyonymsForKey(dbAndHandles, "CID03");
      assertEquals("Third CID-to-synonyms entry has expected PubchemSynonyms value",
          expectedSynonyms, actualSynonyms);

      dbAndHandles.getLeft().flush(new FlushOptions());
    } finally {
      dbAndHandles.getLeft().close();
    }
  }

  /**
   * Builds and merges the index, closes the DB, reopens it, and checks that the CID-to-synonyms column family
   * holds exactly the expected entries.
   */
  @Test
  public void testValuesAreReadableAfterIndexIsClosedAndReopened() throws Exception {
    List<File> testFiles = loadTestFiles();

    PubchemTTLMerger merger = new PubchemTTLMerger();
    Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles =
        PubchemTTLMerger.createNewRocksDB(tempDirPath.toFile());
    try {
      merger.buildIndex(dbAndHandles, testFiles);
      merger.merge(dbAndHandles);
    } finally {
      dbAndHandles.getLeft().close();
    }

    Map<String, PubchemSynonyms> expected = new HashMap<String, PubchemSynonyms>() {{
      put("CID01", new PubchemSynonyms("CID01", new HashMap<PubchemTTLMerger.PC_SYNONYM_TYPES, Set<String>>() {{
        put(PubchemTTLMerger.PC_SYNONYM_TYPES.TRIVIAL_NAME, new HashSet<>(Arrays.asList("test1")));
      }}, Arrays.asList("M01")));
      put("CID02", new PubchemSynonyms("CID02", new HashMap<PubchemTTLMerger.PC_SYNONYM_TYPES, Set<String>>() {{
        put(PubchemTTLMerger.PC_SYNONYM_TYPES.UNKNOWN, new HashSet<>(Arrays.asList("test2")));
        put(PubchemTTLMerger.PC_SYNONYM_TYPES.INTL_NONPROPRIETARY_NAME, new HashSet<>(Arrays.asList("TEST3", "test3")));
      }}, Arrays.asList("M02")));
      put("CID03", new PubchemSynonyms("CID03", new HashMap<PubchemTTLMerger.PC_SYNONYM_TYPES, Set<String>>() {{
        put(PubchemTTLMerger.PC_SYNONYM_TYPES.INTL_NONPROPRIETARY_NAME, new HashSet<>(Arrays.asList("TEST3", "test3")));
      }}, Collections.emptyList()));
    }};

    Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> reopened =
        merger.openExistingRocksDB(tempDirPath.toFile());
    try {
      // Track which keys the iterator produced; without this an empty or partial index would pass vacuously.
      Set<String> seenKeys = new HashSet<>();

      // NOTE(review): the iterator is not explicitly released here because the release method differs between
      // rocksdbjni versions (dispose() vs close()) -- confirm against the version this project depends on.
      RocksIterator iterator = reopened.getLeft().newIterator(
          reopened.getRight().get(PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_SYNONYMS)
      );

      for (iterator.seekToFirst(); iterator.isValid(); iterator.next()) {
        assertNotNull("Iterator key should never be null", iterator.key());
        assertNotNull("Iterator value should never be null", iterator.value());

        String key = new String(iterator.key(), StandardCharsets.UTF_8);
        PubchemSynonyms synonyms;
        try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(iterator.value()))) {
          // Values read from this column family are expected to be serialized PubchemSynonyms objects.
          synonyms = (PubchemSynonyms) ois.readObject();
        }
        assertEquals(String.format("Pubchem synonyms for %s match expected", key), expected.get(key), synonyms);
        seenKeys.add(key);
      }

      assertEquals("Reopened index contains exactly the expected CIDs", expected.keySet(), seenKeys);
    } finally {
      reopened.getLeft().close();
    }
  }
}