/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package act.installer.pubchem; import org.apache.commons.lang3.tuple.Pair; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.junit.After; import org.junit.Before; import org.junit.Test; import org.rocksdb.ColumnFamilyHandle; import org.rocksdb.FlushOptions; import org.rocksdb.RocksDB; import org.rocksdb.RocksIterator; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.ObjectInputStream; import java.nio.file.FileVisitResult; import java.nio.file.FileVisitor; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.attribute.BasicFileAttributes; import java.nio.file.attribute.FileAttribute; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; public class PubchemTTLMergerTest { private static final Logger LOGGER = LogManager.getFormatterLogger(PubchemTTLMergerTest.class); private static final String TEST_RDF_PATH = "rdf_synonyms"; private static final String THIS_DIR = "."; private static final String PARENT_DIR = ".."; private Path tempDirPath; @Before public void setUp() throws Exception { // Create a temporary directory where the RocksDB will live. tempDirPath = Files.createTempDirectory(PubchemTTLMergerTest.class.getName(), new FileAttribute[0]); } @After public void tearDown() throws Exception { // Clean up temp dir once the test is complete. TODO: use mocks instead maybe? But testing RocksDB helps too... /* With help from: * http://stackoverflow.com/questions/779519/delete-directories-recursively-in-java/27917071#27917071 */ Files.walkFileTree(tempDirPath, new FileVisitor<Path>() { @Override public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { return FileVisitResult.CONTINUE; } @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { // walkFileTree may ignore . and .., but I have never found it a /bad/ idea to check for these special names. if (!THIS_DIR.equals(file.toFile().getName()) && !PARENT_DIR.equals(file.toFile().getName())) { Files.delete(file); } return FileVisitResult.CONTINUE; } @Override public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException { throw exc; } @Override public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { Files.delete(dir); return FileVisitResult.CONTINUE; } }); // One last check to make sure the top level directory is removed. if (tempDirPath.toFile().exists()) { Files.delete(tempDirPath); } } public List<String> getValForKey( Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES columnFamily, String key ) throws Exception { RocksDB db = dbAndHandles.getLeft(); String columnFamilyName = columnFamily.getName(); ColumnFamilyHandle cfh = dbAndHandles.getRight().get(columnFamily); byte[] keyBytes = key.getBytes(); byte[] valBytes = db.get(cfh, keyBytes); try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(valBytes))) { return (List<String>) ois.readObject(); } } public PubchemSynonyms getPCSyonymsForKey( Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles, String key ) throws Exception { byte[] valBytes = dbAndHandles.getLeft().get( dbAndHandles.getRight().get(PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_SYNONYMS), key.getBytes()); try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(valBytes))) { return (PubchemSynonyms) ois.readObject(); } } private static String MD51 = "MD5_00000000000000000000000000000001"; private static String MD52 = "MD5_00000000000000000000000000000002"; private static String MD53 = "MD5_00000000000000000000000000000003"; @Test public void testIndexConstructionAndMerge() throws Exception { PubchemTTLMerger merger = new PubchemTTLMerger(); Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles = PubchemTTLMerger.createNewRocksDB(tempDirPath.toFile()); // Alas, we can't swap this with a JAR-safe stream as we must list the files. File testSynonymFileDir = new File(this.getClass().getResource(TEST_RDF_PATH).getFile()); List<File> testFiles = Arrays.asList(testSynonymFileDir.listFiles()); Collections.sort(testFiles); Set<String> expectedValues, actualValues; merger.buildIndex(dbAndHandles, testFiles); dbAndHandles.getLeft().flush(new FlushOptions()); // Check the hash-to-synonym index. expectedValues = new HashSet<>(Arrays.asList("test1")); actualValues = new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_SYNONYMS, MD51)); assertEquals("First hash-to-synonyms returns expected value(s)", expectedValues, actualValues); expectedValues = new HashSet<>(Arrays.asList("test2")); actualValues = new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_SYNONYMS, MD52)); assertEquals("Second hash-to-synonyms returns expected value(s)", expectedValues, actualValues); expectedValues = new HashSet<>(Arrays.asList("TEST3", "test3")); actualValues = new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_SYNONYMS, MD53)); assertEquals("Third hash-to-synonyms returns expected value(s)", expectedValues, actualValues); // Now check the MESH index. expectedValues = new HashSet<>(Arrays.asList("M01")); actualValues = new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_MESH, MD51)); assertEquals("First hash-to-synonyms returns expected value(s)", expectedValues, actualValues); expectedValues = new HashSet<>(Arrays.asList("M02")); actualValues = new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.HASH_TO_MESH, MD52)); assertEquals("Second hash-to-synonyms returns expected value(s)", expectedValues, actualValues); // Finally (before merging) check the CID to hash index expectedValues = new HashSet<>(Arrays.asList(MD51)); actualValues = new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_HASHES, "CID01")); assertEquals("First hash-to-synonyms returns expected value(s)", expectedValues, actualValues); expectedValues = new HashSet<>(Arrays.asList(MD52, MD53)); actualValues = new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_HASHES, "CID02")); assertEquals("Second hash-to-synonyms returns expected value(s)", expectedValues, actualValues); expectedValues = new HashSet<>(Arrays.asList(MD53)); actualValues = new HashSet<>(getValForKey(dbAndHandles, PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_HASHES, "CID03")); assertEquals("Third hash-to-synonyms returns expected value(s)", expectedValues, actualValues); merger.merge(dbAndHandles); PubchemSynonyms expectedSynonyms, actualSynonyms; expectedSynonyms = new PubchemSynonyms("CID01"); expectedSynonyms.addMeSHId("M01"); expectedSynonyms.addSynonym(PubchemTTLMerger.PC_SYNONYM_TYPES.TRIVIAL_NAME, "test1"); actualSynonyms = getPCSyonymsForKey(dbAndHandles, "CID01"); assertEquals("First CID-to-synonyms entry has expected PubchemSynonyms value", expectedSynonyms, actualSynonyms); expectedSynonyms = new PubchemSynonyms("CID02"); expectedSynonyms.addMeSHId("M02"); expectedSynonyms.addSynonyms(PubchemTTLMerger.PC_SYNONYM_TYPES.UNKNOWN, new HashSet<>(Arrays.asList("test2"))); expectedSynonyms.addSynonyms(PubchemTTLMerger.PC_SYNONYM_TYPES.INTL_NONPROPRIETARY_NAME, new HashSet<>(Arrays.asList("test3", "TEST3"))); actualSynonyms = getPCSyonymsForKey(dbAndHandles, "CID02"); assertEquals("Second CID-to-synonyms entry has expected PubchemSynonyms value", expectedSynonyms, actualSynonyms); expectedSynonyms = new PubchemSynonyms("CID03"); expectedSynonyms.addSynonyms(PubchemTTLMerger.PC_SYNONYM_TYPES.INTL_NONPROPRIETARY_NAME, new HashSet<>(Arrays.asList("test3", "TEST3"))); actualSynonyms = getPCSyonymsForKey(dbAndHandles, "CID03"); assertEquals("ThirdCID-to-synonyms entry has expected PubchemSynonyms value", expectedSynonyms, actualSynonyms); dbAndHandles.getLeft().flush(new FlushOptions()); dbAndHandles.getLeft().close(); } @Test public void testValuesAreReadableAfterIndexIsClosedAndReopened() throws Exception { PubchemTTLMerger merger = new PubchemTTLMerger(); Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles = PubchemTTLMerger.createNewRocksDB(tempDirPath.toFile()); // Alas, we can't swap this with a JAR-safe stream as we must list the files. File testSynonymFileDir = new File(this.getClass().getResource(TEST_RDF_PATH).getFile()); List<File> testFiles = Arrays.asList(testSynonymFileDir.listFiles()); Collections.sort(testFiles); merger.buildIndex(dbAndHandles, testFiles); merger.merge(dbAndHandles); dbAndHandles.getLeft().close(); dbAndHandles = merger.openExistingRocksDB(tempDirPath.toFile()); Map<String, PubchemSynonyms> expected = new HashMap<String, PubchemSynonyms>() {{ put("CID01", new PubchemSynonyms("CID01", new HashMap<PubchemTTLMerger.PC_SYNONYM_TYPES, Set<String>>() {{ put(PubchemTTLMerger.PC_SYNONYM_TYPES.TRIVIAL_NAME, new HashSet<>(Arrays.asList("test1"))); }}, Arrays.asList("M01"))); put("CID02", new PubchemSynonyms("CID02", new HashMap<PubchemTTLMerger.PC_SYNONYM_TYPES, Set<String>>() {{ put(PubchemTTLMerger.PC_SYNONYM_TYPES.UNKNOWN, new HashSet<>(Arrays.asList("test2"))); put(PubchemTTLMerger.PC_SYNONYM_TYPES.INTL_NONPROPRIETARY_NAME, new HashSet<>(Arrays.asList("TEST3", "test3"))); }}, Arrays.asList("M02"))); put("CID03", new PubchemSynonyms("CID03", new HashMap<PubchemTTLMerger.PC_SYNONYM_TYPES, Set<String>>() {{ put(PubchemTTLMerger.PC_SYNONYM_TYPES.INTL_NONPROPRIETARY_NAME, new HashSet<>(Arrays.asList("TEST3", "test3"))); }}, Collections.emptyList())); }}; RocksIterator iterator = dbAndHandles.getLeft().newIterator( dbAndHandles.getRight().get(PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_SYNONYMS) ); for (iterator.seekToFirst(); iterator.isValid(); iterator.next()) { assertNotNull("Iterator key should never be null", iterator.key()); assertNotNull("Iterator value should never be null", iterator.value()); String key = new String(iterator.key()); PubchemSynonyms synonyms; try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(iterator.value()))) { // We know all our values so far have been lists of strings, so this should be completely safe. synonyms = (PubchemSynonyms) ois.readObject(); } assertEquals(String.format("Pubchem synonyms for %s match expected", key), expected.get(key), synonyms); } } }