/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package act.installer.brenda; import org.apache.commons.lang3.StringUtils; import org.rocksdb.ColumnFamilyDescriptor; import org.rocksdb.ColumnFamilyHandle; import org.rocksdb.FlushOptions; import org.rocksdb.Options; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.nio.charset.Charset; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; public class BrendaSupportingEntries { public static final Charset UTF8 = Charset.forName("utf-8"); public static class Ligand { static final String QUERY = "select LigandID, Ligand, inchi, molfile, groupID from ligand_molfiles"; protected Integer ligandId; // The BRENDA identifier for a particular ligand (references one row in the table). protected String ligand; // The textual name of the ligand. protected String inchi; // The InChI for this ligand. protected String molfile; // The contents of the associated MOL file. Not sure what this means yet. protected Integer groupId; // Links this ligand to synonyms (i.e. different ligand name, same InChI). public Ligand(Integer ligandId, String ligand, String inchi, String molfile, Integer groupId) { this.ligandId = ligandId; this.ligand = ligand; this.inchi = inchi; this.molfile = molfile; this.groupId = groupId; } public Integer getLigandId() { return ligandId; } public String getLigand() { return ligand; } public String getInchi() { return inchi; } public String getMolfile() { return molfile; } public Integer getGroupId() { return groupId; } public static Ligand fromResultSet(ResultSet resultSet) throws SQLException { Integer groupId = resultSet.getInt(5); if (groupId.equals(0)) { groupId = null; // A zero group id means no group. } byte[] molfile = resultSet.getBytes(4); return new Ligand( resultSet.getInt(1), resultSet.getString(2), resultSet.getString(3), molfile != null ? new String(molfile, UTF8) : null, groupId ); } } public static class Organism { static final String QUERY = StringUtils.join(new String[]{ "select", // Equivalent of `select distinct` but w/ multiple columns. " Organism,", " Organism_no,", " Organism_ID", "from Organism", "group by Organism, Organism_no, Organism_ID" }, " "); protected String organism; protected String organismNo; protected Integer organismId; public Organism(String organism, String organismNo, Integer organismId) { this.organism = organism; this.organismNo = organismNo; this.organismId = organismId; } public String getOrganism() { return organism; } public String getOrganismNo() { return organismNo; } public Integer getOrganismId() { return organismId; } public static Organism fromResultSet(ResultSet resultSet) throws SQLException { return new Organism( resultSet.getString(1), resultSet.getString(2), resultSet.getInt(3) ); } } public static class Sequence { /* BRENDA's sequence to protein linkage is weak: on the BRENDA site, clicking on the "AA Sequence" link of an * EC Number's page takes you to the complete list of all sequences for that EC Number without reference to the * literature or reaction to which it relates. The BRENDA DB has *some* precise linkage between sequences and * reactions (Substrates_Products in BRENDA parlance) via the `ProtID2SwissProtID` table. This table links * Substrates_Products's is to UniProt ids, which can then be used to look up entries in the `Sequences` page. * Unfortunately, `ProtID2SwissProtID` is very sparsely populated, linking only about 40k reactions to their * sequences by id. To capitalize on the remaining BRENDA sequences (of which there are several millions), we * fall back to linking sequences by EC Number and organism when id-based sequences association is unavailable. * This provides a much weaker guarantee about the catalysis of the enzymes linked to a reaction in the installer * DB, but gives us many more options to work with when designing complete DNA constructs for a complete pathway. */ private static final String QUERY_EXACT = StringUtils.join(new String[]{ "select", " s.ID,", " s.First_Accession_Code,", " s.Entry_Name,", " s.Source,", " s.Sequence", "from Sequence s", "join ProtID2SwissProtID p2s on p2s.UniProt_acc = s.First_Accession_Code", "join Substrates_Products_ps spps on spps.protein_id = p2s.protein_id", "where spps.EC_Number = ?", " and spps.Substrates = ?", " and spps.Products = ?", " and spps.Literature_Substrates = ?", " and spps.Organism_Substrates = ?", " and p2s.first_no = 1" // Necessary to ensure lookup in sequence DB. }, " "); private static final String QUERY_VAGUE = StringUtils.join(new String[]{ "select", " s.ID,", " s.First_Accession_Code,", " s.Entry_Name,", " s.Source,", " s.Sequence", "from Sequence s", "join Substrates_Products sp on sp.EC_Number = s.EC_Number and sp.Organism_Substrates = s.Organism", "where sp.EC_Number = ?", " and sp.Substrates = ?", " and sp.Products = ?", " and sp.Literature_Substrates = ?", " and sp.Organism_Substrates = ?", }, " "); /** * Prepare a query for the sequence based on a reaction, with all required arguments in the query bound to values. * This method lives here to keep the binding next to the statement, which makes sense given the complexity of the * query. * * @param conn A connection on which to prepare the statement. * @param rxnEntry The reaction entry whose sequences to search for. * @return A prepared statement that will fetch results of query. * @throws SQLException */ public static PreparedStatement prepareStatementVague( Connection conn, BrendaRxnEntry rxnEntry, boolean requireExactMatch) throws SQLException { PreparedStatement stmt = conn.prepareStatement(requireExactMatch ? QUERY_EXACT : QUERY_VAGUE); stmt.setString(1, rxnEntry.ecNumber); stmt.setString(2, rxnEntry.substrates); stmt.setString(3, rxnEntry.products); stmt.setString(4, rxnEntry.literatureSubstrates); stmt.setString(5, rxnEntry.organismSubstrates); return stmt; } public static Sequence sequenceFromResultSet(ResultSet resultSet, boolean fromExactMatch) throws SQLException { return new Sequence(resultSet.getInt(1), resultSet.getString(2), resultSet.getString(3), resultSet.getString(4), resultSet.getString(5), fromExactMatch); } protected Integer brendaId; protected String firstAccessionCode; protected String entryName; protected String source; protected String sequence; protected Boolean fromExactMatch; public Sequence(Integer brendaId, String firstAccessionCode, String entryName, String source, String sequence, Boolean fromExactMatch) { this.brendaId = brendaId; this.firstAccessionCode = firstAccessionCode; this.entryName = entryName; this.source = source; this.sequence = sequence; this.fromExactMatch = fromExactMatch; } public Integer getBrendaId() { return brendaId; } public String getFirstAccessionCode() { return firstAccessionCode; } public String getEntryName() { return entryName; } public String getSource() { return source; } public String getSequence() { return sequence; } public Boolean getFromExactMatch() { return fromExactMatch; } } public static class RecommendNameTable { static final String ALL_QUERY = "select EC_Number, Recommended_Name, GO_number from Recommended_Name"; protected Map<String, RecommendName> table = new HashMap<>(7000); // Current Recommend_Name table has ~6700 entries. protected RecommendNameTable() { } protected void addRecommendedName(RecommendName rn) { table.put(rn.ecNumber, rn); } public RecommendName getRecommendedNameForECNumber(String ecNumber) { return this.table.get(ecNumber); } public static RecommendNameTable fetchRecommendedNameTable(Connection conn) throws SQLException { try ( PreparedStatement stmt = conn.prepareStatement(ALL_QUERY); ResultSet resultSet = stmt.executeQuery(); ) { RecommendNameTable table = new RecommendNameTable(); while (resultSet.next()) { String ecNumber = resultSet.getString(1); String recommendedName = resultSet.getString(2); String goNumber = resultSet.getString(3); if ("0".equals(goNumber)) { // The Recommend_Name table allows null values, but missing GO identifiers are specified with '0'. Weird. goNumber = null; } table.addRecommendedName(new RecommendName(ecNumber, recommendedName, goNumber)); } System.out.println("Retrieved " + table.table.size() + " recommended name entries"); return table; } } } public static class RecommendName { protected String ecNumber; protected String recommendedName; protected String goNumber; public RecommendName(String ecNumber, String recommendedName, String goNumber) { this.ecNumber = ecNumber; this.recommendedName = recommendedName; this.goNumber = goNumber; } public String getEcNumber() { return ecNumber; } public String getRecommendedName() { return recommendedName; } public String getGoNumber() { return goNumber; } } /* ****************************** * Result classes for BRENDA data types linked to Substrates_Products entries. * TODO: move these to their own class files if they seem unwieldy. */ /* BRENDA likes to pack lists of literature ids into comma-delimited lists within a single DB field. We need to * ensure that a given row actually references a literature id, so we split the list and search for an exact match * on one of the fields. Crude but effective. */ public static String[] idsFieldToArray(String ids) { return ids.split(", *"); } public static boolean findIdInList(String idList, String idToFind) { String[] ids = idsFieldToArray(idList); for (int i = 0; i < ids.length; i++) { if (idToFind.equals(ids[i])) { return true; } } return false; } public static class PosttranslationalModification implements FromBrendaDB<PosttranslationalModification> { static final String QUERY = StringUtils.join(new String[]{ "SELECT", " Posttranslational_Modification,", " Commentary,", " Literature", "FROM Posttranslational_Modification", "WHERE EC_Number = ? and Literature like ? and Organism = ?"}, " "); public static final String ALL_QUERY = StringUtils.join(new String[]{ "SELECT", " Posttranslational_Modification,", " Commentary,", " Literature,", " EC_Number,", " Organism", "FROM Posttranslational_Modification"}, " "); public static final String COLUMN_FAMILY_NAME = "PosttranslationalModification"; protected static final PosttranslationalModification INSTANCE = new PosttranslationalModification(); protected String commentary; protected String posttranslationalModification; private PosttranslationalModification() {} public PosttranslationalModification(String posttranslationalModification, String commentary) { this.commentary = commentary; this.posttranslationalModification = posttranslationalModification; } public String getCommentary() { return commentary; } public String getPosttranslationalModification() { return posttranslationalModification; } @Override public String getQuery() { return QUERY; } @Override public PosttranslationalModification fromResultSet(ResultSet resultSet) throws SQLException { return new PosttranslationalModification(resultSet.getString(1), resultSet.getString(2)); } @Override public int getLiteratureField() { return 3; } @Override public String getAllQuery() { return ALL_QUERY; } @Override public int getLiteratureFieldForAllQuery() { return 3; } @Override public int getECNumberFieldForAllQuery() { return 4; } @Override public int getOrganismFieldForAllQuery() { return 5; } @Override public String getColumnFamilyName() { return COLUMN_FAMILY_NAME; } } public static class Cloned implements FromBrendaDB<Cloned> { static final String QUERY = StringUtils.join(new String[] { "SELECT", " Commentary,", " Literature", "FROM Cloned", "WHERE EC_Number = ? and Literature like ? and Organism = ?" }, " "); static final String ALL_QUERY = StringUtils.join(new String[] { "SELECT", " Commentary,", " Literature,", " EC_Number,", " Organism", "FROM Cloned" }, " "); static final String COLUMN_FAMILY_NAME = "Cloned"; protected static final Cloned INSTANCE = new Cloned(); protected String commentary; private Cloned() {} public Cloned(String commentary) { this.commentary = commentary; } public String getCommentary() { return commentary; } @Override public String getQuery() { return QUERY; } @Override public Cloned fromResultSet(ResultSet resultSet) throws SQLException { return new Cloned(resultSet.getString(1)); } @Override public int getLiteratureField() { return 2; } @Override public String getAllQuery() { return ALL_QUERY; } @Override public int getLiteratureFieldForAllQuery() { return 2; } @Override public int getECNumberFieldForAllQuery() { return 3; } @Override public int getOrganismFieldForAllQuery() { return 4; } @Override public String getColumnFamilyName() { return COLUMN_FAMILY_NAME; } } // Classes representing data linked to the Substrates_Products and Natural_Substrates_Products tables. public static class KMValue implements FromBrendaDB<KMValue> { static final String QUERY = "select KM_Value, Commentary, Literature from KM_Value " + "where EC_Number = ? and Literature like ? and Organism = ?"; static final String ALL_QUERY = "select KM_Value, Commentary, Literature, EC_Number, Organism from KM_Value"; static final String COLUMN_FAMILY_NAME = "KM_Value"; protected static final KMValue INSTANCE = new KMValue(); private static final long serialVersionUID = 4014251635935240023L; protected Double kmValue; protected String commentary; private KMValue() { } public KMValue(Double kmValue, String commentary) { this.kmValue = kmValue; this.commentary = commentary; } public Double getKmValue() { return kmValue; } public String getCommentary() { return commentary; } @Override public String getQuery() { return QUERY; } @Override public KMValue fromResultSet(ResultSet resultSet) throws SQLException { return new KMValue(resultSet.getDouble(1), resultSet.getString(2)); } @Override public int getLiteratureField() { return 3; } @Override public String getAllQuery() { return ALL_QUERY; } @Override public int getLiteratureFieldForAllQuery() { return 3; } @Override public int getECNumberFieldForAllQuery() { return 4; } @Override public int getOrganismFieldForAllQuery() { return 5; } @Override public String getColumnFamilyName() { return COLUMN_FAMILY_NAME; } } public static class SpecificActivity implements FromBrendaDB<SpecificActivity> { public static final String QUERY = "select Specific_Activity, Commentary, Literature from Specific_Activity " + "where EC_Number = ? and Literature like ? and Organism = ?"; public static final String ALL_QUERY = "select Specific_Activity, Commentary, Literature, EC_Number, Organism from Specific_Activity"; public static final String COLUMN_FAMILY_NAME = "Specific_Activity"; protected static final SpecificActivity INSTANCE = new SpecificActivity(); private static final long serialVersionUID = 4017492820129501771L; protected Double specificActivity; protected String commentary; protected SpecificActivity() { } public SpecificActivity(Double specificActivity, String commentary) { this.specificActivity = specificActivity; this.commentary = commentary; } public Double getSpecificActivity() { return specificActivity; } public String getCommentary() { return commentary; } @Override public String getQuery() { return QUERY; } @Override public SpecificActivity fromResultSet(ResultSet resultSet) throws SQLException { return new SpecificActivity(resultSet.getDouble(1), resultSet.getString(2)); } @Override public int getLiteratureField() { return 3; } @Override public String getAllQuery() { return ALL_QUERY; } @Override public int getLiteratureFieldForAllQuery() { return 3; } @Override public int getECNumberFieldForAllQuery() { return 4; } @Override public int getOrganismFieldForAllQuery() { return 5; } @Override public String getColumnFamilyName() { return COLUMN_FAMILY_NAME; } } public static class OrganismCommentary implements FromBrendaDB<OrganismCommentary> { static final String QUERY = "select Commentary, Literature from Organism " + "where EC_Number = ? and Literature like ? and Organism = ?"; static final String ALL_QUERY = "select Commentary, Literature, EC_Number, Organism from Organism"; static final String COLUMN_FAMILY_NAME = "Organism_Commentary"; protected static final OrganismCommentary INSTANCE = new OrganismCommentary(); private static final long serialVersionUID = -2085699584700496115L; protected String commentary; protected OrganismCommentary() { } public OrganismCommentary(String commentary) { this.commentary = commentary; } public String getCommentary() { return commentary; } @Override public String getQuery() { return QUERY; } @Override public OrganismCommentary fromResultSet(ResultSet resultSet) throws SQLException { return new OrganismCommentary(resultSet.getString(1)); } @Override public int getLiteratureField() { return 2; } @Override public String getAllQuery() { return ALL_QUERY; } @Override public int getLiteratureFieldForAllQuery() { return 2; } @Override public int getECNumberFieldForAllQuery() { return 3; } @Override public int getOrganismFieldForAllQuery() { return 4; } @Override public String getColumnFamilyName() { return COLUMN_FAMILY_NAME; } } public static class GeneralInformation implements FromBrendaDB<GeneralInformation> { static final String QUERY = "select General_Information, Commentary, Literature from General_Information " + "where EC_Number = ? and Literature like ? and Organism = ?"; static final String ALL_QUERY = "select General_Information, Commentary, Literature, EC_Number, Organism from General_Information"; static final String COLUMN_FAMILY_NAME = "General_Information"; public static final GeneralInformation INSTANCE = new GeneralInformation(); private static final long serialVersionUID = 1157007471920187876L; protected String generalInformation; protected String commentary; protected GeneralInformation() { } public GeneralInformation(String generalInformation, String commentary) { this.generalInformation = generalInformation; this.commentary = commentary; } public String getGeneralInformation() { return generalInformation; } public String getCommentary() { return commentary; } @Override public String getQuery() { return QUERY; } @Override public GeneralInformation fromResultSet(ResultSet resultSet) throws SQLException { return new GeneralInformation(resultSet.getString(1), resultSet.getString(2)); } @Override public int getLiteratureField() { return 3; } @Override public String getAllQuery() { return ALL_QUERY; } @Override public int getLiteratureFieldForAllQuery() { return 3; } @Override public int getECNumberFieldForAllQuery() { return 4; } @Override public int getOrganismFieldForAllQuery() { return 5; } @Override public String getColumnFamilyName() { return COLUMN_FAMILY_NAME; } } public static class Cofactor implements FromBrendaDB<Cofactor> { static final String QUERY = "select Cofactor, Commentary, Literature from Cofactor " + "where EC_Number = ? and Literature like ? and Organism = ?"; static final String ALL_QUERY = "select Cofactor, Commentary, Literature, EC_Number, Organism from Cofactor"; static final String COLUMN_FAMILY_NAME = "Cofactor"; protected static final Cofactor INSTANCE = new Cofactor(); private static final long serialVersionUID = -520309053864923030L; protected String cofactor; protected String commentary; protected Cofactor() { } public Cofactor(String cofactor, String commentary) { this.cofactor = cofactor; this.commentary = commentary; } public String getCofactor() { return cofactor; } public String getCommentary() { return commentary; } @Override public String getQuery() { return QUERY; } @Override public Cofactor fromResultSet(ResultSet resultSet) throws SQLException { return new Cofactor(resultSet.getString(1), resultSet.getString(2)); } @Override public int getLiteratureField() { return 3; } @Override public String getAllQuery() { return ALL_QUERY; } @Override public int getLiteratureFieldForAllQuery() { return 3; } @Override public int getECNumberFieldForAllQuery() { return 4; } @Override public int getOrganismFieldForAllQuery() { return 5; } @Override public String getColumnFamilyName() { return COLUMN_FAMILY_NAME; } } public static class Inhibitors implements FromBrendaDB<Inhibitors> { static final String QUERY = "select Inhibitors, Commentary, Literature from Inhibitors " + "where EC_Number = ? and Literature like ? and Organism = ?"; static final String ALL_QUERY = "select Inhibitors, Commentary, Literature, EC_Number, Organism from Inhibitors"; static final String COLUMN_FAMILY_NAME = "Inhibitors"; protected static final Inhibitors INSTANCE = new Inhibitors(); private static final long serialVersionUID = -6978439225811251470L; protected String inhibitors; protected String commentary; protected Inhibitors() { } public Inhibitors(String inhibitors, String commentary) { this.inhibitors = inhibitors; this.commentary = commentary; } public String getInhibitors() { return inhibitors; } public String getCommentary() { return commentary; } @Override public String getQuery() { return QUERY; } @Override public Inhibitors fromResultSet(ResultSet resultSet) throws SQLException { return new Inhibitors(resultSet.getString(1), resultSet.getString(2)); } @Override public int getLiteratureField() { return 3; } @Override public String getAllQuery() { return ALL_QUERY; } @Override public int getLiteratureFieldForAllQuery() { return 3; } @Override public int getECNumberFieldForAllQuery() { return 4; } @Override public int getOrganismFieldForAllQuery() { return 5; } @Override public String getColumnFamilyName() { return COLUMN_FAMILY_NAME; } } public static class ActivatingCompound implements FromBrendaDB<ActivatingCompound> { static final String QUERY = "select Activating_Compound, Commentary, Literature from Activating_Compound " + "where EC_Number = ? and Literature like ? and Organism = ?"; static final String ALL_QUERY = "select Activating_Compound, Commentary, Literature, EC_Number, Organism from Activating_Compound"; static final String COLUMN_FAMILY_NAME = "Activating_Compound"; protected static final ActivatingCompound INSTANCE = new ActivatingCompound(); private static final long serialVersionUID = -3326349402641253159L; protected String activatingCompound; protected String commentary; protected ActivatingCompound() { } public ActivatingCompound(String activatingCompound, String commentary) { this.activatingCompound = activatingCompound; this.commentary = commentary; } public String getActivatingCompound() { return activatingCompound; } public String getCommentary() { return commentary; } @Override public String getQuery() { return QUERY; } @Override public ActivatingCompound fromResultSet(ResultSet resultSet) throws SQLException { return new ActivatingCompound(resultSet.getString(1), resultSet.getString(2)); } @Override public int getLiteratureField() { return 3; } @Override public String getAllQuery() { return ALL_QUERY; } @Override public int getLiteratureFieldForAllQuery() { return 3; } @Override public int getECNumberFieldForAllQuery() { return 4; } @Override public int getOrganismFieldForAllQuery() { return 5; } @Override public String getColumnFamilyName() { return COLUMN_FAMILY_NAME; } } public static class KCatKMValue implements FromBrendaDB<KCatKMValue> { static final String QUERY = "select KCat_KM_Value, Substrate, Commentary, Literature from KCat_KM_Value " + "where EC_Number = ? and Literature like ? and Organism = ?"; static final String ALL_QUERY = "select KCat_KM_Value, Substrate, Commentary, Literature, EC_Number, Organism from KCat_KM_Value"; static final String COLUMN_FAMILY_NAME = "KCat_KM_Value"; protected static final KCatKMValue INSTANCE = new KCatKMValue(); private static final long serialVersionUID = -9166694433469659408L; protected Double kcatKMValue; protected String substrate; protected String commentary; protected KCatKMValue() { } public KCatKMValue(Double kcatKMValue, String substrate, String commentary) { this.kcatKMValue = kcatKMValue; this.substrate = substrate; this.commentary = commentary; } public Double getKcatKMValue() { return kcatKMValue; } public String getSubstrate() { return substrate; } public String getCommentary() { return commentary; } @Override public String getQuery() { return QUERY; } @Override public KCatKMValue fromResultSet(ResultSet resultSet) throws SQLException { return new KCatKMValue(resultSet.getDouble(1), resultSet.getString(2), resultSet.getString(3)); } @Override public int getLiteratureField() { return 4; } @Override public String getAllQuery() { return ALL_QUERY; } @Override public int getLiteratureFieldForAllQuery() { return 4; } @Override public int getECNumberFieldForAllQuery() { return 5; } @Override public int getOrganismFieldForAllQuery() { return 6; } @Override public String getColumnFamilyName() { return COLUMN_FAMILY_NAME; } } public static class Expression implements FromBrendaDB<Expression> { static final String QUERY = "select Expression, Commentary, Literature from Expression " + "where EC_Number = ? and Literature like ? and Organism = ?"; static final String ALL_QUERY = "select Expression, Commentary, Literature, EC_Number, Organism from Expression"; static final String COLUMN_FAMILY_NAME = "Expression"; protected static final Expression INSTANCE = new Expression(); private static final long serialVersionUID = 49938329620615767L; protected String expression; // TODO: would an enum be better? protected String commentary; protected Expression() { } public Expression(String expression, String commentary) { this.expression = expression; this.commentary = commentary; } public String getExpression() { return expression; } public String getCommentary() { return commentary; } @Override public String getQuery() { return QUERY; } @Override public BrendaSupportingEntries.Expression fromResultSet(ResultSet resultSet) throws SQLException { return new Expression(resultSet.getString(1), resultSet.getString(2)); } @Override public int getLiteratureField() { return 3; } @Override public String getAllQuery() { return ALL_QUERY; } @Override public int getLiteratureFieldForAllQuery() { return 3; } @Override public int getECNumberFieldForAllQuery() { return 4; } @Override public int getOrganismFieldForAllQuery() { return 5; } @Override public String getColumnFamilyName() { return COLUMN_FAMILY_NAME; } } public static class Subunits implements FromBrendaDB<Subunits> { static final String QUERY = "select Subunits, Commentary, Literature from Subunits " + "where EC_Number = ? and Literature like ? and Organism = ?"; static final String ALL_QUERY = "select Subunits, Commentary, Literature, EC_Number, Organism from Subunits"; static final String COLUMN_FAMILY_NAME = "Subunits"; protected static final Subunits INSTANCE = new Subunits(); private static final long serialVersionUID = -3048744632876384569L; protected String subunits; protected String commentary; protected Subunits() { } public Subunits(String subunits, String commentary) { this.subunits = subunits; this.commentary = commentary; } public String getSubunits() { return subunits; } public String getCommentary() { return commentary; } @Override public String getQuery() { return QUERY; } @Override public Subunits fromResultSet(ResultSet resultSet) throws SQLException { return new Subunits(resultSet.getString(1), resultSet.getString(2)); } @Override public int getLiteratureField() { return 3; } @Override public String getAllQuery() { return ALL_QUERY; } @Override public int getLiteratureFieldForAllQuery() { return 3; } @Override public int getECNumberFieldForAllQuery() { return 4; } @Override public int getOrganismFieldForAllQuery() { return 5; } @Override public String getColumnFamilyName() { return COLUMN_FAMILY_NAME; } } public static class Localization implements FromBrendaDB<Localization> { static final String QUERY = "select Localization, Commentary, Literature from Localization " + "where EC_Number = ? and Literature like ? and Organism = ?"; static final String ALL_QUERY = "select Localization, Commentary, Literature, EC_Number, Organism from Localization"; static final String COLUMN_FAMILY_NAME = "Localization"; protected static final Localization INSTANCE = new Localization(); private static final long serialVersionUID = -7765943450126973420L; protected String localization; protected String commentary; protected Localization() { } public Localization(String localization, String commentary) { this.localization = localization; this.commentary = commentary; } public String getLocalization() { return localization; } public String getCommentary() { return commentary; } @Override public String getQuery() { return QUERY; } @Override public Localization fromResultSet(ResultSet resultSet) throws SQLException { return new Localization(resultSet.getString(1), resultSet.getString(2)); } @Override public int getLiteratureField() { return 3; } @Override public String getAllQuery() { return ALL_QUERY; } @Override public int getLiteratureFieldForAllQuery() { return 3; } @Override public int getECNumberFieldForAllQuery() { return 4; } @Override public int getOrganismFieldForAllQuery() { return 5; } @Override public String getColumnFamilyName() { return COLUMN_FAMILY_NAME; } } /* **************************************** * Build an uber-index of all supporting BRENDA tables for faster integration of data into documents. */ public static class IndexWriter<T extends FromBrendaDB<T>> { protected ColumnFamilyHandle columnFamilyHandle; protected RocksDB db; protected T instance; /** * Construct an index writer for a particular FromBrendaDB class. * @param columnFamilyHandle A handle to the column family where this class's data should be written. * @param db A db to which to write data. * @param instance An instance of the class that is to be written. Used for serialization/deserialization. */ public IndexWriter(ColumnFamilyHandle columnFamilyHandle, RocksDB db, T instance) { this.columnFamilyHandle = columnFamilyHandle; this.db = db; this.instance = instance; } /** * Write a K/V pair to the index. * @param key The key to write. * @param val The object to add to a Serializable-serialized list of values. * @throws IOException * @throws ClassNotFoundException * @throws RocksDBException */ private void addObjectToIndex(byte[] key, T val) throws IOException, ClassNotFoundException, RocksDBException { StringBuffer buffer = new StringBuffer(); List<T> storedObjects = null; if (db.keyMayExist(columnFamilyHandle, key, buffer)) { byte[] existingVal = db.get(columnFamilyHandle, key); if (existingVal != null) { ObjectInputStream oi = new ObjectInputStream(new ByteArrayInputStream(existingVal)); storedObjects = (ArrayList<T>) oi.readObject(); // Note: assumes all values are lists. } else { storedObjects = new ArrayList<>(1); } } else { storedObjects = new ArrayList<>(1); } storedObjects.add(val); ByteArrayOutputStream bos = new ByteArrayOutputStream(); ObjectOutputStream oo = new ObjectOutputStream(bos); oo.writeObject(storedObjects); oo.flush(); try { db.put(columnFamilyHandle, key, bos.toByteArray()); } catch (RocksDBException e) { // TODO: do better; throw new IOException(e); } oo.close(); bos.close(); } /** * Make a list of keys + an object for the next result set row. Multiple keys can exist for a given object * thanks to BRENDA's nested literature references. * @param resultSet The result set from which to read a row. * @throws IOException * @throws ClassNotFoundException * @throws RocksDBException * @throws SQLException */ private void createKeysAndWrite(ResultSet resultSet) throws IOException, ClassNotFoundException, RocksDBException, SQLException { String ecNumber = resultSet.getString(instance.getECNumberFieldForAllQuery()); String literatureList = resultSet.getString(instance.getLiteratureFieldForAllQuery()); String organism = resultSet.getString(instance.getOrganismFieldForAllQuery()); T val = instance.fromResultSet(resultSet); List<String> literatureIds = Arrays.asList(idsFieldToArray(literatureList)); for (String literatureId : literatureIds) { byte[] key = makeKey(ecNumber, literatureId, organism); addObjectToIndex(key, val); } } /** * Produce a key for on-disk index lookup using an entry's ec number, liteature reference, and organism name. * @param ecNumber An EC number like 1.1.1.1. * @param literature A literature reference like 123456. * @param organism An organism name like Saccharomyces cerevisiae. * @return A key that can be used to read/write an entry in the on-disk index. */ public static byte[] makeKey(String ecNumber, String literature, String organism) { // Given that ecNumber is always \d.\d.\d.\d and literature is always a number, this should be safe. return StringUtils.join(new String[]{ecNumber, literature, organism}, "::").getBytes(UTF8); } /** * Retrieve all instances of the class defined in the constructor from the BRENDA DB and write the into the index. * @param conn A connection to the BRENDA DB from which to retrieve data. * @throws IOException * @throws ClassNotFoundException * @throws RocksDBException * @throws SQLException */ public void run(Connection conn) throws IOException, ClassNotFoundException, RocksDBException, SQLException { PreparedStatement stmt = conn.prepareStatement(this.instance.getAllQuery()); // "All" query should have no arguments. ResultSet results = stmt.executeQuery(); int processed = 0; while (results.next()) { createKeysAndWrite(results); processed++; if (processed % 10000 == 0) { System.out.println(" processed " + processed + " entries"); } } } } // TODO: this seems suspicious. Is there a better way to organize the various supporting classes? /** * Get a list of empty instances of all classes representing BRENDA MySQL table entries that are keyed on EC number, * literature reference, and organism name. (This corresponds to classes that implement FromBrendaDB.) * * These classes are made available in a collectionway so that callers can iterate over BRENDA tables that contain * reaction-supporting data. * * @return A list of instances of classes that implement FromBrendaDB (may not be exhaustive, but covers BRENDA * tables). */ public static List<FromBrendaDB> allFromBrendaDBInstances() { // Note: update this list whenever new FromBrendaDB classes are created. List<FromBrendaDB> instances = new ArrayList<>(); instances.add(KMValue.INSTANCE); instances.add(Cloned.INSTANCE); instances.add(PosttranslationalModification.INSTANCE); instances.add(SpecificActivity.INSTANCE); instances.add(OrganismCommentary.INSTANCE); instances.add(GeneralInformation.INSTANCE); instances.add(Cofactor.INSTANCE); instances.add(Inhibitors.INSTANCE); instances.add(ActivatingCompound.INSTANCE); instances.add(KCatKMValue.INSTANCE); instances.add(Expression.INSTANCE); instances.add(Subunits.INSTANCE); instances.add(Localization.INSTANCE); return instances; } /** * Create an on-disk index of reaction-supporting data from BRENDA using RocksDB. This DB will contain a number * of `column families` (i.e. per-table namespaces). Each FromBrendaDB instance * * All index rows are keyed on EC number, literature reference (individually, lists are split during construction), * and organism names. Values are serialized (via Serializable) lists of FromBrendaDB objects; each column family * contains one type of object. * * Creating this data on an in-office MBP with a BRENDA MySQL instance running in EC2 takes just a couple of minutes. * Looking up supporting data locally vs. running MySQL queries for every data type * every reaction results in a * ~30x speedup of reaction processing. * * @param pathToIndex The local path where the index should be built. This will become a directory containing * RocksDB files. * @param conn A connection to the BRENDA MySQL DB (`brenda` database) from which data will be read. * @throws IOException * @throws ClassNotFoundException * @throws RocksDBException * @throws SQLException */ public void constructOnDiskBRENDAIndex(File pathToIndex, Connection conn) throws IOException, ClassNotFoundException, RocksDBException, SQLException { if (pathToIndex.exists()) { System.out.println("Index already exists, not recreating."); return; } RocksDB db = null; // Not auto-closable. List<? extends FromBrendaDB> instances = allFromBrendaDBInstances(); try { Options options = new Options().setCreateIfMissing(true); System.out.println("Opening index at " + pathToIndex.getAbsolutePath()); db = RocksDB.open(options, pathToIndex.getAbsolutePath()); for (FromBrendaDB instance : instances) { System.out.println("Writing index for " + instance.getColumnFamilyName()); ColumnFamilyHandle cfh = db.createColumnFamily(new ColumnFamilyDescriptor(instance.getColumnFamilyName().getBytes(UTF8))); IndexWriter writer = new IndexWriter(cfh, db, instance); writer.run(conn); db.flush(new FlushOptions()); } } finally { if (db != null) { db.close(); } } } }