/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.sars;
import act.server.MongoDB;
import act.shared.Seq;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
* A sequence grouper that iterates over the seq DB and groups only seq entries that have exactly same sequence.
*/
public class SeqDBReactionGrouper {
private static final Logger LOGGER = LogManager.getFormatterLogger(SeqDBReactionGrouper.class);
private static final String OPTION_DB = "db";
private static final String OPTION_OUTPUT_PATH = "o";
private static final String OPTION_LIMIT = "l";
private static final String OPTION_HELP = "h";
public static final String HELP_MESSAGE =
"This class is used to generate reaction groups by scanning the seq DB for sequences that point to multiple " +
"reactions. Options are supplied to indicate how far into the DB to scan, which DB to use, and where to " +
"write the output.";
public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
add(Option.builder(OPTION_DB)
.argName("db name")
.desc("The name of the mongo DB to use.")
.hasArg()
.longOpt("db-name")
.required(true)
);
add(Option.builder(OPTION_OUTPUT_PATH)
.argName("output file path")
.desc("The absolute path to the file to which to write the json file of the reaction group corpus.")
.hasArg()
.longOpt("output-file-path")
.required(true)
);
add(Option.builder(OPTION_LIMIT)
.argName("seq limit")
.desc("The maximum number of seq entries to process. This is useful because running on the entire DB can " +
"require a lot of time and memory.")
.hasArg()
.longOpt("seq-limit")
.type(Integer.class)
);
add(Option.builder(OPTION_HELP)
.argName("help")
.desc("Prints this help message.")
.longOpt("help")
);
}};
public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();
static {
HELP_FORMATTER.setWidth(100);
}
private static final String LOCAL_HOST = "localhost";
private static final Integer MONGO_PORT = 27017;
private static final Integer DEFAULT_LIMIT_INFINITY = Integer.MAX_VALUE;
public static void main(String[] args) throws Exception {
// Build command line parser.
Options opts = new Options();
for (Option.Builder b : OPTION_BUILDERS) {
opts.addOption(b.build());
}
CommandLine cl = null;
try {
CommandLineParser parser = new DefaultParser();
cl = parser.parse(opts, args);
} catch (ParseException e) {
LOGGER.error("Argument parsing failed: %s", e.getMessage());
HELP_FORMATTER.printHelp(SeqDBReactionGrouper.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
System.exit(1);
}
// Print help.
if (cl.hasOption(OPTION_HELP)) {
HELP_FORMATTER.printHelp(SeqDBReactionGrouper.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
return;
}
// Handle arguments
String mongoDBName = cl.getOptionValue(OPTION_DB);
MongoDB mongoDB = new MongoDB(LOCAL_HOST, MONGO_PORT, mongoDBName);
File outputFile = new File(cl.getOptionValue(OPTION_OUTPUT_PATH));
if (outputFile.isDirectory() || outputFile.exists()) {
LOGGER.error("Supplied output file is a directory or already exists.");
System.exit(1);
}
outputFile.createNewFile();
Integer limit = DEFAULT_LIMIT_INFINITY;
if (cl.hasOption(OPTION_LIMIT)) {
limit = Integer.parseInt(cl.getOptionValue(OPTION_LIMIT));
}
LOGGER.info("Only processing first %d entries in Seq DB.", limit);
SeqDBReactionGrouper enzymeGrouper = new SeqDBReactionGrouper(mongoDB.getSeqIterator(), mongoDBName, limit);
LOGGER.info("Scanning seq db for reactions with same seq.");
ReactionGroupCorpus groupCorpus = enzymeGrouper.getReactionGroupCorpus();
LOGGER.info("Writing output to file.");
groupCorpus.printToJsonFile(outputFile);
LOGGER.info("Complete!");
}
final Integer limit;
final String dbName;
final Iterator<Seq> seqIterator;
/**
* Builds a SeqDBReactionGrouper for the given Seq entries.
*
* @param seqIterator The Seq entries to group.
* @param limit The maximum number of entries to process. This can be used to limit memory and time.
*/
public SeqDBReactionGrouper(Iterator<Seq> seqIterator, String dbName, Integer limit) {
this.seqIterator = seqIterator;
this.dbName = dbName;
this.limit = limit;
}
/**
* Builds a SeqDBReactionGrouper for the given Seq entries.
*
* @param seqIterator The Seq entries to group.
*/
public SeqDBReactionGrouper(Iterator<Seq> seqIterator, String dbName) {
this(seqIterator, dbName, DEFAULT_LIMIT_INFINITY);
}
/**
* Returns the collection of SeqGroups produced by running this grouper on the Seq entries from the DB.
* TODO: Implement this in a way that doesn't store the whole map in memory at the same time.
*
* @return The collection of produced SeqGroups.
*/
public ReactionGroupCorpus getReactionGroupCorpus() {
Map<String, ReactionGroup> sequenceToReactionGroupMap = getSequenceToReactionGroupMap(seqIterator);
LOGGER.info("Done getting seq group map, found %d distinct SeqGroups.", sequenceToReactionGroupMap.size());
return new ReactionGroupCorpus(sequenceToReactionGroupMap.values());
}
/**
* Iterates over seq entries and builds a map from unique sequences to ReactionGroup objects that list their
* corresponding Seq entry ids and Reaction ids.
*
* @param seqIterator
* @return
*/
private Map<String, ReactionGroup> getSequenceToReactionGroupMap(Iterator<Seq> seqIterator) {
Map<String, ReactionGroup> sequenceToReactionGroupMap = new HashMap<>();
Integer counter = 0;
while (seqIterator.hasNext()) {
if (counter >= limit) {
break;
}
if (counter % 1000 == 0) {
LOGGER.info("Processed %d seq entries so far", counter);
}
Seq seq = seqIterator.next();
String sequence = seq.getSequence();
ReactionGroup group = sequenceToReactionGroupMap.get(sequence);
if (group == null) {
group = new ReactionGroup("SEQ_ID_" + Integer.toString(seq.getUUID()), dbName);
sequenceToReactionGroupMap.put(sequence, group);
}
for (Long reactionId : seq.getReactionsCatalyzed()) {
group.addReactionId(reactionId);
}
counter++;
}
return sequenceToReactionGroupMap;
}
}