/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.l2expansion;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
/**
* Represents the set of all predictions made by an L2 expansion run
*/
public class L2PredictionCorpus implements Serializable {
  private static final long serialVersionUID = 2502953593841339815L;

  /* TODO: add tests of serialization for this class and its neighbors. We should ensure we can successfully consume
   * prediction results w/ any class of SAR so we don't get stuck with results that are locked up in unreadable JSON. */

  // NOTE: 'transient' removed — static fields are never part of Java serialization, so the modifier was a no-op.
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  static {
    OBJECT_MAPPER.enable(SerializationFeature.INDENT_OUTPUT);
  }

  // The canonical list of predictions; this is the only state that is persisted to JSON.
  @JsonProperty("corpus")
  private List<L2Prediction> corpus;

  // Derived lookup index (prediction id -> prediction). Rebuilt by populateIdToPredictionMap(); excluded from JSON.
  @JsonIgnore
  private Map<Integer, L2Prediction> idToPredictionMap;

  public L2PredictionCorpus() {
    this(new ArrayList<>());
  }

  public L2PredictionCorpus(List<L2Prediction> corpus) {
    this.corpus = corpus;
    populateIdToPredictionMap();
  }

  public List<L2Prediction> getCorpus() {
    return corpus;
  }

  /**
   * Read a prediction corpus from file, and populate its prediction map.
   *
   * @param corpusFile The file to read.
   * @return The L2PredictionCorpus.
   * @throws IOException If the file cannot be read or parsed.
   */
  public static L2PredictionCorpus readPredictionsFromJsonFile(File corpusFile) throws IOException {
    return L2PredictionCorpus.OBJECT_MAPPER.readValue(corpusFile, L2PredictionCorpus.class).populateIdToPredictionMap();
  }

  /**
   * Gets the prediction with the given ID from the prediction corpus.
   *
   * @param id The prediction ID to find.
   * @return The corresponding prediction.
   * @throws IllegalArgumentException if the id is not present in the prediction map.
   */
  @JsonIgnore
  public L2Prediction getPredictionFromId(Integer id) {
    L2Prediction result = idToPredictionMap.get(id);
    if (result != null) {
      return result;
    }
    throw new IllegalArgumentException("Id " + id + " is not present in corpus, or the id->prediction map has not " +
        "been repopulated since it was added.");
  }

  /**
   * Add all prediction IDs to idToPredictionMap. This is called on construction and load from file, but should be
   * re-called after predictions are added to the corpus, if getPredictionFromId is to be used.
   *
   * @return This corpus, to allow chaining (see readPredictionsFromJsonFile).
   */
  public L2PredictionCorpus populateIdToPredictionMap() {
    this.idToPredictionMap = new HashMap<>();
    corpus.forEach(prediction -> idToPredictionMap.put(prediction.getId(), prediction));
    return this;
  }

  /**
   * Applies a transformation to this L2PredictionCorpus, which acts on each prediction in the corpus.
   * Returns a new corpus with the results; this corpus is not modified. Each prediction is deep-copied
   * before the transformation is applied, so the transformation may mutate its argument safely.
   *
   * @param transformation The transformation to apply.
   * @return The transformed corpus.
   * @throws IOException Declared for interface compatibility with existing callers.
   */
  public L2PredictionCorpus applyTransformation(Function<L2Prediction, L2Prediction> transformation) throws IOException {
    L2PredictionCorpus newCorpus = new L2PredictionCorpus();
    for (L2Prediction prediction : getCorpus()) {
      newCorpus.addPrediction(transformation.apply(new L2Prediction(prediction)));
    }
    return newCorpus;
  }

  /**
   * Applies a filter to this L2PredictionCorpus, returning a new corpus with only those predictions that pass
   * the filter. This corpus is not modified, and the predictions in the new corpus are deep copies of the
   * predictions in the original corpus.
   *
   * @param filter The filter to be used.
   * @return The filtered corpus.
   * @throws IOException Declared for interface compatibility with existing callers.
   */
  public L2PredictionCorpus applyFilter(Predicate<L2Prediction> filter) throws IOException {
    L2PredictionCorpus newCorpus = new L2PredictionCorpus();
    for (L2Prediction prediction : getCorpus()) {
      L2Prediction predictionCopy = new L2Prediction(prediction);
      if (filter.test(predictionCopy)) {
        newCorpus.addPrediction(predictionCopy);
      }
    }
    return newCorpus;
  }

  /**
   * Applies a function to each prediction in the corpus, and splits the corpus into one corpus for each distinct
   * output value of that function. For example, this could be used to split a corpus into one corpus per distinct
   * projector used to build it.
   *
   * @param classifier The function to apply to each element.
   * @return A map from values produced by the classifier, to the corresponding PredictionCorpus.
   * @throws IOException Declared for interface compatibility with existing callers.
   */
  public <T> Map<T, L2PredictionCorpus> splitCorpus(Function<L2Prediction, T> classifier) throws IOException {
    Map<T, L2PredictionCorpus> corpusMap = new HashMap<>();
    for (L2Prediction prediction : getCorpus()) {
      L2Prediction predictionCopy = new L2Prediction(prediction);
      // computeIfAbsent replaces the manual get/null-check/put pattern of the original.
      corpusMap.computeIfAbsent(classifier.apply(predictionCopy), key -> new L2PredictionCorpus())
          .addPrediction(predictionCopy);
    }
    return corpusMap;
  }

  /**
   * Write the L2PredictionCorpus to file in json format, encoded as UTF-8.
   *
   * @param outputFile Where to write the file.
   * @throws IOException If the file cannot be written.
   */
  public void writePredictionsToJsonFile(File outputFile) throws IOException {
    // UTF-8 is specified explicitly; FileWriter would silently use the platform default charset.
    try (BufferedWriter predictionWriter = Files.newBufferedWriter(outputFile.toPath(), StandardCharsets.UTF_8)) {
      OBJECT_MAPPER.writeValue(predictionWriter, this);
    }
  }

  /**
   * Write the L2PredictionCorpus to file in list of inchis format, one inchi per line, encoded as UTF-8.
   *
   * @param outputFile Where to write the file.
   * @throws IOException If the file cannot be written.
   */
  public void writePredictionsAsInchiList(File outputFile) throws IOException {
    // UTF-8 is specified explicitly; FileWriter would silently use the platform default charset.
    try (BufferedWriter predictionWriter = Files.newBufferedWriter(outputFile.toPath(), StandardCharsets.UTF_8)) {
      Set<String> productInchis = this.getUniqueProductInchis();
      for (String inchi : productInchis) {
        predictionWriter.write(inchi);
        predictionWriter.newLine();
      }
    }
  }

  /**
   * Get the set of all distinct product inchis from corpus.
   */
  @JsonIgnore
  public Set<String> getUniqueProductInchis() {
    Set<String> inchiSet = new HashSet<>();
    for (L2Prediction prediction : getCorpus()) {
      inchiSet.addAll(prediction.getProductInchis());
    }
    return inchiSet;
  }

  /**
   * Get the set of all distinct substrate inchis from corpus.
   */
  @JsonIgnore
  public Set<String> getUniqueSubstrateInchis() {
    Set<String> inchiSet = new HashSet<>();
    for (L2Prediction prediction : getCorpus()) {
      inchiSet.addAll(prediction.getSubstrateInchis());
    }
    return inchiSet;
  }

  /**
   * Append a prediction to the corpus. Note: getPredictionFromId will not see the new prediction until
   * populateIdToPredictionMap() is re-called.
   */
  public void addPrediction(L2Prediction prediction) {
    corpus.add(prediction);
  }

  /**
   * Append all given predictions to the corpus; routed through addPrediction so the same caveat applies.
   */
  public void addAll(Collection<L2Prediction> predictions) {
    predictions.forEach(this::addPrediction);
  }

  /**
   * Returns the count of the predictions matching some given predicate.
   *
   * @param predicate The predicate.
   * @return The number of matching predictions.
   */
  public int countPredictions(Predicate<L2Prediction> predicate) {
    return Math.toIntExact(corpus.stream().filter(predicate).count());
  }
}