/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program. If not, see <http://www.gnu.org/licenses/>.  *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.l2expansion;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;

/**
 * Represents the set of all predictions made by an L2 expansion run.
 */
public class L2PredictionCorpus implements Serializable {
  private static final long serialVersionUID = 2502953593841339815L;

  /* TODO: add tests of serialization for this class and its neighbors. We should ensure we can successfully consume
   * prediction results w/ any class of SAR so we don't get stuck with results that are locked up in unreadable JSON. */

  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
  static {
    OBJECT_MAPPER.enable(SerializationFeature.INDENT_OUTPUT);
  }

  @JsonProperty("corpus")
  private List<L2Prediction> corpus;

  @JsonIgnore
  private Map<Integer, L2Prediction> idToPredictionMap;

  public L2PredictionCorpus() {
    this(new ArrayList<>());
  }

  public L2PredictionCorpus(List<L2Prediction> corpus) {
    this.corpus = corpus;
    populateIdToPredictionMap();
  }

  public List<L2Prediction> getCorpus() {
    return corpus;
  }

  /**
   * Read a prediction corpus from file, and populate its prediction map.
   *
   * @param corpusFile The file to read.
   * @return The L2PredictionCorpus.
   * @throws IOException If the file cannot be read or parsed.
   */
  public static L2PredictionCorpus readPredictionsFromJsonFile(File corpusFile) throws IOException {
    return L2PredictionCorpus.OBJECT_MAPPER.readValue(corpusFile, L2PredictionCorpus.class)
        .populateIdToPredictionMap();
  }
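
  /*
   * Usage sketch (illustrative only; the file path below is hypothetical): round-trip a corpus
   * that was previously written with writePredictionsToJsonFile, then look a prediction up by
   * id. The reader repopulates the id->prediction map, so the lookup is safe immediately.
   *
   *   L2PredictionCorpus corpus =
   *       L2PredictionCorpus.readPredictionsFromJsonFile(new File("predictions.json"));
   *   L2Prediction prediction = corpus.getPredictionFromId(0);
   */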
  /**
   * Gets the prediction with the given ID from the prediction corpus.
   *
   * @param id The prediction ID to find.
   * @return The corresponding prediction.
   * @throws IllegalArgumentException If the id is not present in the prediction map.
   */
  @JsonIgnore
  public L2Prediction getPredictionFromId(Integer id) {
    L2Prediction result = idToPredictionMap.get(id);
    if (result != null) {
      return result;
    }
    throw new IllegalArgumentException("Id " + id + " is not present in corpus, or the id->prediction map has not " +
        "been repopulated since it was added.");
  }

  /**
   * Adds all prediction IDs to idToPredictionMap. This is called on construction and on load from file, but should be
   * re-called after predictions are added to the corpus if getPredictionFromId is to be used.
   *
   * @return This corpus, for chaining.
   */
  public L2PredictionCorpus populateIdToPredictionMap() {
    this.idToPredictionMap = new HashMap<>();
    corpus.forEach(prediction -> idToPredictionMap.put(prediction.getId(), prediction));
    return this;
  }

  /**
   * Applies a transformation to this L2PredictionCorpus, which acts on each prediction in the corpus.
   * Returns a new corpus with the results; this corpus is not modified.
   *
   * @param transformation The transformation to apply.
   * @return The transformed corpus.
   */
  public L2PredictionCorpus applyTransformation(Function<L2Prediction, L2Prediction> transformation)
      throws IOException {
    L2PredictionCorpus newCorpus = new L2PredictionCorpus();

    for (L2Prediction prediction : getCorpus()) {
      newCorpus.addPrediction(transformation.apply(new L2Prediction(prediction)));
    }

    return newCorpus;
  }

  /**
   * Applies a filter to this L2PredictionCorpus, returning a new corpus with only those predictions that pass
   * the filter. This corpus is not modified, and the predictions in the new corpus are deep copies of the
   * predictions in the original corpus.
   *
   * @param filter The filter to be used.
   * @return The filtered corpus.
   */
  public L2PredictionCorpus applyFilter(Predicate<L2Prediction> filter) throws IOException {
    L2PredictionCorpus newCorpus = new L2PredictionCorpus();

    for (L2Prediction prediction : getCorpus()) {
      L2Prediction predictionCopy = new L2Prediction(prediction);
      if (filter.test(predictionCopy)) {
        newCorpus.addPrediction(predictionCopy);
      }
    }

    return newCorpus;
  }

  /**
   * Applies a function to each prediction in the corpus, and splits the corpus into one corpus for each distinct
   * output value of that function. For example, this could be used to split a corpus into one corpus per distinct
   * projector used to build it.
   *
   * @param classifier The function to apply to each element.
   * @return A map from values produced by the classifier to the corresponding L2PredictionCorpus.
   */
  public <T> Map<T, L2PredictionCorpus> splitCorpus(Function<L2Prediction, T> classifier) throws IOException {
    Map<T, L2PredictionCorpus> corpusMap = new HashMap<>();

    for (L2Prediction prediction : getCorpus()) {
      L2Prediction predictionCopy = new L2Prediction(prediction);
      T key = classifier.apply(predictionCopy);
      corpusMap.computeIfAbsent(key, k -> new L2PredictionCorpus()).addPrediction(predictionCopy);
    }

    return corpusMap;
  }

  /**
   * Write the L2PredictionCorpus to file in json format.
   *
   * @param outputFile Where to write the file.
   * @throws IOException If the file cannot be written.
   */
  public void writePredictionsToJsonFile(File outputFile) throws IOException {
    try (BufferedWriter predictionWriter = new BufferedWriter(new FileWriter(outputFile))) {
      OBJECT_MAPPER.writeValue(predictionWriter, this);
    }
  }
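
  /*
   * Usage sketch for the functional helpers (illustrative). The classifier below keys on the
   * prediction id, which is known to exist on L2Prediction; a real caller might instead key on
   * the projector name, as the splitCorpus javadoc suggests.
   *
   *   L2PredictionCorpus evens = corpus.applyFilter(p -> p.getId() % 2 == 0);
   *   Map<Integer, L2PredictionCorpus> byParity = corpus.splitCorpus(p -> p.getId() % 2);
   */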
  /**
   * Write the L2PredictionCorpus to file as a list of product inchis, one per line.
   *
   * @param outputFile Where to write the file.
   * @throws IOException If the file cannot be written.
   */
  public void writePredictionsAsInchiList(File outputFile) throws IOException {
    try (BufferedWriter predictionWriter = new BufferedWriter(new FileWriter(outputFile))) {
      Set<String> productInchis = this.getUniqueProductInchis();
      for (String inchi : productInchis) {
        predictionWriter.write(inchi);
        predictionWriter.newLine();
      }
    }
  }

  /**
   * Get the set of all distinct product inchis from the corpus.
   */
  @JsonIgnore
  public Set<String> getUniqueProductInchis() {
    Set<String> inchiSet = new HashSet<>();
    for (L2Prediction prediction : getCorpus()) {
      inchiSet.addAll(prediction.getProductInchis());
    }
    return inchiSet;
  }

  /**
   * Get the set of all distinct substrate inchis from the corpus.
   */
  @JsonIgnore
  public Set<String> getUniqueSubstrateInchis() {
    Set<String> inchiSet = new HashSet<>();
    for (L2Prediction prediction : getCorpus()) {
      inchiSet.addAll(prediction.getSubstrateInchis());
    }
    return inchiSet;
  }

  public void addPrediction(L2Prediction prediction) {
    corpus.add(prediction);
  }

  public void addAll(Collection<L2Prediction> predictions) {
    for (L2Prediction prediction : predictions) {
      addPrediction(prediction);
    }
  }

  /**
   * Returns the count of the predictions matching some given predicate.
   *
   * @param predicate The predicate.
   * @return The number of matching predictions.
   */
  public int countPredictions(Predicate<L2Prediction> predicate) {
    int count = 0;
    for (L2Prediction prediction : corpus) {
      if (predicate.test(prediction)) {
        count++;
      }
    }
    return count;
  }
}
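
/*
 * End-to-end usage sketch (illustrative; only methods defined above are assumed, and the
 * output path is hypothetical):
 *
 *   L2PredictionCorpus corpus = new L2PredictionCorpus();
 *   corpus.addAll(predictions);          // predictions: Collection<L2Prediction>
 *   corpus.populateIdToPredictionMap();  // required before calling getPredictionFromId
 *   int withProducts = corpus.countPredictions(p -> !p.getProductInchis().isEmpty());
 *   corpus.writePredictionsAsInchiList(new File("products.inchis"));
 */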