package org.genedb.db.loading.auxiliary;
import org.gmod.schema.feature.Polypeptide;
import org.gmod.schema.feature.SignalPeptide;
import org.gmod.schema.mapped.CvTerm;
import org.gmod.schema.mapped.FeatureProp;
import org.gmod.schema.mapped.Analysis;
import org.apache.log4j.Logger;
import org.hibernate.Session;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Loads SignalP predictions from an input stream into the database.
 *
 * <p>The SignalP version must be supplied through the <code>signalp-version</code>
 * option. Version <code>"3.0"</code> input is parsed with {@link SignalPFile};
 * any other version string is parsed with {@link SignalPFileV4}.
 *
 * <p>For each parsed hit whose polypeptide can be found, feature properties are
 * stored on the polypeptide, and a signal peptide feature is created when the hit
 * type is <code>"Signal peptide"</code>.
 */
public class SignalPLoader extends Loader {
    private static final Logger logger = Logger.getLogger(SignalPLoader.class);

    /** Name of the only loader option recognised by this class. */
    private static final String VERSION_OPTION = "signalp-version";

    /** Version string that selects the {@link SignalPFile} parser. */
    private static final String VERSION_3 = "3.0";

    /** The session is cleared each time this many hits have been processed. */
    private static final int SESSION_CLEAR_INTERVAL = 50;

    /** Value of the <code>signalp-version</code> option; null until the option is processed. */
    String analysisProgramVersion;

    /** Analysis record created at the start of each {@link #doLoad} call. */
    private Analysis analysis;

    /** CV terms looked up by {@link #loadTerms()} and used as feature property types. */
    private CvTerm predictionTerm, peptideProbabilityTerm, anchorProbabilityTerm, plasmoAPScoreTerm;

    /**
     * @return a new mutable set containing the single option name <code>signalp-version</code>
     */
    @Override
    protected Set<String> getOptionNames() {
        Set<String> options = new HashSet<String>();
        options.add(VERSION_OPTION);
        return options;
    }

    /**
     * Records the value of the <code>signalp-version</code> option.
     *
     * @param optionName  the name of the option being set
     * @param optionValue the value supplied for it
     * @return true if the option was recognised and stored, false otherwise
     */
    @Override
    protected boolean processOption(String optionName, String optionValue) {
        if (VERSION_OPTION.equals(optionName)) {
            analysisProgramVersion = optionValue;
            return true;
        }
        return false;
    }

    /**
     * Parses the SignalP output on <code>inputStream</code> and loads every hit.
     *
     * @param inputStream the SignalP output to parse; it is not closed here
     * @param session     the session that is cleared periodically while loading
     * @throws IOException              if reading the stream fails
     * @throws IllegalArgumentException if no SignalP version has been configured
     */
    @Override
    public void doLoad(InputStream inputStream, Session session) throws IOException {
        // Validate configuration first, so a missing version fails before any DAO call is made.
        if (analysisProgramVersion == null) {
            // NOTE(review): the message names "load.analysis.programVersion", whereas the option
            // handled by this class is "signalp-version" - confirm which name users actually set.
            throw new IllegalArgumentException("Property load.analysis.programVersion is required");
        }
        loadTerms();

        // Add analysis
        analysis = new Analysis();
        analysis.setProgram("signalp");
        analysis.setProgramVersion(analysisProgramVersion);
        sequenceDao.persist(analysis);

        Collection<SignalPHit> hits;
        if (VERSION_3.equals(analysisProgramVersion)) {
            hits = new SignalPFile(inputStream).hits();
        } else {
            hits = new SignalPFileV4(inputStream).hits();
        }
        loadHits(hits, session);
    }

    /** Loads each hit in turn, logging progress and clearing the session at regular intervals. */
    private void loadHits(Collection<SignalPHit> hits, Session session) {
        int total = hits.size();
        int processed = 0;
        for (SignalPHit hit : hits) {
            processed++;
            logger.info(String.format("[%d/%d] Processing prediction for '%s'", processed, total, hit.getKey()));
            loadHit(hit);
            if (processed % SESSION_CLEAR_INTERVAL == 0) {
                logger.info("Clearing session");
                session.clear();
            }
        }
    }

    /** Looks up, in the <code>genedb_misc</code> CV, the terms used to type the stored properties. */
    private void loadTerms() {
        predictionTerm = cvDao.getCvTermByNameAndCvName("SignalP_prediction", "genedb_misc");
        peptideProbabilityTerm = cvDao.getCvTermByNameAndCvName("signal_peptide_probability", "genedb_misc");
        anchorProbabilityTerm = cvDao.getCvTermByNameAndCvName("signal_anchor_probability", "genedb_misc");
        plasmoAPScoreTerm = cvDao.getCvTermByNameAndCvName("PlasmoAP_score", "genedb_misc");
    }

    /**
     * Stores one hit against its polypeptide. If no polypeptide is found for the
     * hit's key, an error is logged and the hit is skipped.
     */
    private void loadHit(SignalPHit hit) {
        Polypeptide polypeptide = getPolypeptideByMangledName(hit.getKey());
        logger.debug(String.format("Processing hit of type '%s'", hit.getType()));

        if (polypeptide == null) {
            logger.error(String.format("Could not find polypeptide for key '%s'", hit.getKey()));
            return;
        }

        sequenceDao.persist(new FeatureProp(polypeptide, predictionTerm, hit.getType(), 0));
        sequenceDao.persist(new FeatureProp(polypeptide, peptideProbabilityTerm, hit.getPeptideProbability(), 0));

        // The anchor probability is optional in the parsed output.
        if (hit.getAnchorProbability() != null) {
            sequenceDao.persist(new FeatureProp(polypeptide, anchorProbabilityTerm, hit.getAnchorProbability(), 0));
        }

        // Only hits of type "Signal peptide" get a signal peptide feature.
        if (hit.getType().equals("Signal peptide")) {
            SignalPeptide signalPeptide = sequenceDao.createSignalPeptide(polypeptide, hit.getCleavageSiteAfter(),
                hit.getCleavageSiteProbability(), analysis);
            sequenceDao.persist(signalPeptide);
        }

        /* Add the plasmoAP score (if available) */
        if (hit.getPlasmoAP_score() != null) {
            sequenceDao.persist(new FeatureProp(polypeptide, plasmoAPScoreTerm, hit.getPlasmoAP_score(), 0));
        }
    }
}
/**
 * Line-oriented parser used for SignalP output whose version is not "3.0".
 *
 * <p>The whole stream is read in the constructor. Lines that are empty or start
 * with <code>#</code> are skipped; every other line is matched against
 * {@link #SUMMARY_PATTERN}. Only lines whose Y/N column is <code>Y</code>
 * produce a hit.
 */
class SignalPFileV4 {
    private static final Logger logger = Logger.getLogger(SignalPFileV4.class);

    /**
     * One whitespace-separated result line. Capturing groups: (1) sequence key,
     * (2), (3) and (5) three-decimal scores, (4) an integer position, (6) the
     * Y/N flag, (7) the remainder of the line, which is captured but not used.
     * NOTE(review): presumably this is the SignalP 4 short output format - confirm
     * the column meanings against the SignalP documentation.
     */
    private static final Pattern SUMMARY_PATTERN = Pattern.compile(
        "([^\\s\\t]+)[\\s\\t]+(\\d\\.\\d{3})[\\s\\t]+\\d+[\\s\\t]+(\\d\\.\\d{3})[\\s\\t]+(\\d+)[\\s\\t]+(\\d\\.\\d{3})[\\s\\t]+\\d+[\\s\\t]+\\d\\.\\d{3}[\\s\\t]+\\d\\.\\d{3}[\\s\\t]+(Y|N)[\\s\\t]+\\d\\.\\d{3}[\\s\\t]+(.*)"
    );

    private List<SignalPHit> hits = new ArrayList<SignalPHit>();

    /**
     * Reads and parses every line of the given stream.
     *
     * @param inputStream the SignalP output to parse
     * @throws IOException if reading the stream fails
     */
    public SignalPFileV4(InputStream inputStream) throws IOException {
        BufferedReader in = new BufferedReader(new InputStreamReader(inputStream));
        for (String current = in.readLine(); current != null; current = in.readLine()) {
            // Comment lines and blank lines carry no prediction.
            if (current.length() == 0 || current.startsWith("#")) {
                continue;
            }
            logger.trace(current);
            parseSummary(current);
        }
    }

    /**
     * @return the hits collected while parsing, in input order
     */
    public Collection<SignalPHit> hits() {
        return hits;
    }

    /**
     * Parses one result line. A hit is added only when the line matches and its
     * Y/N column is "Y"; a line that does not match is logged as an error.
     */
    private void parseSummary(CharSequence summary) {
        Matcher m = SUMMARY_PATTERN.matcher(summary);
        if (!m.matches()) {
            logger.error("Failed to parse summary:\n" + summary);
            return;
        }
        if (!m.group(6).equals("Y")) {
            // Flag is "N": nothing is recorded for this line.
            return;
        }

        String key = m.group(1);
        String cleavageSiteProbability = m.group(2);
        String anchorProbability = m.group(3);
        int cleavageSiteAfter = Integer.parseInt(m.group(4));
        String peptideProbability = m.group(5);

        // No PlasmoAP score is extracted from this format, so null is passed for it.
        SignalPHit hit = new SignalPHit(key, "Signal peptide", peptideProbability, anchorProbability,
            cleavageSiteProbability, null, cleavageSiteAfter);
        hits.add(hit);
    }
}
/**
 * Parser for SignalP version 3.0 summary output.
 *
 * <p>The whole stream is read in the constructor. A record starts at the line
 * immediately before a line beginning with <code>"Prediction: "</code> and runs
 * until the next empty line or the end of the stream. Each record is matched
 * against {@link #SUMMARY_PATTERN}; records of type "Non-secretory protein" are
 * dropped, and the others are collected as {@link SignalPHit}s.
 */
class SignalPFile {
    private static final Logger logger = Logger.getLogger(SignalPFile.class);

    /**
     * Multi-line record pattern. Capturing groups: (1) key taken from the
     * <code>&gt;</code> header line, (2) prediction type, (3) signal peptide
     * probability, (4) optional signal anchor probability, (5) max cleavage site
     * probability, (6) and (7) the two positions around the cleavage site,
     * (8) optional PlasmoAP score.
     */
    private static final Pattern SUMMARY_PATTERN = Pattern.compile(
        ">(.*)\n"+
        "Prediction: (Non-secretory protein|Signal peptide|Signal anchor)\n"+
        "Signal peptide probability: (\\d\\.\\d{3})\n"+
        "(?:Signal anchor probability: (\\d\\.\\d{3})\n)?"+
        "Max cleavage site probability: (\\d\\.\\d{3}) between pos\\. (-1|\\d+) and ?(\\d+)\n" +
        "(?:PlasmoAP_score:\\s+(\\d+)\n)?"
    );

    private final List<SignalPHit> hits = new ArrayList<SignalPHit>();

    /**
     * Reads and parses the given stream. The stream is not closed here; that is
     * left to the caller.
     *
     * @param inputStream the SignalP 3.0 output to parse
     * @throws IOException           if reading the stream fails
     * @throws IllegalStateException if a "Prediction: " line is the very first line,
     *                               so that there is no header line before it
     */
    public SignalPFile(InputStream inputStream) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
        String previousLine = null, line;
        while (null != (line = reader.readLine())) {
            if (line.startsWith("Prediction: ")) {
                if (previousLine == null) {
                    throw new IllegalStateException(
                        "Found a 'Prediction: ' line with no header line before it");
                }
                StringBuilder sb = new StringBuilder(previousLine);
                sb.append('\n');
                sb.append(line);
                sb.append('\n');
                // Collect the rest of the record. readLine() returns null at end of stream,
                // so the final record may legitimately end without a trailing blank line.
                while (null != (line = reader.readLine()) && 0 < line.length()) {
                    sb.append(line);
                    sb.append('\n');
                }
                logger.trace(sb);
                parseSummary(sb);
                if (line == null) {
                    // End of stream reached inside the record: nothing more to read.
                    break;
                }
            }
            previousLine = line;
        }
    }

    /**
     * @return the hits collected while parsing, in input order
     */
    public Collection<SignalPHit> hits() {
        return hits;
    }

    /**
     * Parses one record. Records that do not match the expected layout are logged
     * as errors; "Non-secretory protein" records are silently ignored.
     */
    private void parseSummary(CharSequence summary) {
        Matcher matcher = SUMMARY_PATTERN.matcher(summary);
        if (!matcher.matches()) {
            logger.error("Failed to parse summary:\n" + summary);
            return;
        }

        String key  = matcher.group(1);
        String type = matcher.group(2);
        if (type.equals("Non-secretory protein")) {
            return;
        }

        String peptideProbability = matcher.group(3);
        String anchorProbability  = matcher.group(4); // null when the optional line is absent
        String cleavageSiteProbability = matcher.group(5);
        // Group 7 is the position following "and" in the "between pos. X and Y" text.
        int cleavageSiteAfter = Integer.parseInt(matcher.group(7));

        /* PlasmoAP score will be available if signalp was run with the -s option */
        String plasmoAP_score = matcher.group(8);

        hits.add(new SignalPHit(key, type, peptideProbability, anchorProbability, cleavageSiteProbability, plasmoAP_score,
            cleavageSiteAfter));
    }
}
/**
 * Value holder for a single parsed SignalP prediction.
 *
 * <p>All values are stored exactly as passed to the constructor; no validation or
 * conversion is performed. Probabilities and the PlasmoAP score are kept as the
 * strings read from the SignalP output.
 */
class SignalPHit {
    private final String key;
    private final String type;
    private final String peptideProbability;
    private final String anchorProbability;
    private final String cleavageSiteProbability;
    private final String plasmoAP_score;

    // Left package-visible and non-final, as before, so that any same-package code
    // accessing the field directly keeps compiling. Prefer getCleavageSiteAfter().
    int cleavageSiteAfter;

    /**
     * @param key                     identifier of the sequence the prediction refers to
     * @param type                    prediction type, e.g. "Signal peptide"
     * @param peptideProbability      signal peptide probability
     * @param anchorProbability       signal anchor probability; may be null
     * @param cleavageSiteProbability cleavage site probability
     * @param plasmoAP_score          PlasmoAP score; may be null
     * @param cleavageSiteAfter       cleavage site position
     */
    public SignalPHit(String key, String type,
            String peptideProbability, String anchorProbability,
            String cleavageSiteProbability, String plasmoAP_score, int cleavageSiteAfter) {
        this.key = key;
        this.type = type;
        this.peptideProbability = peptideProbability;
        this.anchorProbability = anchorProbability;
        this.cleavageSiteProbability = cleavageSiteProbability;
        this.cleavageSiteAfter = cleavageSiteAfter;
        this.plasmoAP_score = plasmoAP_score;
    }

    /** @return the sequence key given at construction */
    public String getKey() {
        return key;
    }

    /** @return the prediction type given at construction */
    public String getType() {
        return type;
    }

    /** @return the signal peptide probability, as a string */
    public String getPeptideProbability() {
        return peptideProbability;
    }

    /** @return the signal anchor probability as a string, or null if none was given */
    public String getAnchorProbability() {
        return anchorProbability;
    }

    /** @return the cleavage site probability, as a string */
    public String getCleavageSiteProbability() {
        return cleavageSiteProbability;
    }

    /** @return the cleavage site position */
    public int getCleavageSiteAfter() {
        return cleavageSiteAfter;
    }

    /** @return the PlasmoAP score as a string, or null if none was given */
    public String getPlasmoAP_score() {
        return plasmoAP_score;
    }
}