package org.genedb.db.loading; import org.apache.log4j.Logger; import org.springframework.jdbc.core.simple.SimpleJdbcTemplate; import org.springframework.transaction.annotation.Transactional; import org.springframework.context.ApplicationContext; import org.springframework.context.support.ClassPathXmlApplicationContext; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.Reader; import java.sql.SQLException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; /** * Load SNP data from a simple space-separated file. * The first line contains meta data about the analysis seperated by '##' * eg ANALYSIS_NAME=ssaha##ANALYSIS_PARAMS= -blah -blah##ANALYSIS_TIMESTAMP=##ANALYSIS_PROGRAM_VERSION=1.0##CHADO_ORGANISM_ID=91 * * The second line is the list of headers for the data that is to follow seperated by spaces. * eg ssaha_name_of_chromosome_reference ssaha_overall_confidence_score * * The rest of the lines contain data seperated by spaces * * In order to load the SNP files specify the following properties: * <dl> * <dt><code>load.inputDirectory</code></dt><dd>The path of the directory that contains valid SNP files, e.g. "/Users/rn2/variationDB" (Required)</dd> * <dt><code>load.fileNamePattern</code></dt><dd>The name of the file / pattern (default is *.snp) eg: ssaha.snp</dd> * * @author rn2 * */ public class LoadSNPs extends FileProcessor { private static final Logger logger = Logger.getLogger(LoadSNPs.class); private static String dbSchema; public static void main(String[] args) throws MissingPropertyException, IOException, ParsingException, SQLException { if (args.length > 0) { logger.warn("Ignoring command-line arguments"); } String inputDirectory = getRequiredProperty("load.inputDirectory"); String fileNamePattern = getPropertyWithDefault("load.fileNamePattern", ".*\\.snp"); dbSchema=getRequiredProperty("load.dbSchema"); logger.debug("I="+inputDirectory+"fp="+fileNamePattern+"dbSchema="+dbSchema); LoadSNPs loadSNPs = new LoadSNPs(); loadSNPs.processFileOrDirectory(inputDirectory, fileNamePattern); } private final SNPsLoader loader; private LoadSNPs() { ApplicationContext applicationContext = new ClassPathXmlApplicationContext(new String[] {"Load.xml"}); this.loader = applicationContext.getBean("snpsLoader", SNPsLoader.class); } @Override protected void processFile(File inputFile, Reader reader) throws IOException, ParsingException { SNPFile snpFile = new SNPFile(inputFile, reader); loader.load(snpFile,dbSchema); } } class SNPFile { // MetaDataLine is the list of name=value pairs seperated by '##' // ANALYSIS_NAME=MAQ##ANALYSIS_PARAMS= -blah -blah#ANALYSIS_TIMESTAMP=1010101010##FIELD_SEPERATOR="\t" class MetaDataLine { int numberOfFields; private String analysisName; private String analysisParams; private String analysisTimestamp; private String analysisProgramVersion; private int chado_organism_id; private String strain_id; private String strain_sample_id; private MetaDataLine(String line) throws ParsingException { String[] namevalpairsStr = line.split("##"); numberOfFields=namevalpairsStr.length; if (numberOfFields != 7) { throw new SyntaxError(file, 1, String.format("Wrong number of MetaData , expected 7 found %d", numberOfFields)); } for ( String nameval : namevalpairsStr){ String[] pairs=nameval.split("="); if(pairs[0].contains("ANALYSIS_NAME")) analysisName=pairs[1]; if(pairs[0].contains("ANALYSIS_PARAMS")) analysisParams=pairs[1]; if(pairs[0].contains("ANALYSIS_TIMESTAMP")) analysisTimestamp=(pairs.length==2)?pairs[1]:""; if(pairs[0].contains("ANALYSIS_PROGRAM_VERSION")) analysisProgramVersion=pairs[1]; if(pairs[0].contains("CHADO_ORGANISM_ID")) chado_organism_id=Integer.parseInt(pairs[1]); if(pairs[0].contains("STRAIN_ID")) strain_id=pairs[1]; if(pairs[0].contains("STRAIN_SAMPLE_ID")) strain_sample_id=pairs[1]; } } String getAnalysisName() { return analysisName; } String getAnalysisParams() { return analysisParams; } String getAnalysisTimestamp() { return analysisTimestamp; } String getAnalysisProgramVersion() { return analysisProgramVersion; } int get_Chado_Organism_ID() { return chado_organism_id; } String get_Strain_ID() { return strain_id; } String get_Strain_Sample_ID() { return strain_sample_id; } } class Line { int lineNumber; int numberOfFields; String fields[]; private Line(int lineNumber, String line) throws ParsingException { this.lineNumber = lineNumber; fields = line.split("\\s+"); // "\\t" } int getNumberOfFields(){ return numberOfFields; } String[] getFields(){ return fields; } } private File file; private MetaDataLine metaline; private Line headerline; private List<Line> lines = new ArrayList<Line>(); public SNPFile(File file, Reader reader) throws IOException, ParsingException { this.file = file; BufferedReader br = new BufferedReader(reader); String line; int lineNumber = 0; // First line is the MetaDataLine line = br.readLine(); metaline = new MetaDataLine(line); // Second line is the Headers line = br.readLine(); headerline=new Line(0,line); while (null != (line = br.readLine())) { lineNumber ++; lines.add(new Line(lineNumber, line)); } } public File file() { return file; } public MetaDataLine metaline() { return metaline; } public List<Line> datalines() { return lines; } public Line headerline() { return headerline; } public int numberOfDataLines() { return lines.size(); } } @Transactional(rollbackFor=DataError.class) // Will also rollback for runtime exceptions, by default class SNPsLoader { private static final Logger logger = Logger.getLogger(SNPsLoader.class); private SimpleJdbcTemplate simpleJdbcTemplate; public void load(SNPFile snpFile,String dbSchema) throws DataError { // Insert into dbSchema.Analysis table Date analysistimestamp; logger.info(String.format("ANALYSIS_NAME ->%s, ANALYSIS_PARAMS ->%s, ANALYSIS_TIMESTAMP ->%s, STRAIN_ID ->%s,STRAIN_SAMPLE_ID ->%s,NO OF DATA LINES ->%d",snpFile.metaline().getAnalysisName(),snpFile.metaline().getAnalysisParams(),snpFile.metaline().getAnalysisTimestamp(),snpFile.metaline().get_Strain_ID(),snpFile.metaline().get_Strain_Sample_ID(),snpFile.numberOfDataLines())); SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd-HH:mm:ss"); try{ if(snpFile.metaline().getAnalysisTimestamp().length()==0){ analysistimestamp = new Date(); } else{ analysistimestamp = df.parse(snpFile.metaline().getAnalysisTimestamp()); } }catch(java.text.ParseException e){ String errorMessage = String.format("ANALYSIS_TIMESTAMP %s not in proper format yyyy-MM-dd-HH-mm-ss",snpFile.metaline().getAnalysisTimestamp()); throw new DataError(errorMessage); } int n=simpleJdbcTemplate.update( "insert into "+dbSchema+".Analysis("+ "program,programversion,timeexecuted"+ ") values ("+ "?,?,?"+ ")", snpFile.metaline().getAnalysisName(), snpFile.metaline().getAnalysisProgramVersion(),analysistimestamp); if (n != 1) { throw new DataError(String.format("Unable to insert Analysis '%s'", snpFile.metaline().getAnalysisName())); } long analysisId = (Long) simpleJdbcTemplate.queryForMap( "select currval('"+dbSchema+".analysis_analysis_id_seq'::regclass) as analysis_id") .get("analysis_id"); // Now insert into dbSchema.AnalysisProp int cvId = (Integer) simpleJdbcTemplate.queryForMap( "select cv_id from cv where name='genedb_misc'") .get("cv_id"); List<Object[]> batch = new ArrayList<Object[]>(); Object[] commandlineprop = new Object[] {analysisId,"commandline_str",cvId,snpFile.metaline().getAnalysisParams()}; batch.add(commandlineprop); Object[] strain_id_prop = new Object[] {analysisId,"strain_id",cvId,snpFile.metaline().get_Strain_ID()}; batch.add(strain_id_prop); Object[] strain_sample_id_prop = new Object[] {analysisId,"strain_sample_id",cvId,snpFile.metaline().get_Strain_Sample_ID()}; batch.add(strain_sample_id_prop); simpleJdbcTemplate.batchUpdate( "insert into "+dbSchema+".AnalysisProp("+ " analysis_id, type_id, value"+ ") values ("+ "?,"+ "(select cvterm_id as type_id from cvterm where name =? and cv_id=?),"+ "?"+ ")" ,batch); // first line is the header and the rest are all data SNPFile.Line headers = snpFile.headerline(); String headersStr = new String(); ArrayList <String> headerList = new ArrayList <String>(); for ( String header : headers.getFields()){ headersStr+=","+header; headerList.add(header); } logger.info(String.format("HEADERS->%s",headersStr)); // process data // each line is a SNP so create a feature record for every line // each field within a line is featureprop of the SNP (feature) for (SNPFile.Line line: snpFile.datalines()) { // Insert in to feature table String snpFeatureUniqueName = "SNP_"+snpFile.metaline().get_Strain_ID()+"_"+snpFile.metaline().get_Strain_Sample_ID()+"_"+analysisId+"_"+line.lineNumber; n = simpleJdbcTemplate.update( "insert into "+dbSchema+".Feature("+ " organism_id,uniquename,type_id,is_analysis,is_obsolete,timeaccessioned,timelastmodified"+ ") values ("+ "?,?,(select cvterm_id as type_id from cvterm where name='SNP'),true,false,?,?"+ ")" ,snpFile.metaline().get_Chado_Organism_ID(),snpFeatureUniqueName,new java.sql.Timestamp(analysistimestamp.getTime()),new java.sql.Timestamp(analysistimestamp.getTime())); if (n != 1) { throw new DataError(String.format("Unable to insert SNP '%s'",snpFeatureUniqueName)); } long featureId = (Long) simpleJdbcTemplate.queryForMap( "select currval('"+dbSchema+".feature_feature_id_seq'::regclass) as feature_id") .get("feature_id"); // Insert in to featureloc String chromosomeReference=line.getFields()[0]; String offset=line.getFields()[1]; // Convert offset-->interbase coordinates Integer fmin = Integer.parseInt(offset); fmin = fmin-1; Integer fmax = fmin+1; n = simpleJdbcTemplate.update( "insert into "+dbSchema+".FeatureLoc("+ "feature_id,srcfeature_id,fmin,is_fmin_partial,fmax,is_fmax_partial,strand,phase,locgroup,rank"+ ") values ("+ "?,(select feature_id from feature where uniquename=?),?,?,?,?,?,?,?,?"+ ")" ,featureId,chromosomeReference,fmin,false,fmax,false,0,0,0,0); if (n != 1) { throw new DataError(String.format("Unable to insert featureLoc for SNP '%s'",snpFeatureUniqueName)); } String dataStr = new String(); Iterator <String> headerIterator = headerList.iterator(); for ( String datafield : line.getFields()){ String headerfield=headerIterator.next(); dataStr+="("+headerfield+","+datafield+")"; n = simpleJdbcTemplate.update( "insert into "+dbSchema+".FeatureProp("+ "feature_id,type_id,value,rank"+ ") values ("+ "?,(select cvterm_id from cvterm where name=? and cv_id=?),?,?"+ ")" ,featureId,headerfield,cvId,datafield,0); if (n != 1) { throw new DataError(String.format("Unable to insert featureprop for SNP '%s'",snpFeatureUniqueName)); } } // now associate this feature with the Analysis using AnalysisFeature record n = simpleJdbcTemplate.update( "insert into "+dbSchema+".AnalysisFeature("+ "feature_id,analysis_id"+ ") values ("+ "?,?"+ ")" ,featureId,analysisId); if (n != 1) { throw new DataError(String.format("Unable to insert record into AnalysisFeature for SNP '%s'",snpFeatureUniqueName)); } logger.trace(String.format("line %d DATA->%s",line.lineNumber,dataStr)); } } /* Spring setters */ public void setsimpleJdbcTemplate(SimpleJdbcTemplate simpleJdbcTemplate) { this.simpleJdbcTemplate = simpleJdbcTemplate; } }