package converters.magetab; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.Map; import org.molgenis.core.Ontology; import org.molgenis.core.OntologyTerm; import org.molgenis.framework.db.Database; import org.molgenis.framework.db.DatabaseException; import org.molgenis.organization.Investigation; import org.molgenis.pheno.Individual; import org.molgenis.pheno.ObservationElement; import org.molgenis.pheno.ObservedValue; import org.molgenis.util.CsvFileReader; import org.molgenis.util.CsvReader; import org.molgenis.util.Tuple; import app.DatabaseFactory; /* * Changelog: * Treat Characteristic[Individual] as observable feature to make this generic not only for MOLPAGE but for all ArrayExpress. */ public class ConvertMageTabToPheno { public static void main(String[] args) throws FileNotFoundException, IOException, DatabaseException { // should become commandline parameter taking IDF first and loading that // in Investigation. File sdrfFile = new File("data/ArrayExpress/E-TABM-325.sdrf.txt"); final Database db = DatabaseFactory.create("molgenis.properties"); String INVESTIGATION_NAME = "MOLPAGE"; try { db.beginTx(); // Investigation final Investigation inv = new Investigation(); inv.setName(INVESTIGATION_NAME); db.add(inv); // add ontology term for species if needed // EFO,http://www.ebi.ac.uk/efo Ontology termSource = new Ontology(); termSource.setName("EFO"); termSource.setOntologyURI("http://www.ebi.ac.uk/efo"); // db.add(termSource); final OntologyTerm species = new OntologyTerm(); species.setName("human"); // species.setTermLabel(termSource.getName() + ":human"); species.setTermAccession("http://www.ebi.ac.uk/efo/EFO_0001994"); species.setOntology(termSource.getId()); db.add(species); // parsing CsvReader reader = new CsvFileReader(sdrfFile); reader.setSeparator('\t'); final Map<String, Individual> iMap = new LinkedHashMap<String, Individual>(); // load all features, optionally adding terms final Map<String, ObservationElement> fMap = new LinkedHashMap<String, ObservationElement>(); for (String annotation : reader.colnames().subList(1, reader.colnames().indexOf("Sample Name"))) { if (annotation.startsWith("Characteristics")) { String characteristic = annotation.substring(annotation.indexOf('[') + 1, annotation.indexOf(']')); // exclude 'Individual' // if (!characteristic.equalsIgnoreCase("Individual")) // ObservationElement f = new ObservationElement(); f.setInvestigation(inv.getId()); f.setName(characteristic); f.setInvestigation(inv.getId()); fMap.put(characteristic, f); // } } else if (annotation.equals("Term Source REF")) { // ah, it is a term too, dirty even } } for (ObservationElement f : fMap.values()) System.out.println(f); db.add(new ArrayList<ObservationElement>(fMap.values())); // add individuals at first parse for (Tuple line : reader) { // String individual = // line.getString("Characteristics [Individual]"); String individual = line.getString("Source Name"); if (!iMap.containsKey(individual)) { Individual i = new Individual(); i.setName(individual); i.setInvestigation(inv.getId()); // i.setSpecies(species.getId()); // need special characteristics like 'sex' // i.setSex("unknown"); iMap.put(individual, i); } } for (Individual i : iMap.values()) System.out.println(i); db.add(new ArrayList<Individual>(iMap.values())); // add the values, and optionally ontology references, by parsing // again final Map<String, ObservedValue> vMap = new LinkedHashMap<String, ObservedValue>(); final Map<String, OntologyTerm> tMap = new LinkedHashMap<String, OntologyTerm>(); final Map<String, Ontology> sMap = new LinkedHashMap<String, Ontology>(); reader.reset(); for (Tuple line : reader) { // String individualName = // line.getString("Characteristics [Individual]"); String individualName = line.getString("Source Name"); Individual i = iMap.get(individualName); if (i == null) throw new Exception("Source unknown: " + individualName); ObservedValue value = null; // add all characteristics properly,skip first column // describing source for (int j = 1; j < line.size(); j++) { String fieldName = line.getFields().get(j).trim(); // until if ("Sample Name".equals(fieldName)) break; // characteristic == ObservedValue if (line.getString(j) != null) { String fieldValue = line.getString(j).trim(); if (fieldName.startsWith("Characteristics")) { String characteristic = fieldName.substring(fieldName.indexOf('[') + 1, fieldName.indexOf(']')); // if // (!characteristic.equalsIgnoreCase("Individual")) // { ObservationElement f = fMap.get(characteristic); // exception for specific characteristics // 'sex', value = new ObservedValue(); value.setInvestigation(inv.getId()); value.setTarget(i.getId()); value.setFeature(f.getId()); value.setValue(fieldValue); // removes duplicates vMap.put(i.getName() + "_" + f.getName(), value); // } } else if (fieldName.equals("Term Source REF") && !fieldValue.equals("")) { Ontology ontology = sMap.get(fieldValue); if (ontology == null) { ontology = new Ontology(); ontology.setName(fieldValue); ontology.setOntologyAccession(fieldValue); System.out.println("adding source: " + ontology); db.add(ontology); sMap.put(fieldValue, ontology); } // see if characteristic is known as phenotype OntologyTerm term = tMap.get(ontology.getName() + "__" + value.getValue()); if (term == null) { term = new OntologyTerm(); term.setName(value.getValue()); term.setOntology(ontology.getId()); System.out.println("adding term: " + term); db.add(term); tMap.put(ontology.getName() + "__" + value.getValue(), term); } value.setOntologyReference(term.getId()); } } } } // for(ObservedValue v: vMap.values()) System.out.println(v); db.add(new ArrayList<ObservedValue>(vMap.values())); db.commitTx(); } catch (Exception e) { db.rollbackTx(); System.err.println(e.getMessage()); e.printStackTrace(); } } }