/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package act.installer.metacyc; import act.server.MongoDB; import act.shared.Chemical; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileReader; import java.io.FilenameFilter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class MetaCyc { public static final String METACYC_COMPOUND_FILE_NAME = "compounds.dat"; // map from location of biopax L3 file to the corresponding parsed organism model HashMap<String, OrganismComposition> organismModels; String sourceDir; private List<String> owlFiles; // if onlyTier12 is set, then only the 38 main files are processed // we identify them as not having names that contain one of // ("hmpcyc", "wgscyc", more than three successive digits) // See http://biocyc.org/biocyc-pgdb-list.shtml and the descriptions of Tier1 and Tier2 // Outside of these 38, there are 3487 Tier3 files that have not received manual // curation and are just the dump output of their PathLogic program. boolean onlyTier12; public MetaCyc(String dirWithL3Files) { this(dirWithL3Files, null); } public MetaCyc(String dirWithL3Files, List<String> owlFiles){ this.organismModels = new HashMap<String, OrganismComposition>(); this.sourceDir = dirWithL3Files; this.owlFiles = owlFiles; // by default, we process of level3 biopax files found in the directory // so we set the flag that restricts to Tier 1 and 2 as false. // Use loadOnlyTier12 if a restriction to those files is needed. this.onlyTier12 = false; } public void loadOnlyTier12(boolean flag) { this.onlyTier12 = flag; } // processes num files in source directory (num = -1 for all) public void process(int num) { if (num > 15) warnAboutMem(num); if (num > 0) process(0, num); // process only num files else process(getOWLs()); // process all files } public void process(int start, int end) { if (end-start > 50) warnAboutMem(end-start); List<String> files = getOWLs(); if (end > files.size()) { System.out.format("Chunk end index %d is out of bounds, limiting to %d\n", end, files.size()); end = files.size(); } files = files.subList(start, end); // only process a sublist from [start, end) process(files); } private void warnAboutMem(int num_asked) { System.out.println("You asked to process more than 15 files: " + num_asked); System.out.println("You can process about 10 files in 4GB of runtime memory"); } // process only the source file whose names are passed public void process(List<String> files) { for (String file : files) { final File INPUT_FILE = new File(file); System.out.format("Processing biopax file %s\n", INPUT_FILE.getAbsolutePath()); HashMap<String, String> uniqueKeyToInChIMap = generateUniqueKeyToInChIMapping(INPUT_FILE); System.out.println("Processing: " + INPUT_FILE.getAbsolutePath()); if (file.endsWith("leishcyc/biopax-level3.owl")) { System.out.println("Friendly reminder: Did you patch this leishcyc file with the " + "diff in src/main/resources/leishcyc.biopax-level3.owl.diff to take care of " + "the bad data in the original? If you are running over the plain downloaded file, " + "then this will crash."); } try (FileInputStream f = new FileInputStream(INPUT_FILE)) { // Construct the organism and read the owl file. OrganismComposition o = new OrganismComposition(uniqueKeyToInChIMap); new BioPaxFile(o).initFrom(f); this.organismModels.put(file, o); } catch(IOException e) { System.err.println("Error while handling file : " + INPUT_FILE.getAbsolutePath() + ". Aborting."); System.exit(-1); } } } private static final Pattern METACYC_COMPOUNDS_COMMENT_PATTERN = Pattern.compile("^\\s*#"); private static final Pattern METACYC_COMPOUNDS_ENTITY_END_PATTERN = Pattern.compile("^//$"); private static final Pattern METACYC_COMPOUNDS_FIELD_PATTERN = Pattern.compile("^([a-zA-Z_-]+)\\s+-\\s+(.*)$"); private static final Pattern METACYC_COMPOUNDS_STRING_CONTINUATION_PATTERN = Pattern.compile("^/(.*)$"); private static final String METACYC_COMPOUNDS_UNIQUE_ID_FIELD = "UNIQUE-ID"; private static final String METACYC_COMPOUNDS_INCHI_FIELD = "INCHI"; private static final String METACYC_COMPOUNDS_NON_STANDARD_INCHI_FIELD = "NON-STANDARD-INCHI"; private enum METACYC_COMPOUNDS_LAST_FIELD { UNIQUE_ID, INCHI, NON_STANDARD_INCHI, OTHER, } public HashMap<String, String> generateUniqueKeyToInChIMapping(File biopaxFilePath) { File compoundFile = new File(biopaxFilePath.getParentFile(), METACYC_COMPOUND_FILE_NAME); if (!compoundFile.exists()) { // TODO: should this throw an exception? System.err.format("ERROR: Missing " + METACYC_COMPOUND_FILE_NAME + " file for bioxpax file %s at %s\n", biopaxFilePath, compoundFile.getAbsolutePath()); return new HashMap<>(0); } HashMap<String, String> keyToInChIMap = new HashMap<>(); try ( BufferedReader reader = new BufferedReader(new FileReader(compoundFile)); ) { // Hacky stateful parser for Metacyc's compounds.dat files. // TODO: it'd be easier to read compound-links.dat, but those files don't always exist. Why not? METACYC_COMPOUNDS_LAST_FIELD lastField = METACYC_COMPOUNDS_LAST_FIELD.OTHER; String uniqueId = null; String inchi = null; String nonstandardInchi = null; int lineNum = 0; String line = null; while ((line = reader.readLine()) != null) { lineNum++; /* Note: this loop uses continues rather than if/else statements so that the field matcher can be created/saved * only as needed. */ // Skip blank lines and comments. if (line.isEmpty() || METACYC_COMPOUNDS_COMMENT_PATTERN.matcher(line).find()) { continue; } /* We're at the end of an entity in the compounds file. Store the unique id + inchi, or complain if they're * undefined at this point. */ if (METACYC_COMPOUNDS_ENTITY_END_PATTERN.matcher(line).matches()) { if (uniqueId == null || (inchi == null && nonstandardInchi == null)) { System.err.format( "ERROR: Found malformed entity line at %s L%d: unique-id='%s' inchi='%s' non-standard-inchi=%s\n", compoundFile.getAbsolutePath(), lineNum, uniqueId, inchi, nonstandardInchi); } else { /* The counts of InChIs and entities in some of the compoounds files don't line up. Note those for * further investigation. */ if (inchi == null && nonstandardInchi.startsWith("InChI=")) { System.err.format("WARNING: Found only non-standard inchi for %s: %s\n", uniqueId, nonstandardInchi); inchi = nonstandardInchi; } // Only store the InChI if it's from the INCHI field or looks like an InChI. if (inchi != null) { keyToInChIMap.put(uniqueId, inchi); } } // Reset for the next chemical element. uniqueId = null; inchi = null; nonstandardInchi = null; lastField = METACYC_COMPOUNDS_LAST_FIELD.OTHER; continue; } /* See if we've encountered a dash-delimited field line; store the line's contents if the field is useful. */ Matcher fieldMatcher = METACYC_COMPOUNDS_FIELD_PATTERN.matcher(line); if (fieldMatcher.matches()) { switch (fieldMatcher.group(1)) { case METACYC_COMPOUNDS_UNIQUE_ID_FIELD: if (uniqueId != null) { // We don't expect to see more than one id per compound, so warn if we find any. System.err.format( "ERROR: Found duplicate ID in %s at line %d: unique-id='%s' new-id='%s'\n", compoundFile.getAbsolutePath(), lineNum, uniqueId, fieldMatcher.group(2)); } uniqueId = fieldMatcher.group(2); lastField = METACYC_COMPOUNDS_LAST_FIELD.UNIQUE_ID; break; case METACYC_COMPOUNDS_INCHI_FIELD: //System.out.format("Matching on InchiField: %s\n", fieldMatcher.group(2)); if (inchi != null) { // We don't expect to see more than one InChI per compound, so warn if we find any. System.err.format( "ERROR: Found duplicate InChI in %s at line %d: unique-id='%s' inchi='%s' new-inchi='%s'\n", compoundFile.getAbsolutePath(), lineNum, uniqueId, inchi, fieldMatcher.group(2)); } inchi = fieldMatcher.group(2); lastField = METACYC_COMPOUNDS_LAST_FIELD.INCHI; break; case METACYC_COMPOUNDS_NON_STANDARD_INCHI_FIELD: // It isn't clear if/how these should be constrained, so just take the last one. nonstandardInchi = fieldMatcher.group(2); lastField = METACYC_COMPOUNDS_LAST_FIELD.NON_STANDARD_INCHI; break; default: // Ignore all other kinds of fields. lastField = METACYC_COMPOUNDS_LAST_FIELD.OTHER; break; } continue; } // Cautiously handle all string continuations, though we hope not to see them with ids/InChIs. Matcher stringContinuationMatcher = METACYC_COMPOUNDS_STRING_CONTINUATION_PATTERN.matcher(line); if (stringContinuationMatcher.matches()) { switch (lastField) { case UNIQUE_ID: uniqueId += stringContinuationMatcher.group(1); System.err.format("WARNING: found unexpected continued UNIQUE_ID: %s\n", uniqueId); break; case INCHI: inchi += stringContinuationMatcher.group(1); System.err.format("WARNING: found unexpected continued InChI: %s\n", inchi); break; case NON_STANDARD_INCHI: nonstandardInchi += stringContinuationMatcher.group(1); System.err.format("WARNING: found unexpected continued non-standard InChI: %s\n", nonstandardInchi); break; default: // No need to store fields other than the id/InChIs. break; } continue; } } } catch (IOException e) { // TODO: handle this better. System.err.format("Caught IO exception when reading file at %s: %s\n", compoundFile.getAbsoluteFile(), e.getMessage()); return new HashMap<>(0); } System.out.format("Loaded %d unique id -> InChI mappings from %s\n", keyToInChIMap.size(), compoundFile.getAbsolutePath()); return keyToInChIMap; } public OrganismComposition get(String file) { return this.organismModels.get(file); } public int getNumFilesToBeProcessed() { return getOWLs().size(); } static String[] tier12 = new String[] { "10403s_rastcyc", "agrocyc", "ano2cyc", "anthracyc", "aurantimonascyc", "bsubcyc", "cattlecyc", "caulocyc", "caulona1000cyc", "chlamycyc", "cparvumcyc", "ecol199310cyc", "ecol316407cyc", "ecol413997cyc", "ecoo157cyc", "flycyc", "hominiscyc", "hpycyc", "mob3bcyc", "mousecyc", "mtbrvcyc", "pbergheicyc", "pchabaudicyc", "pchrcyc", "plasmocyc", "pvivaxcyc", "pyoeliicyc", "scocyc", "shigellacyc", "smancyc", "synelcyc", "toxocyc", "trypanocyc", "vchocyc", // The following data directories only contain an ocelot file dump // which is a lisp format raw dump of the db in their own custom // format. (http://bioinformatics.ai.sri.com/ptools/flatfile-format.html) // It does not make sense for us to write a custom parser for these // three files "clossaccyc", // Clostridium saccharoperbutylacetonicum // http://biocyc.org/CLOSSAC/organism-summary?object=CLOSSAC "mtbcdc1551cyc", // Mycobacterium tuberculosis // http://biocyc.org/MTBCDC1551/organism-summary?object=MTBCDC1551 "thapscyc", // Thalassiosira pseudonana // http://biocyc.org/THAPS/organism-summary?object=THAPS // Cannot locate the data file corresponding to: Candida albicans, Strain SC5314 // http://biocyc.org/CALBI/organism-summary?object=CALBI // NCBI Taxonomy ID: 237561 // The above URL suggests it should be called calbicyc (this is how we derived // the names of all valid 37 files above), but we cannot find that dir "calbicyc", // Candida albicans // http://biocyc.org/CALBI/organism-summary?object=CALBI }; public List<String> getOWLs() { // If this was called previously we will have a list of all the cached files. if (owlFiles != null) return owlFiles; String dir = this.sourceDir; boolean onlyTier12Files = this.onlyTier12; final List<String> tier12files = Arrays.asList(tier12); FilenameFilter subdirfltr = new FilenameFilter() { public boolean accept(File dir, String sd) { if (!new File(dir, sd).isDirectory()) return false; if (onlyTier12Files) { // additional checks if only looking for tier1,2 files // Tier1,2 are the important ones because they are the // only ones that have received manual curation: // http://biocyc.org/biocyc-pgdb-list.shtml return tier12files.contains(sd); // -- The below is an old heuristic that eliminates 7 valid files. // -- Instead we do a direct check as above from a static list of filenames // -- // It is a Tier1,2 file if its name does not contain one of // -- // ("hmpcyc", "wgscyc", more than three successive digits) // -- if (sd.contains("hmpcyc") || sd.contains("wgscyc")) // -- return false; // -- if (sd.matches("^.*[0-9][0-9][0-9].*$")) // -- return false; } return true; } }; FilenameFilter owlfltr = new FilenameFilter() { public boolean accept(File dir, String nm) { return nm.endsWith("level3.owl"); } }; List<String> allL3 = new ArrayList<>(); for (String subdir : new File(dir).list(subdirfltr)) { File parent = new File(dir, subdir); for (String owlfile : parent.list(owlfltr)) { System.out.format(" Found OWL file: %s\n", new File(parent, owlfile).getAbsolutePath()); allL3.add(new File(parent, owlfile).getAbsolutePath()); } } Collections.sort(allL3); owlFiles = allL3; return allL3; } public void sendToDB(MongoDB db) { Chemical.REFS originDB = Chemical.REFS.METACYC; for (String oid : this.organismModels.keySet()) { OrganismCompositionMongoWriter owriter = new OrganismCompositionMongoWriter(db, this.organismModels.get(oid), oid, originDB); owriter.write(); } } }