// BridgeDb,
// An abstraction layer for identifier mapping services, both local and online.
// Copyright 2006-2009 BridgeDb developers
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package org.bridgedb.tools.batchmapper;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import org.bridgedb.BridgeDb;
import org.bridgedb.IDMapperException;
import org.bridgedb.DataSource;
import org.bridgedb.IDMapperStack;
import org.bridgedb.Xref;
import org.bridgedb.bio.BioDataSource;

/**
 * Command-line tool that maps the identifiers found in one column of a
 * tab-separated input to another data source, using one or more mappers
 * given with the -g and -t options.
 * <p>
 * With the -ls option the tool lists the known data sources instead.
 */
public class BatchMapper
{
	/** Plain holder for the values collected from the command line. */
	private static class Settings
	{
		/** Input file (-i); null means standard input. */
		File fInput = null;
		/** Output file (-o); null means standard output. */
		File fOutput = null;
		/** Report file (-r); null means standard output. */
		File fReport = null;
		/** One connection string per -g / -t option, in command-line order. */
		List<String> connectStrings = new ArrayList<String>();
		/** Data source of the input identifiers (-is). */
		DataSource is = null;
		/** Data source to map to (-os). */
		DataSource os = null;
		/** 0-based index of the input column holding the identifier (-c). */
		int inputColumn = 0;
		int verbose = 0; // 0, 1 or 2
		int mode = 0; // 0 (map identifiers) or 1 (list system codes, -ls)
		int multiMap = 0; // 0 or 1
	}

	/**
	 * Program entry point.
	 * @param args command-line arguments, see {@link #printUsage()}
	 */
	public static void main(String[] args)
	{
		BatchMapper mapper = new BatchMapper();
		mapper.run(args);
	}

	/**
	 * Prints the tool version followed by the usage summary to standard output.
	 * The version is read from the BridgeDb.properties resource located
	 * relative to the BridgeDb class.
	 */
	public void printUsage()
	{
		String version = "";
		try
		{
			// getResourceAsStream returns null when the resource is absent;
			// Properties.load would then fail with a NullPointerException.
			InputStream in = BridgeDb.class.getResourceAsStream("BridgeDb.properties");
			if (in == null)
			{
				version = "unknown";
			}
			else
			{
				try
				{
					Properties props = new Properties();
					props.load (in);
					version = props.getProperty("bridgedb.version") +
						" (r" + props.getProperty("REVISION") + ")";
				}
				finally
				{
					in.close();
				}
			}
		}
		catch (IOException ex)
		{
			version = ex.getMessage();
		}
		System.out.println ("BatchMapper version " + version);
		System.out.print (
				"BatchMapper is a tool for mapping biological identifiers.\n" +
				"Usage:\n"+
				" batchmapper -ls \n" +
				" List system codes \n" +
				" or\n" +
				" batchmapper \n" +
				" [-v|-vv] \n" +
				" [-mm] \n" +
				" [-g <gene database>] \n " +
				" [-t <biomart text file>] \n " +
				" [-i <input file>] \n" +
				" -is <input system code or datasource name> \n" +
				" -os <output system code or datasource name> \n" +
				" [-o <output file>] \n" +
				" [-c <input column, 0-based>]\n" +
				" [-r <report file>] \n" +
				"\n" +
				"You should specify at least one -g or -t option.\n" +
				"Multiple -g or -t options will be combined transitively.\n");
	}

	/**
	 * Resolves a command-line argument to a DataSource.
	 * The registered data sources are searched for one whose system code or
	 * full name equals the argument. When none matches, a warning is printed
	 * and the result of DataSource.getByFullName is returned instead.
	 * @param arg system code or full name as typed by the user
	 * @return the matching DataSource, or whatever getByFullName yields
	 */
	private DataSource dsFromArg(String arg)
	{
		for (DataSource ds : DataSource.getDataSources())
		{
			if (arg.equals (ds.getSystemCode()) || arg.equals(ds.getFullName()))
			{
				return ds;
			}
		}
		System.out.println ("WARNING: " + arg + " is not a standard system code or DataSource name");
		return DataSource.getByFullName(arg);
	}

	/**
	 * Parses the command line into the given settings object.
	 * @param settings receives the parsed values
	 * @param args command-line arguments
	 * @return null when the arguments are valid, otherwise an error message
	 */
	public String parseArgs(Settings settings, String[] args)
	{
		int pos = 0;
		while (pos < args.length)
		{
			if (args[pos].equals ("-ls"))
			{
				settings.mode = 1;
			}
			else if (args[pos].equals ("-v"))
			{
				settings.verbose = 1;
			}
			else if (args[pos].equals("-vv"))
			{
				settings.verbose = 2;
			}
			else if (args[pos].equals("-g"))
			{
				pos++;
				// ">=": pos == args.length means the option was the last argument
				if (pos >= args.length) return "File expected after -g";
				File f = new File (args[pos]);
				if (!f.exists()) return "File " + args[pos] + " does not exist";
				settings.connectStrings.add ("idmapper-pgdb:" + f.getAbsolutePath());
			}
			else if (args[pos].equals("-t"))
			{
				pos++;
				if (pos >= args.length) return "File expected after -t";
				File f = new File (args[pos]);
				if (!f.exists()) return "File " + args[pos] + " does not exist";
				try
				{
					// NOTE(review): File.toURL() is deprecated; kept because
					// toURI().toURL() escapes characters differently and the
					// consumer of this connection string is not visible here.
					settings.connectStrings.add ("idmapper-text:" + f.toURL());
				}
				catch (MalformedURLException ex)
				{
					return ex.getMessage();
				}
			}
			else if (args[pos].equals("-i"))
			{
				pos++;
				if (pos >= args.length) return "File expected after -i";
				settings.fInput = new File (args[pos]);
				if (!settings.fInput.exists()) return "File " + args[pos] + " does not exist";
			}
			else if (args[pos].equals("-r"))
			{
				pos++;
				if (pos >= args.length) return "File expected after -r";
				settings.fReport = new File (args[pos]);
			}
			else if (args[pos].equals("-c"))
			{
				pos++;
				if (pos >= args.length) return "Column number expected after -c";
				try
				{
					settings.inputColumn = Integer.parseInt (args[pos]);
				}
				catch (NumberFormatException ex)
				{
					return ex.getMessage();
				}
				// a negative index would fail later when the column is read
				if (settings.inputColumn < 0) return "Column number after -c must be 0 or higher";
			}
			else if (args[pos].equals("-o"))
			{
				pos++;
				if (pos >= args.length) return "File expected after -o";
				settings.fOutput = new File (args[pos]);
			}
			else if (args[pos].equals("-is"))
			{
				pos++;
				if (pos >= args.length) return "System code expected after -is";
				settings.is = dsFromArg(args[pos]);
			}
			else if (args[pos].equals("-os"))
			{
				pos++;
				if (pos >= args.length) return "System code expected after -os";
				settings.os = dsFromArg(args[pos]);
			}
			else if (args[pos].equals("-mm"))
			{
				settings.multiMap = 1;
			}
			else
			{
				return "Unrecognized option " + args[pos];
			}
			pos++;
		}

		if (settings.mode == 1)
		{
			// listing mode does not accept any of the mapping options
			if (settings.is != null ||
				settings.os != null ||
				settings.connectStrings.size() > 0 ||
				settings.fInput != null ||
				settings.fOutput != null ||
				settings.inputColumn != 0 ||
				settings.multiMap != 0 ||
				settings.fReport != null)
			{
				return "-ls option can't be combined with -g, -t, -i, -is, -os, -o, -mm or -r options";
			}
		}
		else
		{
			if (settings.connectStrings.size() == 0) return "Missing -t or -g options";
			if (settings.is == null) return "Missing -is option";
			if (settings.os == null) return "Missing -os option";
		}
		return null;
	}

	/**
	 * Performs the actual mapping: connects the mappers, writes each input
	 * line prefixed with its mapped identifier(s), and prints a report.
	 */
	public static class Mapper
	{
		private List<String> connections = null;
		private File fInput = null;
		private File fOutput = null;
		private File fReport = null;
		private DataSource is = null;
		private DataSource os = null;
		private int inputColumn = 0;
		private int verbose = 0; // 0, 1 or 2
		private int multiMap = 0; // 0 or 1

		/** Destination of the report; replaced by a file stream in run() when fReport is set. */
		PrintStream report = System.out;
		private IDMapperStack gdb;
		/** Source references for which no mapping was found. */
		private List<Xref> missing = new ArrayList<Xref>();
		/** Source references that mapped to two or more targets. */
		private List<Xref> ambiguous = new ArrayList<Xref>();
		/** Number of lines that had an identifier in the input column. */
		int totalLines = 0;
		/** Number of lines with at least one mapping; ambiguous lines are included. */
		int okLines = 0;

		/**
		 * @param connections connection strings of the mappers to combine
		 * @param fInput input file, or null to read standard input
		 * @param fOutput output file, or null to write standard output
		 * @param fReport report file, or null to report on standard output
		 * @param is data source of the input identifiers
		 * @param os data source to map to
		 * @param inputColumn 0-based index of the tab-separated column holding the identifier
		 * @param verbose 0, 1 or 2; from 1 upwards the report lists the individual ids
		 * @param multiMap 0 to write only the first mapping, otherwise all mappings
		 */
		public Mapper(List<String> connections, File fInput, File fOutput, File fReport,
				DataSource is, DataSource os, int inputColumn, int verbose, int multiMap)
		{
			this.connections = connections;
			this.fInput = fInput;
			this.fOutput = fOutput;
			this.fReport = fReport;
			this.is = is;
			this.os = os;
			this.inputColumn = inputColumn;
			this.verbose = verbose;
			this.multiMap = multiMap;
		}

		/** Creates the mapper stack from all connection strings and makes it transitive. */
		private void connectGdb() throws IDMapperException
		{
			gdb = new IDMapperStack();
			for (String connectionString : connections)
			{
				gdb.addIDMapper(connectionString);
			}
			gdb.setTransitive(true);
		}

		/**
		 * Reads the input line by line and writes every line back, prefixed
		 * with the mapped identifier(s) and a tab. Lines without a mapping, or
		 * without the requested column, get an empty prefix.
		 * <p>
		 * Files opened here are closed even when mapping fails. Standard
		 * input and output are left open; standard output is only flushed,
		 * so that a report printed to it afterwards is not lost.
		 * @throws IOException when reading the input or opening a file fails
		 * @throws IDMapperException when the mapper stack fails
		 */
		public void writeMapping() throws IOException, IDMapperException
		{
			LineNumberReader reader = null;
			PrintWriter writer = null;
			try
			{
				if (fInput != null)
				{
					reader = new LineNumberReader(new FileReader (fInput));
				}
				else
				{
					reader = new LineNumberReader(new InputStreamReader(System.in));
				}
				if (fOutput != null)
				{
					writer = new PrintWriter (new FileWriter (fOutput));
				}
				else
				{
					writer = new PrintWriter (System.out);
				}

				String line;
				while ((line = reader.readLine()) != null)
				{
					String[] fields = line.split("\t");
					if (fields.length > inputColumn && fields[inputColumn] != null)
					{
						Xref srcRef = new Xref(fields[inputColumn], is);
						Set<Xref> srcSet = new HashSet<Xref>();
						srcSet.add(srcRef);
						Map<Xref, Set<Xref>> mapresult = gdb.mapID(srcSet, os);
						Set<Xref> destRefs = mapresult.get (srcRef);

						if (destRefs == null || destRefs.size() == 0)
						{
							missing.add (srcRef);
						}
						else
						{
							if (destRefs.size() >= 2)
							{
								ambiguous.add (srcRef);
							}
							okLines++;
							writeTargets (writer, destRefs);
						}
						totalLines++;
					}
					writer.println("\t" + line);
				}
			}
			finally
			{
				if (writer != null)
				{
					// Closing a writer on System.out would close System.out itself.
					if (fOutput != null) writer.close(); else writer.flush();
				}
				if (reader != null && fInput != null)
				{
					reader.close();
				}
			}
		}

		/** Writes the first target id, or all of them joined by " /// " when multiMap is on. */
		private void writeTargets(PrintWriter writer, Set<Xref> destRefs)
		{
			if (multiMap == 0)
			{
				// use first one
				writer.print(destRefs.iterator().next().getId());
			}
			else
			{
				// concatenate all, with " /// " as separator
				boolean first = true;
				for (Xref ref : destRefs)
				{
					if (first)
					{
						first = false;
					}
					else
					{
						writer.print (" /// ");
					}
					writer.print(ref.getId());
				}
			}
		}

		/**
		 * Prints the mapping counts to the report stream and, when verbose is
		 * 1 or higher, the individual missing and ambiguous ids.
		 */
		public void reportMapping()
		{
			report.println ("Missing : " + missing.size());
			report.println ("Ambiguous : " + ambiguous.size());
			report.println ("Ok : " + okLines);
			report.println (" _______ +");
			report.println ("Total : " + totalLines);
			report.println();
			if (verbose >= 1)
			{
				// missing id's
				printIdList ("Missing id's:", missing);
				// ambiguous id's
				printIdList ("Ambiguous id's:", ambiguous);
			}
		}

		/** Prints a heading followed by the refs, comma separated, five per line. */
		private void printIdList(String heading, List<Xref> refs)
		{
			report.println (heading);
			for (int i = 0; i < refs.size(); ++i)
			{
				report.print (refs.get(i));
				if (i < refs.size()-1) report.print (", ");
				if (i % 5 == 4) report.println();
			}
			report.println();
		}

		/**
		 * Runs the complete job: opens the report file if one was given,
		 * connects the mappers, writes the mapping and prints the report.
		 * IOException and IDMapperException are printed as a stack trace and
		 * not propagated.
		 */
		public void run()
		{
			try
			{
				if (fReport != null)
				{
					report = new PrintStream(new FileOutputStream(fReport));
				}
				try
				{
					connectGdb();
					writeMapping();
					reportMapping();
				}
				finally
				{
					// release the report file even when mapping failed
					if (fReport != null) report.close(); else report.flush();
				}
			}
			catch (IOException ex)
			{
				ex.printStackTrace();
			}
			catch (IDMapperException ex)
			{
				ex.printStackTrace();
			}
		}
	}

	/**
	 * Prints system code, full name and example id of every registered data
	 * source to standard output, sorted by system code.
	 */
	public void reportSystemCodes()
	{
		List<DataSource> sortedList = new ArrayList<DataSource>();
		sortedList.addAll (DataSource.getDataSources());
		Collections.sort (sortedList, new Comparator<DataSource>() {
			public int compare(DataSource a, DataSource b)
			{
				String codeA = a.getSystemCode();
				String codeB = b.getSystemCode();
				// NOTE(review): guards against data sources without a system
				// code, which would otherwise make compareTo throw; such
				// entries sort first. Confirm whether the registry can hold them.
				if (codeA == null) return codeB == null ? 0 : -1;
				if (codeB == null) return 1;
				return codeA.compareTo(codeB);
			}} );
		for (DataSource ds : sortedList)
		{
			System.out.printf("%4s %-20s %-40s\n", ds.getSystemCode(), ds.getFullName(), ds.getExample().getId());
		}
	}

	/**
	 * Parses the arguments and either maps identifiers or lists the system
	 * codes. On invalid arguments the error and usage are printed and the
	 * JVM exits with status 1.
	 * @param args command-line arguments
	 */
	public void run(String[] args)
	{
		BioDataSource.init();
		Settings settings = new Settings();
		String error = parseArgs(settings, args);
		if (error != null)
		{
			System.err.println ("Error: " + error);
			printUsage();
			System.exit(1);
		}

		try
		{
			// load the mapper classes by name before any connection string is used
			Class.forName("org.bridgedb.file.IDMapperText");
			Class.forName("org.bridgedb.rdb.IDMapperRdb");
		}
		catch (ClassNotFoundException ex)
		{
			ex.printStackTrace(); //TODO: better exception handling
		}

		if (settings.mode == 0)
		{
			Mapper mapper = new Mapper(
					settings.connectStrings, settings.fInput, settings.fOutput,
					settings.fReport, settings.is, settings.os,
					settings.inputColumn, settings.verbose, settings.multiMap);
			mapper.run();
		}
		else
		{
			reportSystemCodes();
		}
	}
}