package org.bridgedb.tools.qc;
import java.io.File;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.bridgedb.DataSource;
import org.bridgedb.IDMapperException;
import org.bridgedb.Xref;
import org.bridgedb.bio.DataSourceTxt;
import org.bridgedb.bio.Organism;
import org.bridgedb.rdb.SimpleGdb;
import org.bridgedb.rdb.SimpleGdbFactory;
/**
* Utility to do simple quality control on a BridgeDerby database.
* Run with two parameters: [old database] and [new database]
* Some basic comparisons are performed as a sanity check, for example
* to detect that a whole identifier system has suddenly gone missing.
* <p>
* The script produces a report on stdout, lines starting with "INFO"
* are strictly informative, whereas lines starting with "WARNING" are
* problems worth investigating further. Ideally there are no "WARNING" lines
* in the report.
*/
public class BridgeQC
{
	/** Maximum number of offending ids printed by {@link #checkDatabaseSanity()}. */
	private static final int MAX_EXAMPLES = 8;

	/** File of the old (reference) database. */
	private final File oldDb;
	/** File of the new database that is being checked. */
	private final File newDb;
	/** Connection to the old database; set by {@link #initDatabases()}. */
	private SimpleGdb oldGdb;
	/** Connection to the new database; set by {@link #initDatabases()}. */
	private SimpleGdb newGdb;

	/**
	 * Creates a QC run for a pair of databases. No connection is opened here;
	 * that happens in {@link #initDatabases()}.
	 *
	 * @param f1 the old database file
	 * @param f2 the new database file
	 * @throws IDMapperException declared for backward compatibility with existing callers
	 */
	public BridgeQC(File f1, File f2) throws IDMapperException
	{
		oldDb = f1;
		newDb = f2;
	}

	/** Number of ids per data source in the old database, filled by {@link #compareDataSources()}. */
	Map<DataSource, Integer> oldSet = new HashMap<DataSource, Integer>();
	/** Number of ids per data source in the new database, filled by {@link #compareDataSources()}. */
	Map<DataSource, Integer> newSet = new HashMap<DataSource, Integer>();

	/**
	 * Opens both databases through a <code>jdbc:derby:jar:</code> URL built from the file paths.
	 *
	 * @throws IDMapperException if one of the databases can not be opened
	 */
	public void initDatabases() throws IDMapperException
	{
		oldGdb = SimpleGdbFactory.createInstance("old", derbyUrl(oldDb));
		newGdb = SimpleGdbFactory.createInstance("new", derbyUrl(newDb));
	}

	/** Builds the connection URL used for a database file. */
	private static String derbyUrl(File db)
	{
		return "jdbc:derby:jar:(" + db + ")database";
	}

	/**
	 * Compares the supported source data sources of both databases.
	 * <p>
	 * Reports data sources that exist in only one of the databases, data sources
	 * without ids in the new database, and for data sources present in both the
	 * number of added and removed ids. A WARNING is printed when the id count of a
	 * data source shrinks by more than 10%.
	 * <p>
	 * Set the system property <code>showRemovedIDs</code> to <code>true</code> to also
	 * list the removed ids.
	 *
	 * @throws IDMapperException if querying one of the databases fails
	 */
	public void compareDataSources() throws IDMapperException
	{
		for (DataSource ds : oldGdb.getCapabilities().getSupportedSrcDataSources())
		{
			oldSet.put (ds, oldGdb.getGeneCount(ds));
		}
		for (DataSource ds : newGdb.getCapabilities().getSupportedSrcDataSources())
		{
			newSet.put (ds, newGdb.getGeneCount(ds));
		}

		// not in new
		for (DataSource ds : oldSet.keySet())
		{
			if (!newSet.containsKey(ds))
			{
				System.out.println ("WARNING: " + ds.getSystemCode() + " is only in old database");
			}
		}

		// not in old, or present in both
		for (Map.Entry<DataSource, Integer> entry : newSet.entrySet())
		{
			DataSource ds = entry.getKey();
			int newGenes = entry.getValue();
			if (newGenes == 0)
			{
				System.out.println ("WARNING: " + ds.getSystemCode() + " has 0 ids");
			}
			if (!oldSet.containsKey(ds))
			{
				System.out.println ("INFO: " + ds.getSystemCode() + " is only in new database");
				System.out.printf ("INFO: Number of ids in %s: %d\n", ds.getSystemCode(), newGenes);
			}
			else
			{
				reportIdChanges(ds, oldSet.get(ds), newGenes);
			}
		}
	}

	/** Reports added / removed ids for a data source that is present in both databases. */
	private void reportIdChanges(DataSource ds, int oldGenes, int newGenes) throws IDMapperException
	{
		Set<String> oldIDs = collectIds(oldGdb, ds);
		Set<String> newIDs = collectIds(newGdb, ds);

		// determine all new IDs
		Set<String> newGenesAdded = new HashSet<String>(newIDs);
		newGenesAdded.removeAll(oldIDs);

		// determine all no longer existing (removed) IDs
		Set<String> genesRemoved = new HashSet<String>(oldIDs);
		genesRemoved.removeAll(newIDs);

		// relative change of the id count; floating point division, so an old
		// count of 0 yields NaN or Infinity rather than an exception
		double delta = (double)(newGenes - oldGenes) / (double)oldGenes;
		String code = ds.getSystemCode();
		String suffix = fullNameSuffix(ds);

		if (newGenesAdded.isEmpty() && genesRemoved.isEmpty())
		{
			System.out.printf(
					"INFO: Number of ids in %s%s: %d (unchanged)\n",
					code, suffix, newGenes
			);
		}
		else
		{
			System.out.printf(
					"INFO: Number of ids in %s%s: %d (%d added, %d removed -> overall changed %+3.1f%%)\n",
					code, suffix, newGenes,
					newGenesAdded.size(),
					genesRemoved.size(),
					(delta * 100)
			);
		}
		if (!genesRemoved.isEmpty() && "true".equals(System.getProperty("showRemovedIDs", "false")))
		{
			System.out.printf(
					"INFO: The ids removed from %s%s: %s\n",
					code, suffix, "" + genesRemoved
			);
		}
		if (delta < -0.1)
		{
			System.out.println ("WARNING: Number of ids in " + code + " has shrunk by more than 10%");
		}
	}

	/** Collects the ids of all Xrefs that the given database iterates for a data source. */
	private static Set<String> collectIds(SimpleGdb gdb, DataSource ds) throws IDMapperException
	{
		Set<String> ids = new HashSet<String>();
		for (Xref xref : gdb.getIterator(ds))
		{
			ids.add(xref.getId());
		}
		return ids;
	}

	/** Returns " (full name)" for a data source with a non-empty full name, otherwise "". */
	private static String fullNameSuffix(DataSource ds)
	{
		String fullName = ds.getFullName();
		return (fullName != null && fullName.length() > 0) ? " (" + fullName + ")" : "";
	}

	/**
	 * Placeholder for a comparison of cross-link consistency; currently does nothing.
	 *
	 * @throws SQLException declared for backward compatibility with existing callers
	 */
	public void compareLinks() throws SQLException
	{
		//TODO ... do something to compare cross-link consistency ...
	}

	/**
	 * Checks the new database for ids that occur in the 'link' table but not in the
	 * 'datanode' table. Zero results are expected; otherwise up to
	 * {@value #MAX_EXAMPLES} examples are printed in ERROR lines.
	 *
	 * @throws SQLException if the query fails
	 */
	public void checkDatabaseSanity() throws SQLException
	{
		Connection con = newGdb.getConnection();
		/* check for ids that occur in the link table but not in datanode table. We expect zero results */
		String sql = "select coderight, idright from link left outer join datanode on link.idright = datanode.id and link.coderight = datanode.code where datanode.code IS NULL";

		// The connection belongs to newGdb and stays open; only the statement
		// and result set created here are released.
		Statement st = con.createStatement();
		try
		{
			ResultSet rs = st.executeQuery(sql);
			try
			{
				if (rs.next())
				{
					System.out.println ("ERROR: 'link' table contains ids that do not occur in 'datanode' table.");
					System.out.print ("ERROR: A few examples: ");
					String sep = "";
					int shown = 0;
					do
					{
						System.out.print (sep + rs.getString(1) + ":" + rs.getString(2));
						sep = ", ";
						shown++;
					}
					while (shown < MAX_EXAMPLES && rs.next());
					System.out.println();
					System.out.println ("ERROR: These ids will not map properly.");
				}
			}
			finally
			{
				rs.close();
			}
		}
		finally
		{
			st.close();
		}
	}

	/**
	 * Reports the size of the new database file and its change relative to the old one.
	 *
	 * @throws SQLException declared for backward compatibility with existing callers
	 */
	public void compareFileSizes() throws SQLException
	{
		long oldSize = oldDb.length();
		long newSize = newDb.length();
		System.out.printf ("INFO: new size is %d Mb (changed %+3.1f%%)\n", newSize / 1000000,
				(double)(newSize - oldSize) / (double)oldSize * 100);
	}

	/**
	 * Compares the attribute sets of both databases. Attributes missing from the new
	 * database give a WARNING; every attribute of the new database is listed as INFO.
	 *
	 * @throws IDMapperException if the attribute sets can not be obtained
	 */
	public void compareAttributes() throws IDMapperException
	{
		Set<String> oldAttrSet = oldGdb.getAttributeSet();
		Set<String> newAttrSet = newGdb.getAttributeSet();
		for (String oldAttr : oldAttrSet)
		{
			if (!newAttrSet.contains(oldAttr))
			{
				System.out.println ("WARNING: Attribute " + oldAttr + " only in old database");
			}
		}
		for (String newAttr : newAttrSet)
		{
			System.out.println ("INFO: Attribute provided: " + newAttr);
			if (!oldAttrSet.contains(newAttr))
			{
				System.out.println ("INFO: Attribute " + newAttr + " only in new database");
			}
		}
	}

	/**
	 * Null-safe equality.
	 *
	 * @param a first object, may be null
	 * @param b second object, may be null
	 * @return true if both are null, or if <code>a.equals(b)</code>
	 */
	public static boolean safeEquals (Object a, Object b)
	{
		return a == null ? b == null : a.equals(b);
	}

	/** A property-specific check of the old versus the new value of a database property. */
	public interface PropertyChecker
	{
		/**
		 * @param oldVal value of the property in the old database, may be null
		 * @param newVal value of the property in the new database, may be null
		 */
		void check(String oldVal, String newVal);
	}

	/**
	 * The database properties that are checked. The enum constant name is used as the
	 * property key. Each property defines whether it must be the same in both databases,
	 * whether it must be defined in the new database, and an optional specific check.
	 */
	enum Props implements PropertyChecker
	{
		ORGANISM (true, false) {
			public void check(String oldVal, String newVal)
			{
				if (newVal != null)
				{
					Organism o = Organism.fromLatinName(newVal);
					if (o == null) System.out.println ("WARNING: species '" + newVal + "' is not a recognized latin name");
				}
			}
		},
		DATASOURCENAME (true, true) {
			public void check(String oldVal, String newVal) {}
		},
		SERIES (true, true) {
			public void check(String oldVal, String newVal) {}
		},
		DATATYPE (true, true) {
			public void check(String oldVal, String newVal) {}
		},
		DATASOURCEVERSION (false, true) {
			public void check(String oldVal, String newVal) {}
		},
		BUILDDATE (false, true) {
			public void check(String oldVal, String newVal)
			{
				// SimpleDateFormat is not thread-safe, so use a fresh local instance
				SimpleDateFormat sft = new SimpleDateFormat("yyyyMMdd");
				Date oldDate = parseDate(sft, oldVal);
				Date newDate = parseDate(sft, newVal);
				if (oldDate != null && newDate != null && oldDate.after(newDate))
				{
					System.out.println ("ERROR: new date " + newVal + " is older than old date " + oldVal);
				}
			}

			/**
			 * Parses a build date, reporting the offending value itself when it can
			 * not be parsed. Returns null for a null or unparseable value.
			 */
			private Date parseDate(SimpleDateFormat sft, String val)
			{
				if (val == null) return null;
				try
				{
					return sft.parse(val);
				}
				catch (ParseException e)
				{
					System.out.println ("ERROR: " + val + " does not match pattern yyyymmdd");
					return null;
				}
			}
		},
		SCHEMAVERSION (false, true) {
			public void check(String oldVal, String newVal) {}
		},
		;

		private final boolean mustBeSame;
		private final boolean mustBeDefined;

		/**
		 * @param mustBeSame warn when the old and new value differ
		 * @param mustBeDefined warn when the new value is null or empty
		 */
		Props(boolean mustBeSame, boolean mustBeDefined)
		{
			this.mustBeSame = mustBeSame;
			this.mustBeDefined = mustBeDefined;
		}

		/**
		 * Runs the generic checks (same / defined) and then the property-specific check.
		 *
		 * @param oldVal value of the property in the old database, may be null
		 * @param newVal value of the property in the new database, may be null
		 */
		public void checkWrap(String oldVal, String newVal)
		{
			if (mustBeSame && !safeEquals (oldVal, newVal))
				System.out.println ("WARNING: old " + name() + " '" + oldVal + "' doesn't match new " + name() + " '" + newVal + "'");
			if (mustBeDefined && (newVal == null || newVal.equals("")))
				System.out.println ("WARNING: property " + name() + " is undefined");
			check(oldVal, newVal);
		}
	}

	/** Runs the checks of every {@link Props} constant on the properties of both databases. */
	public void compareInfo()
	{
		for (Props p : Props.values())
		{
			String key = p.name();
			p.checkWrap(oldGdb.getCapabilities().getProperty(key),
					newGdb.getCapabilities().getProperty(key));
		}
	}

	/**
	 * Opens both databases and runs all checks, printing the report on stdout.
	 *
	 * @throws IDMapperException if a database can not be opened or queried
	 * @throws SQLException if a direct SQL query fails
	 */
	public void run() throws IDMapperException, SQLException
	{
		initDatabases();
		checkDatabaseSanity();
		compareInfo();
		compareDataSources();
		compareLinks();
		compareAttributes();
		compareFileSizes();
		summarizeOverallStats();
	}

	/** Prints the total identifier count and link count reported by the new database. */
	private void summarizeOverallStats() throws IDMapperException
	{
		System.out.println("INFO: total number of identifiers is " + newGdb.getGeneCount());
		System.out.println("INFO: total number of mappings is " + newGdb.getLinkCount());
	}

	/** Prints a short usage message on stdout. */
	public static void printUsage()
	{
		System.out.println ("Expected 2 arguments: <old database> <new database>");
	}

	/**
	 * Command line entry point.
	 *
	 * @param args exactly two arguments: the old database file and the new database file
	 * @throws IDMapperException if a database can not be opened or queried
	 * @throws SQLException if a direct SQL query fails
	 */
	public static void main(String[] args) throws IDMapperException, SQLException
	{
		if (args.length != 2) { printUsage(); return; }
		BridgeQC main = new BridgeQC (new File(args[0]), new File(args[1]));
		DataSourceTxt.init();
		main.run();

		PatternChecker checker = new PatternChecker();
		// NOTE(review): this runs the pattern check on args[0], the OLD database.
		// Confirm whether the new database (args[1]) was intended.
		checker.run(new File(args[0]));
	}
}