package org.nextprot.api.core.app.daganalyser; import grph.Grph; import grph.path.Path; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.log4j.Logger; import org.github.jamm.MemoryMeter; import org.nextprot.api.commons.constants.TerminologyCv; import org.nextprot.api.commons.utils.app.CommandLineSpringParser; import org.nextprot.api.commons.utils.app.ConsoleProgressBar; import org.nextprot.api.commons.utils.app.SpringBasedApp; import org.nextprot.api.core.domain.CvTerm; import org.nextprot.api.core.service.TerminologyService; import org.nextprot.api.core.utils.TerminologyUtils; import org.nextprot.api.core.utils.graph.OntologyDAG; import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintWriter; import java.text.DecimalFormat; import java.time.Instant; import java.time.temporal.ChronoUnit; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; /** * This app analyses the graph of all ontologies referenced by neXtProt * * <h3>About estimating Java Object Sizes with Instrumentation</h3> * Setting jamm as -javaagent is now optional * If instrumentation is available, use it, otherwise guess the size using sun.misc.Unsafe; if that is unavailable, * guess using predefined specifications * <pre>-javaagent: $path/jamm/target/jamm-0.3.2-SNAPSHOT.jar</pre> */ public class OntologyDAGAnalyserApp extends SpringBasedApp<OntologyDAGAnalyserApp.ArgumentParser> { private static final Logger LOGGER = Logger.getLogger(OntologyDAGAnalyserApp.class); private final static DecimalFormat DECIMAL_FORMAT = new DecimalFormat("#.##"); private TerminologyService terminologyService; private TerminologyCv[] terminologyCvs; private OntologyDAGAnalyserApp(String[] args) throws ParseException { super(args); terminologyCvs = TerminologyCv.values(); //terminologyCvs = new TerminologyCv[] {TerminologyCv.NciThesaurusCv}; } @Override public ArgumentParser newCommandLineParser() { return new ArgumentParser(OntologyDAGAnalyserApp.class.getSimpleName()); } @Override protected void execute() throws IOException { terminologyService = getBean(TerminologyService.class); System.out.println("*** write to cache timings..."); readWriteCache(false); System.out.println("*** access to cache timings..."); readWriteCache(true); System.out.println("*** calculate statistics..."); calcStatisticsForAllOntologies(); } private void calcStatisticsForAllOntologies() throws FileNotFoundException { Set<TerminologyCv> excludedOntology = EnumSet.of( TerminologyCv.NextprotCellosaurusCv, TerminologyCv.MeshAnatomyCv, TerminologyCv.MeshCv); PrintWriter pw = new PrintWriter(getCommandLineParser().getOutputDirectory()+"/dag-ontology.csv"); pw.write(getStatisticsHeaders().stream().collect(Collectors.joining(","))); pw.write(",building time (ms),TerminologyUtils.getAllAncestors() time (ms),OntologyDAG.getAncestors() time (ms)\n"); for (TerminologyCv terminologyCv : terminologyCvs) { Instant t1 = Instant.now(); // no cache here: create a new instance to access graph advanced methods OntologyDAG graph = new OntologyDAG(terminologyCv, terminologyService); long buildingTime = ChronoUnit.MILLIS.between(t1, Instant.now()); try { List<String> statistics = calcStatistics(graph, terminologyCv); statistics.add(new DecimalFormat("######.##").format(buildingTime)); if (!excludedOntology.contains(terminologyCv)) { statistics.addAll(benchmarkingGetAncestorsMethods(terminologyCv, terminologyService).stream().map(l -> Long.toString(l)).collect(Collectors.toList())); } else { statistics.addAll(Arrays.asList("NA", "NA")); } pw.write(statistics.stream().collect(Collectors.joining(","))); pw.write("\n"); pw.flush(); } catch (OntologyDAG.NotFoundInternalGraphException e) { throw new IllegalStateException(e); } } pw.close(); } private static List<String> getStatisticsHeaders() { return Arrays.asList("terminology", "nodes#", "edges#", "connected components#", "cycles#", "avg in-degree#", "avg out-degree#", "all paths#", "graph memory (KB)", "cv id to ancestors id memory (KB)", "cv id to accession memory (KB)", "precomputing time (ms)"); } private List<String> calcStatistics(OntologyDAG graph, TerminologyCv ontology) throws OntologyDAG.NotFoundInternalGraphException { // 1. git clone https://github.com/jbellis/jamm.git <path to>/ ; cd <path to>/jamm ; ant jar ; add dependency to this jar // 2. start the JVM with "-javaagent:<path to>/jamm.jar" MemoryMeter memMeter = new MemoryMeter().withGuessing(MemoryMeter.Guess.FALLBACK_BEST); long wholeGraphMemory = memMeter.measureDeep(graph); long ancestorsMemory = memMeter.measureDeep(graph.getCvTermIdAncestors()); long cvTermIdAccessionMemory = memMeter.measureDeep(graph.getCvTermIdByAccession()); Collection<Path> allPaths = graph.getAllPathsFromTransientGraph(); Instant t1 = Instant.now(); ConsoleProgressBar pb = ConsoleProgressBar.determinated(allPaths.size()); pb.setTaskName(ontology+ " paths"); pb.start(); for (Path path : allPaths) { graph.isAncestorOf(path.getSource(), path.getDestination()); pb.incrementValue(); } pb.stop(); long ms = ChronoUnit.MILLIS.between(t1, Instant.now()); Set<Path> cycles = graph.getAllCyclesFromTransientGraph(); if (!cycles.isEmpty()) { System.err.println("ERROR IN "+ontology + ": found "+cycles.size()+" cycles: "+cycles.stream() .map(path -> Arrays.stream(path.toVertexArray()) .boxed() .map(graph::getCvTermAccessionById) .collect(Collectors.joining(" > "))) .collect(Collectors.joining(", "))); } List<Number> stats = Arrays.asList(graph.countNodes(), graph.countEdgesFromTransientGraph(), graph.getConnectedComponentsFromTransientGraph().count(), cycles.size(), graph.getAverageDegreeFromTransientGraph(Grph.TYPE.vertex, Grph.DIRECTION.in), graph.getAverageDegreeFromTransientGraph(Grph.TYPE.vertex, Grph.DIRECTION.out), allPaths.size(), (int)Math.ceil(wholeGraphMemory/1024.), (int)Math.ceil(ancestorsMemory/1024.), (int)Math.ceil(cvTermIdAccessionMemory/1024.), ms); return Stream.concat(Stream.of(graph.getTerminologyCv().name()), stats.stream().map(DECIMAL_FORMAT::format)).collect(Collectors.toList()); } private void readWriteCache(boolean readCacheForSure) { Set<String> allCvTerms = new HashSet<>(); ConsoleProgressBar pb = ConsoleProgressBar.determinated(terminologyCvs.length); pb.setTaskName(((readCacheForSure) ? "read":"read/write")+" terminology-by-ontology cache"); pb.start(); Instant t = Instant.now(); for (TerminologyCv ontology : terminologyCvs) { allCvTerms.addAll(terminologyService.findCvTermsByOntology(ontology.name()).stream() .map(CvTerm::getAccession) .collect(Collectors.toSet())); pb.incrementValue(); } pb.stop(); System.out.println("\ttiming 'terminology-by-ontology': "+ChronoUnit.SECONDS.between(t, Instant.now()) + " s"); pb = ConsoleProgressBar.determinated(terminologyCvs.length); pb.setTaskName(((readCacheForSure) ? "read":"read/write")+" 'ontology-dag' cache"); pb.start(); t = Instant.now(); for (TerminologyCv ontology : terminologyCvs) { terminologyService.findOntologyGraph(ontology); pb.incrementValue(); } pb.stop(); System.out.println("\ttiming 'ontology-dag': "+ChronoUnit.SECONDS.between(t, Instant.now()) + " s"); pb = ConsoleProgressBar.determinated(allCvTerms.size()); pb.setTaskName(((readCacheForSure) ? "read":"read/write")+" 'terminology-by-accession' cache"); pb.start(); t = Instant.now(); for (String cvTerm : allCvTerms) { terminologyService.findCvTermByAccession(cvTerm); pb.incrementValue(); } pb.stop(); System.out.println("\ttiming 'terminology-by-accession': "+ChronoUnit.SECONDS.between(t, Instant.now()) + " s"); } private List<Long> benchmarkingGetAncestorsMethods(TerminologyCv terminologyCv, TerminologyService terminologyService) { List<Long> timings = new ArrayList<>(); OntologyDAG graph = new OntologyDAG(terminologyCv, terminologyService); Map<Long, List<String>> ancestors = new HashMap<>(); Map<Long, List<String>> ancestorsQuick = new HashMap<>(); List<CvTerm> cvTerms = terminologyService.findCvTermsByOntology(terminologyCv.name()); // COMPARE COMPUTATION DURATIONS Instant t = Instant.now(); for (CvTerm cvTerm : cvTerms) { ancestors.put(cvTerm.getId(), TerminologyUtils.getAllAncestorsAccession(cvTerm.getAccession(), terminologyService)); } timings.add(ChronoUnit.MILLIS.between(t, Instant.now())); t = Instant.now(); for (CvTerm cvTerm : cvTerms) { ancestorsQuick.put(cvTerm.getId(), Arrays.stream(graph.getAncestors(cvTerm.getId())).boxed() .map(graph::getCvTermAccessionById) .collect(Collectors.toList())); } timings.add(ChronoUnit.MILLIS.between(t, Instant.now())); // TEST CORRECTNESS Set<Long> ids = ancestors.keySet(); for (long id : ids) { Set<String> ancestorsOld = new HashSet<>(ancestors.get(id)); Set<String> ancestorsNew = new HashSet<>(ancestorsQuick.get(id)); boolean equals = ancestorsOld.equals(ancestorsNew); if (!equals) { System.err.println("WARNING: INCONSISTENCY: found different ancestors for cv term "+graph.getCvTermAccessionById(id) + "\n\t: old="+ ancestors.get(id) + "\n" + "\t: new="+ ancestorsNew); } } return timings; } /** * Parse arguments and provides MainConfig object * * Created by fnikitin on 09/08/16. */ static class ArgumentParser extends CommandLineSpringParser { private String outputDirectory; public ArgumentParser(String appName) { super(appName); } @Override protected Options createOptions() { Options options = super.createOptions(); //noinspection AccessStaticViaInstance options.addOption(OptionBuilder.withArgName("out").hasArg().withDescription("output directory").create("o")); return options; } @Override protected void parseOtherParams(CommandLine commandLine) { outputDirectory = (commandLine.hasOption("o")) ? commandLine.getOptionValue("o") : "./"; } public String getOutputDirectory() { return outputDirectory; } } /** * @param args contains mandatory and optional arguments * Mandatory : export-dir-path * Optional : * -p profile (by default: dev, cache) * -o output directory (/tmp by default) */ public static void main(String[] args) { try { new OntologyDAGAnalyserApp(args).run(); } catch(Exception e) { LOGGER.error(e.getMessage()+": exiting app"); e.printStackTrace(); System.exit(1); } } }