package org.chesmapper.view.cluster; import java.awt.BorderLayout; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Properties; import java.util.Set; import javax.swing.JFileChooser; import javax.swing.JOptionPane; import javax.swing.JPanel; import javax.swing.border.EmptyBorder; import org.chesmapper.map.alg.embed3d.ThreeDEmbedder; import org.chesmapper.map.alg.embed3d.WekaPCA3DEmbedder; import org.chesmapper.map.alg.embed3d.r.Sammon3DEmbedder; import org.chesmapper.map.data.ClusteringData; import org.chesmapper.map.data.DatasetFile; import org.chesmapper.map.dataInterface.CompoundData; import org.chesmapper.map.dataInterface.CompoundProperty; import org.chesmapper.map.dataInterface.CompoundPropertyUtil; import org.chesmapper.map.dataInterface.NominalProperty; import org.chesmapper.map.dataInterface.NumericProperty; import org.chesmapper.map.main.CheSMapping; import org.chesmapper.map.main.Settings; import org.chesmapper.map.workflow.MappingWorkflow; import org.chesmapper.map.workflow.MappingWorkflow.DescriptorSelection; import org.chesmapper.map.workflow.MappingWorkflow.FragmentSettings; import org.chesmapper.view.gui.LaunchCheSMapper; import org.mg.javalib.gui.Message; import org.mg.javalib.gui.MessageLabel; import org.mg.javalib.io.SDFUtil; import org.mg.javalib.util.ArrayUtil; import org.mg.javalib.util.DoubleKeyHashMap; import org.mg.javalib.util.FileUtil; import org.mg.javalib.util.ObjectUtil; import org.mg.javalib.util.StringUtil; public class ExportData { public static void exportAll(Clustering clustering, CompoundProperty compoundDescriptorFeature, Script script) { List<Integer> l = new ArrayList<Integer>(); for (Compound m : clustering.getCompounds(false)) l.add(m.getOrigIndex()); exportCompoundsWithOrigIndices(clustering, l, compoundDescriptorFeature, script); } public static void exportClusters(Clustering clustering, int clusterIndices[], CompoundProperty compoundDescriptorFeature) { List<Integer> l = new ArrayList<Integer>(); for (int i = 0; i < clusterIndices.length; i++) for (Compound m : clustering.getCluster(clusterIndices[i]).getCompounds()) l.add(m.getOrigIndex()); exportCompoundsWithOrigIndices(clustering, l, compoundDescriptorFeature, null); } public static void exportCompoundsWithOrigIndices(Clustering clustering, List<Integer> compoundOrigIndices, CompoundProperty compoundDescriptorFeature) { exportCompoundsWithOrigIndices(clustering, ArrayUtil.toPrimitiveIntArray(compoundOrigIndices), compoundDescriptorFeature, null); } public static void exportCompoundsWithOrigIndices(Clustering clustering, List<Integer> compoundOrigIndices, CompoundProperty compoundDescriptorFeature, Script script) { exportCompoundsWithOrigIndices(clustering, ArrayUtil.toPrimitiveIntArray(compoundOrigIndices), compoundDescriptorFeature, script); } public static void exportCompoundsWithOrigIndices(Clustering clustering, int compoundOrigIndices[], CompoundProperty compoundDescriptorFeature) { exportCompoundsWithOrigIndices(clustering, compoundOrigIndices, compoundDescriptorFeature, null); } public static class Script { String dest; boolean allFeatures; public boolean skipEqualValues; public double skipNullValueRatio; public Script(String dest, boolean allFeatures, boolean skipEqualValues, double skipNullValueRatio) { this.dest = dest; this.allFeatures = allFeatures; this.skipEqualValues = skipEqualValues; this.skipNullValueRatio = skipNullValueRatio; } } public static void exportCompoundsWithOrigIndices(Clustering clustering, int compoundOrigIndices[], CompoundProperty compoundDescriptorFeature, Script script) { CompoundProperty selectedProps[]; if (script != null && script.allFeatures) { List<CompoundProperty> availableProps = new ArrayList<CompoundProperty>(); for (CompoundProperty p : clustering.getProperties()) availableProps.add(p); for (CompoundProperty p : clustering.getFeatures()) availableProps.add(p); for (CompoundProperty p : clustering.getAdditionalProperties()) if (p instanceof DistanceToProperty) availableProps.add(p); // when scripting do not add embedding stress if (availableProps.size() == 0) // no features to select selectedProps = new CompoundProperty[0]; selectedProps = ArrayUtil.toArray(CompoundProperty.class, availableProps); } else { selectedProps = ArrayUtil.toArray(CompoundProperty.class, clustering.selectPropertiesAndFeaturesWithDialog( "Select features for SDF/CSV export", null, true, true, true, true)); if (selectedProps == null)//pressed cancel return; } DoubleKeyHashMap<Integer, String, Object> featureValues = new DoubleKeyHashMap<Integer, String, Object>(); for (Integer j : compoundOrigIndices) { if (clustering.numClusters() > 1) { if (clustering.isClusterAlgorithmDisjoint()) { Compound m = null; for (Cluster c : clustering.getClusters()) for (Compound mm : c.getCompounds()) if (mm.getOrigIndex() == j) { m = mm; break; } featureValues.put(j, (clustering.getClusterAlgorithm() + " cluster assignement").replace(' ', '_'), clustering.getClusterIndexForCompound(m)); } else { for (Cluster c : clustering.getClusters()) { if (!c.containsNotClusteredCompounds()) { Compound m = null; for (Compound mm : c.getCompounds()) if (mm.getOrigIndex() == j) { m = mm; break; } featureValues.put(j, (clustering.getClusterAlgorithm() + " " + c.getName()).replace(' ', '_'), m == null ? 0 : 1); } } } } for (CompoundProperty p : selectedProps) if (!p.getName().matches("(?i)smiles")) { String prop = CompoundPropertyUtil.propToExportString(p); Object val; if (p instanceof NumericProperty) { val = clustering.getCompounds().get(j).getDoubleValue((NumericProperty) p); if (val != null && ((NumericProperty) p).isInteger()) val = StringUtil.formatDouble((Double) val, 0); } else val = clustering.getCompounds().get(j).getStringValue((NominalProperty) p); if (val == null) val = ""; featureValues.put(j, prop, val); } } List<String> skipRedundant = new ArrayList<String>(); for (CompoundProperty p : CompoundPropertyUtil.getRedundantFeatures(ArrayUtil.toList(selectedProps), compoundOrigIndices).keySet()) skipRedundant.add(CompoundPropertyUtil.propToExportString(p)); List<String> skipUniform = new ArrayList<String>(); List<String> skipNull = new ArrayList<String>(); List<String> skip = new ArrayList<String>(); if (featureValues.keySet1().size() > 0 && featureValues.keySet2(compoundOrigIndices[0]).size() > 0) for (String prop : featureValues.keySet2(compoundOrigIndices[0])) { boolean uniform = true; int nullValueCount = 0; boolean first = true; Object val = null; for (Integer j : compoundOrigIndices) { Object newVal = featureValues.get(j, prop); if ((newVal == null || newVal.equals("") || new Double(Double.NaN).equals(newVal))) nullValueCount++; if (first) { first = false; val = newVal; } else { if (!ObjectUtil.equals(val, newVal)) uniform = false; } } if (uniform && compoundOrigIndices.length > 1) skipUniform.add(prop); if (nullValueCount > 0) { double ratio = 0; if (compoundOrigIndices.length > 1) ratio = nullValueCount / (double) compoundOrigIndices.length; if (script == null || ratio > script.skipNullValueRatio) { if (script != null) Settings.LOGGER.info("null value ratio " + ratio + " > " + script.skipNullValueRatio + ", skipping from export: " + prop + " "); skipNull.add(prop); } } } if (skipUniform.size() > 0) { boolean doSkip; if (script != null) doSkip = script.skipEqualValues; else { String msg = skipUniform.size() + "/" + featureValues.keySet2(compoundOrigIndices[0]).size() + " feature/s have equal values for each compound.\nThese feature/s contain no information to distiguish between compounds.\nSkip from export?"; int sel = JOptionPane.showConfirmDialog(Settings.TOP_LEVEL_FRAME, msg, "Skip feature", JOptionPane.YES_NO_OPTION); doSkip = sel == JOptionPane.YES_OPTION; } if (doSkip) for (String p : skipUniform) { if (skipNull.contains(p)) skipNull.remove(p); if (skipRedundant.contains(p)) skipRedundant.remove(p); Settings.LOGGER.info("uniform values, skipping from export: " + p + " "); skip.add(p); } } if (skipNull.size() > 0) { boolean doSkip; if (script != null) doSkip = true; else { String msg = skipNull.size() + "/" + (featureValues.keySet2(compoundOrigIndices[0]).size() - skip.size()) + " feature/s have missing values.\nMissing values might cause problems when post-processing the exported compounds.\nSkip from export?"; int sel = JOptionPane.showConfirmDialog(Settings.TOP_LEVEL_FRAME, msg, "Skip feature", JOptionPane.YES_NO_OPTION); doSkip = sel == JOptionPane.YES_OPTION; } if (doSkip) for (String p : skipNull) { if (script == null) Settings.LOGGER.info("null values, skipping from export: " + p + " "); if (skipRedundant.contains(p)) skipRedundant.remove(p); skip.add(p); } } if (skipRedundant.size() > 0) { boolean doSkip; if (script != null) doSkip = Settings.SKIP_REDUNDANT_FEATURES; else { String msg = skipRedundant.size() + "/" + (featureValues.keySet2(compoundOrigIndices[0]).size() - skip.size()) + " feature/s are redundant.\nThe information encoded in these feature/s is already provided by other feature/s.\nSkip from export?"; int sel = JOptionPane.showConfirmDialog(Settings.TOP_LEVEL_FRAME, msg, "Skip feature", JOptionPane.YES_NO_OPTION); doSkip = sel == JOptionPane.YES_OPTION; } if (doSkip) for (String p : skipRedundant) { if (script == null) Settings.LOGGER.info("redundant values, skipping from export: " + p + " "); skip.add(p); } } for (String p : skip) { for (Integer j : compoundOrigIndices) featureValues.remove(j, p); } String dest; if (script != null) dest = script.dest; else { String dir = clustering.getOrigLocalPath(); if (dir == null) dir = System.getProperty("user.home"); JFileChooser f = new JFileChooser(dir);//origSDFFile); f.setDialogTitle("Save to SDF/CSV file (according to filename extension)"); JPanel p = new JPanel(new BorderLayout()); MessageLabel m = new MessageLabel( Message.infoMessage("By default the data is exportet in SDF format. Add '.csv' to the filename to export in CSV format.")); m.setBorder(new EmptyBorder(5, 5, 5, 5)); p.add(m, BorderLayout.NORTH); f.setAccessory(p); // MessagePanel p = new MessagePanel(); // p.add int i = f.showSaveDialog(Settings.TOP_LEVEL_FRAME); if (i != JFileChooser.APPROVE_OPTION) return; dest = f.getSelectedFile().getAbsolutePath(); if (!f.getSelectedFile().exists() && !FileUtil.getFilenamExtension(dest).matches("(?i)sdf") && !FileUtil.getFilenamExtension(dest).matches("(?i)csv")) dest += ".sdf"; if (new File(dest).exists()) { if (JOptionPane.showConfirmDialog(Settings.TOP_LEVEL_FRAME, "File '" + dest + "' already exists, overwrite?", "Warning", JOptionPane.YES_NO_OPTION, JOptionPane.WARNING_MESSAGE) != JOptionPane.YES_OPTION) return; } } boolean csvExport = FileUtil.getFilenamExtension(dest).matches("(?i)csv"); // file may be overwritten, and then reloaded -> clear DatasetFile.clearFiles(dest); if (csvExport) { List<String> feats = new ArrayList<String>(); for (Integer j : compoundOrigIndices) if (featureValues.keySet1().size() > 0 && featureValues.keySet2(j) != null) for (String feat : featureValues.keySet2(j)) if (!feats.contains(feat)) feats.add(feat); File file = new File(dest); try { BufferedWriter b = new BufferedWriter(new FileWriter(file)); Set<String> featNames = new HashSet<String>(); b.write("\"SMILES\""); for (Object feat : feats) { b.write(",\""); String featName = feat.toString(); int mult = 2; while (featNames.contains(featName)) featName = feat.toString() + "_" + (mult++); featNames.add(featName); b.write(featName); b.write("\""); } b.write("\n"); for (Integer compoundIndex : compoundOrigIndices) { CompoundData c = clustering.getCompounds().get(compoundIndex); b.write("\""); b.write(c.getSmiles()); b.write("\""); for (String feat : feats) { b.write(",\""); Object val = featureValues.get(compoundIndex, feat); String s = val == null ? "" : val.toString(); if (s.contains("\"")) { System.err.println("csv export: replacing \" with ' for feature " + feat + " and value " + s); s = s.replace('"', '\''); } b.write(s); b.write("\""); } b.write("\n"); } b.close(); } catch (IOException e) { throw new Error(e); } } else { HashMap<Integer, Object> newTitle = null; if (compoundDescriptorFeature != null) { newTitle = new HashMap<Integer, Object>(); for (Integer j : compoundOrigIndices) { Object val; if (compoundDescriptorFeature instanceof NumericProperty) { val = clustering.getCompounds().get(j) .getDoubleValue((NumericProperty) compoundDescriptorFeature); if (val != null && ((NumericProperty) compoundDescriptorFeature).isInteger()) val = StringUtil.formatDouble((Double) val, 0); } else val = clustering.getCompounds().get(j) .getStringValue((NominalProperty) compoundDescriptorFeature); if (val == null) val = ""; featureValues.put(j, CompoundPropertyUtil.propToExportString(compoundDescriptorFeature), val); newTitle.put(j, val); } } SDFUtil.filter(clustering.getOrigSDFile(), dest, compoundOrigIndices, featureValues, true, newTitle); //SDFUtil.filter(clustering.getSDFile(), dest, compoundOrigIndices, featureValues, true, newTitle); } String msg = "Successfully exported " + compoundOrigIndices.length + " compounds to\n" + dest; if (script != null) System.out.println("\n" + msg); else JOptionPane .showMessageDialog(Settings.TOP_LEVEL_FRAME, msg, "Export done", JOptionPane.INFORMATION_MESSAGE); } public static void scriptExport(String datasetFile, DescriptorSelection features, FragmentSettings fragmentSettings, String outfile, boolean keepUniform, double missingRatio, List<Integer> distanceToCompounds, boolean euclideanDistance) { ThreeDEmbedder embed = null; if (distanceToCompounds != null && euclideanDistance) embed = WekaPCA3DEmbedder.INSTANCE; else if (distanceToCompounds != null && !euclideanDistance) { embed = Sammon3DEmbedder.INSTANCE; ((Sammon3DEmbedder) embed).enableTanimoto(); } Properties props = MappingWorkflow.createMappingWorkflow(datasetFile, features, fragmentSettings, null, embed); CheSMapping mapping = MappingWorkflow.createMappingFromMappingWorkflow(props, ""); ClusteringData clusteringData = mapping.doMapping(); ClusteringImpl clustering = new ClusteringImpl(); clustering.newClustering(clusteringData); if (distanceToCompounds != null) for (Integer i : distanceToCompounds) clustering.addDistanceToCompoundFeature(clustering.getCompoundWithJmolIndex(i)); scriptExport(clustering, outfile, keepUniform, missingRatio); } public static void scriptExport(Clustering clustering, String outfile, boolean keepUniform, double missingRatio) { // LaunchCheSMapper.start(mapping); // while (CheSViewer.getFrame() == null || CheSViewer.getClustering() == null) // ThreadUtil.sleep(100); // ExportData.exportAll(CheSViewer.getClustering(), null, new Script(outfile, true, true, true)); ExportData.exportAll(clustering, null, new Script(outfile, true, !keepUniform, missingRatio)); //LaunchCheSMapper.exit(CheSViewer.getFrame()); LaunchCheSMapper.exit(null); } public static void main(String[] args) { LaunchCheSMapper.init(); //String input = "/home/martin/data/valium.csv"; String input = "/home/martin/data/caco2.sdf"; scriptExport(input, DescriptorSelection.autoSelectIntegrated(), null, "/tmp/data.csv", false, 0.1, null, false); } }