BeautiAlignmentProvider.java example

Explorer
beast2-master
- src
package beast.app.beauti;





import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.swing.JComboBox;
import javax.swing.JOptionPane;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import beast.core.BEASTInterface;
import beast.core.BEASTObject;
import beast.core.Description;
import beast.core.Input;
import beast.core.Input.Validate;
import beast.evolution.alignment.Alignment;
import beast.evolution.alignment.FilteredAlignment;
import beast.evolution.alignment.Sequence;
import beast.evolution.datatype.DataType;
import beast.math.distributions.MRCAPrior;
import beast.util.AddOnManager;
import beast.util.NexusParser;
import beast.util.XMLParser;


@Description("Class for creating new alignments to be edited by AlignmentListInputEditor")
public class BeautiAlignmentProvider extends BEASTObject {
	/** alignment importers available for loading files; null until initImporters() has run **/
	static List<AlignmentImporter> importers = null;
    /**
     * directory to pick up importers from *
     */
    final static String[] IMPLEMENTATION_DIR = {"beast.app"};

	/**
	 * Populate the static importers list.
	 * The three built-in importers (NEXUS, BEAST XML, FASTA) are added first, followed by
	 * every AlignmentImporter implementation that AddOnManager reports for IMPLEMENTATION_DIR.
	 * Classes whose name starts with the name of BeautiAlignmentProvider (i.e. its nested
	 * importers) are skipped during discovery since they were already added explicitly.
	 * Importers that cannot be instantiated are reported on stderr and skipped.
	 */
	private void initImporters() {
		// build the list locally and publish it only once it is complete
		List<AlignmentImporter> list = new ArrayList<>();
		// add standard importers
		list.add(new NexusImporter());
		list.add(new XMLImporter());
		list.add(new FastaImporter());

		// Use the declaring class rather than this.getClass(): for a subclass instance
		// this.getClass().getName() is the subclass name, so the nested importers of
		// BeautiAlignmentProvider would not be filtered out and would be instantiated
		// reflectively, which fails because they are non-static inner classes.
		final String ownPrefix = BeautiAlignmentProvider.class.getName();

		// add importers discovered on the class path
		List<String> importerClasses = AddOnManager.find(AlignmentImporter.class, IMPLEMENTATION_DIR);
		for (String _class : importerClasses) {
			if (_class.startsWith(ownPrefix)) {
				continue;
			}
			try {
				AlignmentImporter importer = (AlignmentImporter) Class.forName(_class)
						.getDeclaredConstructor().newInstance();
				list.add(importer);
			} catch (ReflectiveOperationException e) {
				// a broken add-on importer must not prevent the remaining ones from loading
				e.printStackTrace();
			}
		}
		importers = list;
	}

	final public Input<BeautiSubTemplate> template = new Input<>("template", "template to be used after creating a new alignment. ", Validate.REQUIRED);
	
	/** Intentionally empty: this class performs no initialisation or validation here. **/
	@Override
	public void initAndValidate() {
	}
	
	/** 
	 * Return the degree to which this provider matches the given alignment.
	 * The provider with the highest match will be used to edit the alignment.
	 * This implementation ignores its argument and always returns 1.
	 * */
	protected int matches(Alignment alignment) {
		return 1;
	}
	
	/** 
	 * Let the user pick one or more alignment files in a file dialog and import them.
	 * The dialog is filtered on the union of the file extensions of all known importers.
	 * @return the objects imported from the chosen files, or null if no file was chosen
	 * **/
	protected List<BEASTInterface> getAlignments(BeautiDoc doc) {
		if (importers == null) {
			initImporters();
		}
		// gather every file extension any importer declares
		Set<String> supported = new HashSet<>();
		for (AlignmentImporter importer : importers) {
			Collections.addAll(supported, importer.getFileExtensions());
		}
		String[] extensionArray = supported.toArray(new String[]{});
		File[] chosen = beast.app.util.Utils.getLoadFiles("Load Alignment File",
				new File(Beauti.g_sDir), "Alignment files", extensionArray);
		if (chosen == null || chosen.length == 0) {
			return null;
		}
		return getAlignments(doc, chosen);
	}

    /**
     * Import alignments from the given files and add them to the document.
     * For every file the importers that report they can handle it are collected; when there
     * is more than one, the user is asked to choose. A file no importer can handle is
     * reported in an error dialog and skipped. If the user cancels the importer choice, the
     * objects loaded so far are returned without being added to the document.
     *
     * @param doc   document the imported alignments are added to
     * @param files files to import; when null, a file dialog is shown instead
     * @return the objects produced by the importers (possibly empty, never containing null).
     *         When files is null, the result of getAlignments(doc) is returned, which may be null.
     */
    public List<BEASTInterface> getAlignments(BeautiDoc doc, File[] files) {
		if (files == null) {
			// merge "+ button" and "drag drop" function
			return getAlignments(doc);
		}
		if (importers == null) {
			initImporters();
		}
        List<BEASTInterface> selectedBEASTObjects = new ArrayList<>();
        for (File file : files) {
			// create list of importers that can handle the file
			List<AlignmentImporter> availableImporters = new ArrayList<>();
			for (AlignmentImporter importer : importers) {
				if (importer.canHandleFile(file)) {
					availableImporters.add(importer);
				}
			}

			if (availableImporters.isEmpty()) {
                JOptionPane.showMessageDialog(null,
                        "Unsupported sequence file.",
                        "Error", JOptionPane.ERROR_MESSAGE);
                continue;
			}

			AlignmentImporter importer = availableImporters.get(0);
			if (availableImporters.size() > 1) {
				// let user choose an importer
				List<String> descriptions = new ArrayList<>();
				for (AlignmentImporter i : availableImporters) {
					descriptions.add(((BEASTInterface)i).getDescription());
				}
				String option = (String)JOptionPane.showInputDialog(null, "Which importer is appropriate", "Option",
	                    JOptionPane.WARNING_MESSAGE, null, descriptions.toArray(), descriptions.get(0));
				if (option == null) {
					return selectedBEASTObjects;
				}
				int i = descriptions.indexOf(option);
				importer = availableImporters.get(i);
			}

			// An importer may signal failure by returning null (NexusImporter does) or by
			// putting a null element in its result; neither must end up in the returned list.
			List<BEASTInterface> list = importer.loadFile(file);
			if (list != null) {
				for (BEASTInterface loaded : list) {
					if (loaded != null) {
						selectedBEASTObjects.add(loaded);
					}
				}
			}
        }
        addAlignments(doc, selectedBEASTObjects);
        return selectedBEASTObjects;
    }
    
    /** Entry point that lets subclasses of BeautiAlignmentProvider be called with pre-defined
     * arguments, for example from a scripting environment (see CompactAnalysis in BEASTLabs).
     * A subclass can choose to suppress GUI components.
     * Typical usage is for importing alignments using a standard template.
     * This implementation ignores args and delegates to getAlignments(doc, files).
     */
    public List<BEASTInterface> getAlignments(BeautiDoc doc, File[] files, String [] args) {
    	return getAlignments(doc, files);
    }
        
    /**
     * Add every Alignment in the given list to the document; other objects in the list are ignored.
     * For each alignment the ID is made unique with respect to doc.pluginmap, its sequences are
     * sorted by taxon name, and, if getStartTemplate() is not null, it is handed to
     * doc.addAlignmentWithSubnet together with that template.
     *
     * @param doc document to add the alignments to
     * @param selectedBEASTObjects objects to inspect; only instances of Alignment are processed
     */
    protected void addAlignments(BeautiDoc doc, List<BEASTInterface> selectedBEASTObjects) {
        for (BEASTInterface beastObject : selectedBEASTObjects) {
        	if (beastObject instanceof Alignment) {
	        	// ensure ID of alignment is unique
	        	int k = 0;
	        	String id = beastObject.getID();
        		boolean found = true;
	        	// loop while the candidate id is taken in pluginmap; 'found' is set to false (ending the loop)
	        	// once the clash turns out not to be with an existing alignment
	        	while (doc.pluginmap.containsKey(id) && found) {
	        		found = false;
	        		// note: the comparison uses the original ID of the new alignment, not the candidate id
	        		for (Alignment data : doc.alignments) {
	        			if (data.getID().equals(beastObject.getID())) {
	        				found = true;
	        				break;
	        			}
	        		}
	        		if (found) {
	        			// an existing alignment carries the same ID: try the original ID with the next numeric suffix
		        		k++;
		        		id = beastObject.getID() + k;	        			
	        		} else {
	        			// the ID is taken by an object that is not in doc.alignments: let the new alignment
	        			// take its place (replaceItem also removes the ID from pluginmap)
	        			BEASTInterface oldData = doc.pluginmap.get(beastObject.getID());
	        			replaceItem(doc, oldData, beastObject);
	        		}
	        	}
	        	beastObject.setID(id);
	        	sortByTaxonName(((Alignment) beastObject).sequenceInput.get());
	        	if (getStartTemplate() != null) {
	        		doc.addAlignmentWithSubnet((Alignment) beastObject, getStartTemplate());
	        	}
        	}
        }
    }
    
    /**
     * Make every object that currently uses oldData as an input use newData instead.
     * The pluginmap entry registered under the ID of newData is removed first. Then, for each
     * output of oldData, inputs holding oldData directly are re-set to newData, and inputs
     * holding a list that contains oldData get that list element swapped for newData (in which
     * case the output is also registered with newData).
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
	private void replaceItem(BeautiDoc doc, BEASTInterface oldData, BEASTInterface newData) {
        doc.pluginmap.remove(newData.getID());
        // work on a snapshot of the outputs of the old object
        Set<BEASTInterface> consumers = new LinkedHashSet<>(oldData.getOutputs());
        for (BEASTInterface consumer : consumers) {
        	for (Input input : consumer.listInputs()) {
        		Object value = input.get();
        		if (value == oldData) {
        			input.setValue(newData, consumer);
        			continue;
        		}
        		if (value instanceof List) {
        			List values = (List) value;
        			int position = values.indexOf(oldData);
        			if (position >= 0) {
        				values.set(position, newData);
        				newData.getOutputs().add(consumer);
        			}
        		}
        	}
        }
	}

	/** Show the alignment in an AlignmentViewer dialog. Any exception raised while doing so
	 * is reported in a message dialog and its stack trace is printed. **/
	void editAlignment(Alignment alignment, BeautiDoc doc) {
		try {
			new AlignmentViewer(alignment).showInDialog();
		} catch (Exception e) {
			String message = "Something went wrong viewing the alignment: " + e.getMessage();
			JOptionPane.showMessageDialog(null, message);
			e.printStackTrace();
		}
	}
	
	/** Check validity of the alignment:
	 * return null if there are no problems,
	 * return a message string if something is fishy.
	 * This implementation performs no checks and always returns null. **/
	String validateAlignment() {
		return null;
	}
	
	/** Return the template to apply to a newly added alignment.
	 * This implementation returns the current value of the template input. **/
	protected BeautiSubTemplate getStartTemplate() {
		return template.get();
	}

    /** Sort the given sequences in place, ordered by the value of their taxon input. */
    protected void sortByTaxonName(List<Sequence> seqs) {
        Collections.sort(seqs,
                (left, right) -> left.taxonInput.get().compareTo(right.taxonInput.get()));
    }

	/**
	 * Load an alignment from a BEAST XML file.
	 * The file content is first parsed as a BEAST 2 XML fragment. If that fails for any
	 * reason, the content is parsed as BEAST 1 XML instead, with an alignment ID derived from
	 * the file name (everything from the first dot onwards is dropped). If both attempts
	 * fail, the stack traces are printed and an error dialog is shown.
	 *
	 * @param file XML file to read
	 * @return the alignment found in the file, or null if it could not be loaded
	 */
	static public BEASTInterface getXMLData(File file) {
		String xml = "";
		try {
			// read the whole file; try-with-resources closes the reader on failure as well
			StringBuilder content = new StringBuilder();
			try (BufferedReader fin = new BufferedReader(new FileReader(file))) {
				String line;
				// readLine() returning null is the end-of-file signal; ready() is not
				while ((line = fin.readLine()) != null) {
					content.append(line).append('\n');
				}
			}
			xml = content.toString();

			// parse as BEAST 2 xml fragment
			XMLParser parser = new XMLParser();
			BEASTInterface runnable = parser.parseBareFragment(xml, false);
			BEASTInterface alignment = getAlignment(runnable);
			if (alignment == null) {
				// handled below like any other BEAST 2 parse failure, but with a usable message
				throw new IllegalArgumentException("No alignment found in " + file.getName());
			}
            alignment.initAndValidate();
            return alignment;
		} catch (Exception ex) {
			// attempt to parse as BEAST 1 xml
			try {
				String ID = file.getName();
				int dot = ID.lastIndexOf('.');
				if (dot >= 0) {
					// file names without an extension are used as they are
					ID = ID.substring(0, dot);
				}
				ID = ID.replaceAll("\\..*", "");
				// parseBeast1XML assigns ID to the alignment it returns
				return parseBeast1XML(ID, xml);
			} catch (Exception ex2) {
				ex.printStackTrace();
				ex2.printStackTrace();
				JOptionPane.showMessageDialog(null, "Loading of " + file.getName() + " failed: " + ex.getMessage()
						+ "\n" + ex2.getMessage());
			}
			return null;
		}
	}
	

	/**
	 * Build an Alignment from the first alignment element of a BEAST 1 XML document.
	 * The dataType attribute selects the data type: "dna"/"nucleotide" give nucleotide,
	 * "aminoacid"/"protein" give aminoacid (total count 20), anything else, including a
	 * missing attribute, gives integer. Every sequence child becomes a Sequence whose taxon
	 * is the idref of its taxon child and whose value is the text content of the element.
	 *
	 * @param ID  ID to assign to the resulting alignment
	 * @param xml content of the BEAST 1 XML file
	 * @return the initialised alignment
	 * @throws SAXException if the XML is malformed, contains no alignment element, or a
	 *         taxon element lacks its idref attribute
	 * @throws IOException if reading the XML text fails
	 * @throws ParserConfigurationException if no XML parser can be created
	 */
	private static BEASTInterface parseBeast1XML(String ID, String xml) throws SAXException, IOException, ParserConfigurationException  {
		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
		// The file comes from the user's disk and may originate anywhere: do not let the
		// parser resolve external entities. Best effort only, since a parser that does not
		// know a feature should still be usable.
		String[] externalEntityFeatures = {
				"http://xml.org/sax/features/external-general-entities",
				"http://xml.org/sax/features/external-parameter-entities"
		};
		for (String feature : externalEntityFeatures) {
			try {
				factory.setFeature(feature, false);
			} catch (ParserConfigurationException ignored) {
				// feature not supported by this parser implementation; keep its default
			}
		}
		Document doc = factory.newDocumentBuilder().parse(new InputSource(new StringReader(xml)));
		doc.normalize();

		// parse first alignment
		NodeList alignments = doc.getElementsByTagName("alignment");
		org.w3c.dom.Node node = alignments.item(0);
		if (node == null) {
			throw new SAXException("No alignment element found in BEAST 1 XML");
		}

		// equalsIgnoreCase avoids the locale dependence of toLowerCase()
		String dataTypeName = beast1Attribute(node, "dataType");
		String dataType = "integer";
		int totalCount = 4;
		if (dataTypeName != null) {
			if (dataTypeName.equalsIgnoreCase("dna") || dataTypeName.equalsIgnoreCase("nucleotide")) {
				dataType = "nucleotide";
			} else if (dataTypeName.equalsIgnoreCase("aminoacid") || dataTypeName.equalsIgnoreCase("protein")) {
				dataType = "aminoacid";
				totalCount = 20;
			}
		}
		Alignment alignment = new Alignment();
		alignment.dataTypeInput.setValue(dataType, alignment);

		NodeList children = node.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			org.w3c.dom.Node child = children.item(i);
			if (child.getNodeName().equals("sequence")) {
				Sequence sequence = new Sequence();
				// find the taxon
				String taxon = "";
				NodeList sequenceChildren = child.getChildNodes();
				for (int j = 0; j < sequenceChildren.getLength(); j++) {
					org.w3c.dom.Node child2 = sequenceChildren.item(j);
					if (child2.getNodeName().equals("taxon")) {
						taxon = beast1Attribute(child2, "idref");
						if (taxon == null) {
							throw new SAXException("taxon element without idref attribute inside sequence element");
						}
					}
				}
				String data = child.getTextContent();
				sequence.initByName("totalcount", totalCount, "taxon", taxon, "value", data);
				sequence.setID("seq_" + taxon);
				alignment.sequenceInput.setValue(sequence, alignment);

			}
		}
		alignment.setID(ID);
		alignment.initAndValidate();
		return alignment;
	} // parseBeast1XML

	/** Return the value of the named attribute of a DOM node, or null if the node has no such attribute. */
	private static String beast1Attribute(org.w3c.dom.Node node, String name) {
		if (node.getAttributes() == null) {
			return null;
		}
		org.w3c.dom.Node attribute = node.getAttributes().getNamedItem(name);
		return attribute == null ? null : attribute.getNodeValue();
	}


	/**
	 * Recursively look for an Alignment, starting at the given object and descending through
	 * the objects returned by listActiveBEASTObjects().
	 *
	 * @return the object itself if it is an Alignment, otherwise the first Alignment found
	 *         below it, or null if there is none
	 */
	static BEASTInterface getAlignment(BEASTInterface beastObject) throws IllegalArgumentException, IllegalAccessException {
		if (beastObject instanceof Alignment) {
			return beastObject;
		}
		for (BEASTInterface child : beastObject.listActiveBEASTObjects()) {
			BEASTInterface hit = getAlignment(child);
			if (hit != null) {
				return hit;
			}
		}
		return null;
	}

	@Description("NEXUS file importer")
	class NexusImporter implements AlignmentImporter {

		/** File extensions handled by this importer. */
		@Override
		public String[] getFileExtensions() {
			return new String[]{"nex","nxs","nexus"};
		}

		/**
		 * Parse a NEXUS file with NexusParser.
		 * If the parser produced filtered alignments (partitions), a warning dialog lists any
		 * partitions that share sites, and the partitions are returned with their sequences
		 * sorted by taxon name. Otherwise the single parsed alignment is returned. In both
		 * cases the calibrations of the parser, if any, are appended.
		 *
		 * @return the imported objects, or null if loading failed (an error dialog is shown)
		 */
		@Override
		public List<BEASTInterface> loadFile(File file) {
			final List<BEASTInterface> loaded = new ArrayList<>();
			final NexusParser parser = new NexusParser();
			try {
				parser.parseFile(file);
				if (parser.filteredAlignments.size() == 0) {
					// no partitions: take the alignment as a whole
					loaded.add(parser.m_alignment);
				} else {
					// sanity check: make sure the filters do not overlap.
					// owner[site] is the 1-based number of the first partition that uses the site
					int[] owner = new int[parser.m_alignment.getSiteCount()];
					// each clash is encoded as firstPartition * 10000 + secondPartition
					Set<Integer> clashes = new HashSet<>();
					int partitionNr = 0;
					for (Alignment partition : parser.filteredAlignments) {
						partitionNr++;
						for (int site : ((FilteredAlignment) partition).indices()) {
							if (owner[site] > 0) {
								clashes.add(owner[site] * 10000 + partitionNr);
							} else {
								owner[site] = partitionNr;
							}
						}
					}
					if (clashes.size() > 0) {
						StringBuilder overlaps = new StringBuilder("<html>Warning: The following partitions overlap:<br/>");
						for (int code : clashes) {
							overlaps.append(parser.filteredAlignments.get(code / 10000 - 1).getID());
							overlaps.append(" overlaps with ");
							overlaps.append(parser.filteredAlignments.get(code % 10000 - 1).getID());
							overlaps.append("<br/>");
						}
						overlaps.append("The first thing you might want to do is delete some of these partitions.</html>");
						JOptionPane.showMessageDialog(null, overlaps.toString());
					}
					// add alignments
					for (Alignment partition : parser.filteredAlignments) {
						sortByTaxonName(partition.sequenceInput.get());
						loaded.add(partition);
					}
				}
				// calibrations follow the alignment(s) in either case
				if (parser.calibrations != null) {
					loaded.addAll(parser.calibrations);
				}
			} catch (Exception ex) {
				ex.printStackTrace();
				JOptionPane.showMessageDialog(null, "Loading of " + file.getPath() + " failed: " + ex.getMessage());
				return null;
			}
			return loaded;
		}
	}
	
	@Description("BEAST XML file importer")
	class XMLImporter implements AlignmentImporter {

		/** File extensions handled by this importer. */
		@Override
		public String[] getFileExtensions() {
			return new String[]{"xml"};
		}

		/**
		 * Load the alignment contained in a BEAST XML file through getXMLData.
		 *
		 * @return a list holding the alignment, or an empty list if the file could not be
		 *         loaded (getXMLData has already informed the user in that case)
		 */
		@Override
		public List<BEASTInterface> loadFile(File file) {
			List<BEASTInterface> selectedBEASTObjects = new ArrayList<>();
			BEASTInterface alignment = getXMLData(file);
			// getXMLData returns null on failure; never hand a null element to the caller
			if (alignment != null) {
				selectedBEASTObjects.add(alignment);
			}
			return selectedBEASTObjects;
		}
		
	}

	/** Data type mode of FastaImporter for files that may contain amino acids:
	 * userdefined = ask the user in a dialog, aminoacid / nucleotide = use that type without asking. **/
	enum dtype { userdefined, aminoacid, nucleotide};
	@Description("Fasta file importer")
	class FastaImporter implements AlignmentImporter {
		dtype datatype = dtype.userdefined;
		
		/** Create an importer whose data type mode is userdefined. */
		public FastaImporter() {
			this.datatype = dtype.userdefined;
		}
		
		/** File extensions handled by this importer. */
		@Override
		public String[] getFileExtensions() {
			final String[] extensions = {"fa","fas","fst","fasta","fna","ffn","faa","frn"};
			return extensions;
		}

		/**
		 * Read a FASTA file into a single Alignment.
		 * Lines starting with ';' are comments, lines starting with '>' start a new taxon
		 * (the name runs up to the first white space), all other lines are sequence data of
		 * the current taxon. All sequences must have the same length. Unless the file
		 * extension marks the content as nucleotide (.fna, .ffn, .frn), the data type is
		 * guessed from the sequences and then confirmed according to the data type mode of
		 * this importer: asked from the user, or forced to amino acid or nucleotide.
		 * The alignment ID is the file name up to its first dot.
		 *
		 * @param file FASTA file to read
		 * @return a list holding the alignment, or an empty list if loading failed (the
		 *         failure is shown in a dialog and its stack trace printed)
		 */
		@Override
		public List<BEASTInterface> loadFile(File file) {
			List<BEASTInterface> selectedBEASTObjects = new ArrayList<>();
			try {
				// grab alignment data
				Map<String, StringBuilder> seqMap = new HashMap<>();
				List<String> taxa = new ArrayList<>();
				final char missing = '?';
				final char gap = '-';
				int totalCount = 4;
				// named differently from the enum field 'datatype' to avoid shadowing it
				String dataTypeName = "nucleotide";
				// http://en.wikipedia.org/wiki/FASTA_format lists file formats and their data content
				// .fna = nucleic acid
				// .ffn = nucleotide coding regions
				// .frn = non-coding RNA
				// .faa = amino acid
				String lowerCaseName = file.getName().toLowerCase();
				boolean mayBeAminoacid = !(lowerCaseName.endsWith(".fna") || lowerCaseName.endsWith(".ffn") || lowerCaseName.endsWith(".frn"));

				// try-with-resources closes the reader on every exit path
				try (BufferedReader fin = new BufferedReader(new FileReader(file))) {
					String currentTaxon = null;
					String line;
					while ((line = fin.readLine()) != null) {
						if (line.startsWith(";")) {
							// it is a comment, ignore
							continue;
						}
						if (line.startsWith(">")) {
							// it is a taxon, only up to first space
							currentTaxon = line.substring(1).trim().replaceAll("\\s.*$", "");
							continue;
						}
						// it is a data line
						if (currentTaxon == null) {
							if (line.trim().isEmpty()) {
								// tolerate blank lines ahead of the first taxon
								continue;
							}
							throw new RuntimeException("Expected taxon defined on first line");
						}
						StringBuilder sb = seqMap.get(currentTaxon);
						if (sb == null) {
							sb = new StringBuilder();
							seqMap.put(currentTaxon, sb);
							taxa.add(currentTaxon);
						}
						sb.append(line);
					}
				}

				int charCount = -1;
				Alignment alignment = new Alignment();
				for (final String taxon : taxa) {
					String data = seqMap.get(taxon).toString().replaceAll("\\s", "");

					if (charCount < 0) {
						charCount = data.length();
					}
					if (data.length() != charCount) {
						throw new IllegalArgumentException("Expected sequence of length " + charCount + " instead of " + data.length() + " for taxon " + taxon);
					}
					// map to standard missing and gap chars
					data = data.replace(missing, DataType.MISSING_CHAR);
					data = data.replace(gap, DataType.GAP_CHAR);

					// as soon as one sequence looks like amino acids, the whole alignment is
					// treated as such, including the sequences added so far
					if (mayBeAminoacid && dataTypeName.equals("nucleotide")
							&& "aminoacid".equals(guessSequenceType(data))) {
						dataTypeName = "aminoacid";
						totalCount = 20;
						for (Sequence seq : alignment.sequenceInput.get()) {
							seq.totalCountInput.setValue(totalCount, seq);
						}
					}

					final Sequence sequence = new Sequence();
					data = data.replaceAll("[Xx]", "?");
					sequence.init(totalCount, taxon, data);
					sequence.setID(NexusParser.generateSequenceID(taxon));
					alignment.sequenceInput.setValue(sequence, alignment);
				}
				String ID = file.getName();
				int dot = ID.lastIndexOf('.');
				if (dot >= 0) {
					// a file name without extension is used as it is
					ID = ID.substring(0, dot);
				}
				ID = ID.replaceAll("\\..*", "");
				alignment.setID(ID);

				if (mayBeAminoacid) {
					switch (this.datatype) {
						case userdefined: {
							// make user choose
							JComboBox<String> jcb = new JComboBox<>(new String[]{"aminoacid", "nucleotide", "all are aminoacid", "all are nucleotide"});
							jcb.setEditable(true);
							jcb.setSelectedItem(dataTypeName);
							JOptionPane.showMessageDialog(null, jcb, "Choose the datatype of alignment " + alignment.getID(), JOptionPane.QUESTION_MESSAGE);
							Object selected = jcb.getSelectedItem();
							// any other (typed in or missing) value keeps the guessed data type
							switch (selected == null ? "" : selected.toString()) {
								case "aminoacid": dataTypeName = "aminoacid"; totalCount = 20; break;
								case "nucleotide": dataTypeName = "nucleotide"; totalCount = 4; break;
								// the "all are ..." choices are remembered in this importer, so later files are not asked about
								case "all are aminoacid": dataTypeName = "aminoacid"; this.datatype = dtype.aminoacid; totalCount = 20; break;
								case "all are nucleotide": dataTypeName = "nucleotide"; this.datatype = dtype.nucleotide; totalCount = 4; break;
								default: break;
							}
							break;
						}
						case aminoacid:
							dataTypeName = "aminoacid";
							totalCount = 20;
							break;
						case nucleotide:
							dataTypeName = "nucleotide";
							totalCount = 4;
							break;
					}
					for (Sequence seq : alignment.sequenceInput.get()) {
						seq.totalCountInput.setValue(totalCount, seq);
					}
				}
				alignment.dataTypeInput.setValue(dataTypeName, alignment);
				alignment.initAndValidate();
				selectedBEASTObjects.add(alignment);
			} catch (Exception e) {
				e.printStackTrace();
				JOptionPane.showMessageDialog(null, "Loading of " + file.getName() + " failed: " + e.getMessage());
			}
			return selectedBEASTObjects;
		}
		
	    /** Ported from jebl2.
	     * Guess the type of a sequence from its contents.
	     * A sequence made up exclusively of the symbols ACGTUXN (either case), '?', '_' and '-'
	     * is a nucleotide candidate; any other character makes it an amino acid sequence,
	     * since every character is accepted as an amino acid state here.
	     * @param seq the sequence
	     * @return "nucleotide" or "aminoacid"; this method never returns null
	     */
	    public String guessSequenceType(final String seq) {

	        int canonicalNucStates = 0;
	        int undeterminedStates = 0;
	        // true length, excluding any gaps
	        int sequenceLength = seq.length();
	        final int seqLen = seq.length();

	        // do not use toCharArray: it allocates an array size of sequence
	        for (int k = 0; k < seqLen; ++k) {
	            final char c = seq.charAt(k);
	            if ("ACGTacgt".indexOf(c) > -1) {
	                ++canonicalNucStates;
	            } else if ("?_-".indexOf(c) > -1) {
	                --sequenceLength;
	            } else if ("UXNuxn".indexOf(c) > -1) {
	                ++undeterminedStates;
	            } else {
	                // not a nucleotide symbol: the answer is known, no need to scan the rest
	                return "aminoacid";
	            }
	        }

	        // All sites are nucleotides (actual or ambiguous). If longer than 100 sites, declare it a nuc
	        if (sequenceLength >= 100) {
	            return "nucleotide";
	        }
	        // if short, ask for 70% of ACGT or N
	        // NOTE: with the symbol sets above every accepted character is counted as canonical,
	        // undetermined or gap, so nucStates equals sequenceLength and this test always passes.
	        final double threshold = 0.7;
	        final int nucStates = canonicalNucStates + undeterminedStates;
	        return nucStates >= sequenceLength * threshold ? "nucleotide" : "aminoacid";
	    }

		
	}

}