package beast.app.beauti;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.swing.JComboBox;
import javax.swing.JOptionPane;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import beast.core.BEASTInterface;
import beast.core.BEASTObject;
import beast.core.Description;
import beast.core.Input;
import beast.core.Input.Validate;
import beast.evolution.alignment.Alignment;
import beast.evolution.alignment.FilteredAlignment;
import beast.evolution.alignment.Sequence;
import beast.evolution.datatype.DataType;
import beast.math.distributions.MRCAPrior;
import beast.util.AddOnManager;
import beast.util.NexusParser;
import beast.util.XMLParser;
@Description("Class for creating new alignments to be edited by AlignmentListInputEditor")
public class BeautiAlignmentProvider extends BEASTObject {
/** importers that can load alignment files; filled lazily by initImporters() **/
static List<AlignmentImporter> importers = null;
/**
* directory to pick up importers from *
*/
final static String[] IMPLEMENTATION_DIR = {"beast.app"};
/**
 * Builds the shared list of alignment importers.
 * <p>
 * The list starts with the three built-in importers (NEXUS, BEAST XML, FASTA)
 * and is then extended with one instance of every class name returned by
 * {@code AddOnManager.find(AlignmentImporter.class, IMPLEMENTATION_DIR)}.
 * Classes nested in this provider are skipped because they are the built-in
 * importers that were already added explicitly.
 * <p>
 * The list is assembled locally and assigned to {@link #importers} only when
 * complete, so the static field never refers to a half-built list.
 */
private void initImporters() {
    final List<AlignmentImporter> found = new ArrayList<>();
    // add standard importers
    found.add(new NexusImporter());
    found.add(new XMLImporter());
    found.add(new FastaImporter());

    // Names to skip: anything belonging to the runtime class (original behaviour)
    // and anything nested in BeautiAlignmentProvider itself. The second prefix
    // matters when this method runs on a subclass, where getClass() no longer
    // names the class that declares the built-in importers.
    final String ownName = getClass().getName();
    final String nestedPrefix = BeautiAlignmentProvider.class.getName() + "$";

    // add importers discovered on the class path
    final List<String> importerClasses = AddOnManager.find(AlignmentImporter.class, IMPLEMENTATION_DIR);
    for (String _class : importerClasses) {
        if (_class.startsWith(ownName) || _class.startsWith(nestedPrefix)) {
            continue;
        }
        try {
            // getDeclaredConstructor().newInstance() replaces the deprecated
            // Class.newInstance() and wraps constructor failures in a
            // ReflectiveOperationException instead of letting them escape.
            found.add((AlignmentImporter) Class.forName(_class).getDeclaredConstructor().newInstance());
        } catch (ReflectiveOperationException e) {
            // a broken add-on importer must not prevent the others from loading
            e.printStackTrace();
        }
    }
    importers = found;
}
final public Input<BeautiSubTemplate> template = new Input<>("template", "template to be used after creating a new alignment. ", Validate.REQUIRED);
/** Nothing to initialise or validate for this provider. */
@Override
public void initAndValidate() {
    // intentionally empty
}
/**
 * Scores how well this provider suits the given alignment; according to the
 * original note, the provider with the highest score is the one used to edit
 * the alignment.
 *
 * @param alignment the alignment to score (not inspected by this default implementation)
 * @return always {@code 1} in this base class
 */
protected int matches(Alignment alignment) {
    final int defaultScore = 1;
    return defaultScore;
}
/**
 * Asks the user for one or more alignment files and imports them.
 * <p>
 * The file chooser is offered the union of the extensions reported by all
 * registered importers.
 *
 * @param doc document the imported alignments are added to
 * @return the imported objects, or {@code null} when no file was chosen
 */
protected List<BEASTInterface> getAlignments(BeautiDoc doc) {
    if (importers == null) {
        initImporters();
    }

    // collect every extension any importer claims to understand
    final Set<String> extensions = new HashSet<>();
    for (AlignmentImporter importer : importers) {
        Collections.addAll(extensions, importer.getFileExtensions());
    }
    final String[] extensionArray = extensions.toArray(new String[]{});

    final File[] files = beast.app.util.Utils.getLoadFiles("Load Alignment File",
            new File(Beauti.g_sDir), "Alignment files", extensionArray);
    if (files == null || files.length == 0) {
        return null;
    }
    return getAlignments(doc, files);
}
/**
 * Imports alignments from the given files and registers them with the document.
 * <p>
 * For each file the importers that report {@code canHandleFile(file)} are
 * collected. With exactly one candidate it is used directly; with several the
 * user is asked to pick one. Files no importer can handle produce an
 * "Unsupported sequence file." error dialog and are skipped.
 * <p>
 * If the user cancels the importer-selection dialog, the objects loaded so far
 * are returned immediately <em>without</em> being passed to
 * {@link #addAlignments(BeautiDoc, List)} (behaviour kept from the original).
 *
 * @param doc   document the imported alignments are added to
 * @param files files to import; {@code null} falls back to the interactive
 *              file chooser of {@link #getAlignments(BeautiDoc)}
 * @return the imported objects (possibly empty), or whatever
 *         {@link #getAlignments(BeautiDoc)} returns when {@code files} is {@code null}
 */
public List<BEASTInterface> getAlignments(BeautiDoc doc, File[] files) {
    if (files == null) {
        // merge "+ button" and "drag drop" function
        return getAlignments(doc);
    }
    if (importers == null) {
        initImporters();
    }
    List<BEASTInterface> selectedBEASTObjects = new ArrayList<>();
    for (File file : files) {
        // create list of importers that can handle the file
        List<AlignmentImporter> availableImporters = new ArrayList<>();
        for (AlignmentImporter importer : importers) {
            if (importer.canHandleFile(file)) {
                availableImporters.add(importer);
            }
        }
        if (availableImporters.isEmpty()) {
            JOptionPane.showMessageDialog(null,
                    "Unsupported sequence file.",
                    "Error", JOptionPane.ERROR_MESSAGE);
            continue;
        }

        AlignmentImporter importer = availableImporters.get(0);
        if (availableImporters.size() > 1) {
            // let user choose an importer
            List<String> descriptions = new ArrayList<>();
            for (AlignmentImporter candidate : availableImporters) {
                descriptions.add(describeImporter(candidate));
            }
            String option = (String) JOptionPane.showInputDialog(null, "Which importer is appropriate", "Option",
                    JOptionPane.WARNING_MESSAGE, null, descriptions.toArray(), descriptions.get(0));
            if (option == null) {
                return selectedBEASTObjects;
            }
            importer = availableImporters.get(descriptions.indexOf(option));
        }

        // An importer may signal failure by returning null (it is expected to have
        // informed the user itself); addAll(null) would throw, so skip that file.
        List<BEASTInterface> list = importer.loadFile(file);
        if (list != null) {
            selectedBEASTObjects.addAll(list);
        }
    }
    addAlignments(doc, selectedBEASTObjects);
    return selectedBEASTObjects;
}

/**
 * Returns the text shown for an importer in the importer-selection dialog:
 * the BEAST description when the importer is a {@link BEASTInterface},
 * otherwise the value of its {@link Description} annotation if that is
 * available at run time, otherwise its simple class name.
 */
private String describeImporter(AlignmentImporter importer) {
    if (importer instanceof BEASTInterface) {
        return ((BEASTInterface) importer).getDescription();
    }
    Description description = importer.getClass().getAnnotation(Description.class);
    if (description != null) {
        return description.value();
    }
    return importer.getClass().getSimpleName();
}
/**
 * Extension point for subclasses that want to be driven with pre-defined
 * arguments, e.g. from a scripting environment (see CompactAnalysis in
 * BEASTLabs), and that may choose to suppress GUI components. A typical use is
 * importing alignments with a standard template.
 * <p>
 * This base implementation ignores {@code args} and simply delegates to
 * {@link #getAlignments(BeautiDoc, File[])}.
 *
 * @param doc   document the imported alignments are added to
 * @param files files to import
 * @param args  extra arguments for subclasses; unused here
 * @return the result of {@link #getAlignments(BeautiDoc, File[])}
 */
public List<BEASTInterface> getAlignments(BeautiDoc doc, File[] files, String [] args) {
    return getAlignments(doc, files);
}
/**
 * Registers the alignments among the given objects with the document.
 * <p>
 * Entries that are not {@link Alignment}s are ignored. For every alignment the
 * method settles on an ID, assigns it, sorts the alignment's sequences by
 * taxon name and, if {@link #getStartTemplate()} is non-null, calls
 * {@code doc.addAlignmentWithSubnet} with that template.
 *
 * @param doc                  document to register the alignments with
 * @param selectedBEASTObjects imported objects; alignments in this list get their ID
 *                             set and their sequence list sorted in place
 */
protected void addAlignments(BeautiDoc doc, List<BEASTInterface> selectedBEASTObjects) {
for (BEASTInterface beastObject : selectedBEASTObjects) {
if (beastObject instanceof Alignment) {
// ensure ID of alignment is unique
int k = 0;
String id = beastObject.getID();
// 'found' doubles as the loop guard: it stays true while the candidate id must
// be changed and is set to false once the clash has been dealt with by replacement
boolean found = true;
while (doc.pluginmap.containsKey(id) && found) {
found = false;
// is there an alignment in doc.alignments carrying the imported object's
// current (not yet renamed) ID?
for (Alignment data : doc.alignments) {
if (data.getID().equals(beastObject.getID())) {
found = true;
break;
}
}
if (found) {
// clash with a document alignment: try the original ID with the next
// numeric suffix (ID1, ID2, ...) until pluginmap has no such key
k++;
id = beastObject.getID() + k;
} else {
// the ID is in pluginmap but not among doc.alignments: the new object
// takes the place of the registered one and keeps its ID
BEASTInterface oldData = doc.pluginmap.get(beastObject.getID());
replaceItem(doc, oldData, beastObject);
}
}
beastObject.setID(id);
sortByTaxonName(((Alignment) beastObject).sequenceInput.get());
// without a start template the alignment is renamed and sorted only
if (getStartTemplate() != null) {
doc.addAlignmentWithSubnet((Alignment) beastObject, getStartTemplate());
}
}
}
}
/**
 * Substitutes {@code newData} for {@code oldData} in every input of every
 * object listed in {@code oldData.getOutputs()}.
 * <p>
 * First the {@code pluginmap} entry keyed by {@code newData}'s ID is removed.
 * Then, per input: if the input's value is {@code oldData} itself it is set to
 * {@code newData}; if the value is a list containing {@code oldData}, that
 * list slot is overwritten and the owning object is added to
 * {@code newData.getOutputs()}.
 *
 * @param doc     document whose {@code pluginmap} entry is removed
 * @param oldData object being replaced
 * @param newData replacement object
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private void replaceItem(BeautiDoc doc, BEASTInterface oldData, BEASTInterface newData) {
    doc.pluginmap.remove(newData.getID());

    // iterate over a copy of the outputs (presumably because rewiring inputs
    // can change the live collection - confirm against Input.setValue)
    final Set<BEASTInterface> consumers = new LinkedHashSet<>(oldData.getOutputs());
    for (BEASTInterface consumer : consumers) {
        for (Input input : consumer.listInputs()) {
            final Object value = input.get();
            if (value == oldData) {
                input.setValue(newData, consumer);
                continue;
            }
            if (!(value instanceof List)) {
                continue;
            }
            final List members = (List) value;
            final int position = members.indexOf(oldData);
            if (position >= 0) {
                members.set(position, newData);
                newData.getOutputs().add(consumer);
            }
        }
    }
}
/**
 * Opens an {@code AlignmentViewer} dialog for the given alignment. Any
 * exception raised while doing so is reported in a message dialog and its
 * stack trace is printed.
 *
 * @param alignment alignment to show
 * @param doc       not used by this implementation
 */
void editAlignment(Alignment alignment, BeautiDoc doc) {
    try {
        new AlignmentViewer(alignment).showInDialog();
    } catch (Exception e) {
        final String message = "Something went wrong viewing the alignment: " + e.getMessage();
        JOptionPane.showMessageDialog(null, message);
        e.printStackTrace();
    }
}
/**
 * Checks the validity of the alignment.
 *
 * @return {@code null} if there are no problems, otherwise a message
 *         describing what looks wrong; this base implementation performs no
 *         checks and always returns {@code null}
 */
String validateAlignment() {
    final String noProblems = null;
    return noProblems;
}
/**
 * Returns the template to apply to a newly created alignment.
 *
 * @return the value of the {@code template} input
 */
protected BeautiSubTemplate getStartTemplate() {
    final BeautiSubTemplate startTemplate = template.get();
    return startTemplate;
}
/**
 * Sorts the given sequences in place, ordered by the natural
 * {@code compareTo} order of their {@code taxonInput} values.
 *
 * @param seqs list to sort; modified in place
 */
protected void sortByTaxonName(List<Sequence> seqs) {
    Collections.sort(seqs,
            (left, right) -> left.taxonInput.get().compareTo(right.taxonInput.get()));
}
/**
 * Loads an alignment from an XML file.
 * <p>
 * The file is first parsed as a BEAST 2 XML fragment and the first
 * {@link Alignment} found in the resulting object graph is initialised and
 * returned. If anything in that path fails (including "no alignment found"),
 * the text read so far is parsed as BEAST 1 XML instead, and the alignment ID
 * is derived from the file name (everything before the first dot).
 * <p>
 * If both attempts fail, the stack trace of the first failure is printed, a
 * message dialog shows both error messages and {@code null} is returned.
 *
 * @param file XML file to read
 * @return the alignment, or {@code null} if the file could not be loaded
 */
static public BEASTInterface getXMLData(File file) {
    // kept outside the try so the BEAST 1 fallback can reuse what was read
    final StringBuilder content = new StringBuilder();
    try {
        // parse as BEAST 2 xml fragment
        XMLParser parser = new XMLParser();
        // try-with-resources closes the reader even when reading fails
        try (BufferedReader fin = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = fin.readLine()) != null) {
                content.append(line).append('\n');
            }
        }
        BEASTInterface runnable = parser.parseBareFragment(content.toString(), false);
        BEASTInterface alignment = getAlignment(runnable);
        if (alignment == null) {
            // explicit failure instead of a NullPointerException; triggers the fallback below
            throw new IllegalArgumentException("No alignment found in " + file.getName());
        }
        alignment.initAndValidate();
        return alignment;
    } catch (Exception ex) {
        // attempt to parse as BEAST 1 xml
        try {
            // ID = file name up to its first dot; names without a dot are used whole
            String name = file.getName();
            int lastDot = name.lastIndexOf('.');
            String ID = (lastDot < 0 ? name : name.substring(0, lastDot)).replaceAll("\\..*", "");
            BEASTInterface alignment = parseBeast1XML(ID, content.toString());
            if (alignment != null) {
                alignment.setID(ID);
            }
            return alignment;
        } catch (Exception ex2) {
            ex.printStackTrace();
            JOptionPane.showMessageDialog(null, "Loading of " + file.getName() + " failed: " + ex.getMessage()
                    + "\n" + ex2.getMessage());
        }
        return null;
    }
}
/**
 * Builds an {@link Alignment} from the first {@code <alignment>} element of a
 * BEAST 1 XML document.
 * <p>
 * The element's {@code dataType} attribute selects the data type:
 * "dna"/"nucleotide" map to "nucleotide" (total count 4),
 * "aminoacid"/"protein" map to "aminoacid" (total count 20), and a missing or
 * unrecognised value maps to "integer" (total count stays 4). Every
 * {@code <sequence>} child becomes a {@link Sequence} whose taxon is the
 * {@code idref} of its {@code <taxon>} child and whose value is the element's
 * text content.
 *
 * @param ID  ID to assign to the alignment
 * @param xml BEAST 1 XML text
 * @return the initialised alignment
 * @throws SAXException                 if the XML is not well-formed
 * @throws IOException                  if reading the XML text fails
 * @throws ParserConfigurationException if no XML parser can be created
 * @throws IllegalArgumentException     if the document has no {@code <alignment>} element
 */
private static BEASTInterface parseBeast1XML(String ID, String xml) throws SAXException, IOException, ParserConfigurationException {
    // NOTE(review): the parser is created with default settings; if files from
    // untrusted sources are ever loaded, harden the factory against XXE.
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    Document doc = factory.newDocumentBuilder().parse(new InputSource(new StringReader(xml)));
    doc.normalize();

    // parse first alignment
    NodeList alignments = doc.getElementsByTagName("alignment");
    org.w3c.dom.Node node = alignments.item(0);
    if (node == null) {
        throw new IllegalArgumentException("No <alignment> element found in BEAST 1 XML");
    }

    Alignment alignment = new Alignment();
    alignment.dataTypeInput.setValue("nucleotide", alignment);

    // getNamedItem returns null for an absent attribute, so guard before
    // reading its value; otherwise the "missing dataType" branch is unreachable
    org.w3c.dom.Node dataTypeAttribute = node.getAttributes().getNamedItem("dataType");
    String dataTypeName = dataTypeAttribute == null ? null : dataTypeAttribute.getNodeValue();

    int totalCount = 4;
    if (dataTypeName == null) {
        alignment.dataTypeInput.setValue("integer", alignment);
    } else if (dataTypeName.equalsIgnoreCase("dna") || dataTypeName.equalsIgnoreCase("nucleotide")) {
        alignment.dataTypeInput.setValue("nucleotide", alignment);
        totalCount = 4;
    } else if (dataTypeName.equalsIgnoreCase("aminoacid") || dataTypeName.equalsIgnoreCase("protein")) {
        alignment.dataTypeInput.setValue("aminoacid", alignment);
        totalCount = 20;
    } else {
        alignment.dataTypeInput.setValue("integer", alignment);
    }

    NodeList children = node.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
        org.w3c.dom.Node child = children.item(i);
        if (child.getNodeName().equals("sequence")) {
            Sequence sequence = new Sequence();
            // find the taxon
            String taxon = "";
            NodeList sequenceChildren = child.getChildNodes();
            for (int j = 0; j < sequenceChildren.getLength(); j++) {
                org.w3c.dom.Node child2 = sequenceChildren.item(j);
                if (child2.getNodeName().equals("taxon")) {
                    taxon = child2.getAttributes().getNamedItem("idref").getNodeValue();
                }
            }
            String data = child.getTextContent();
            sequence.initByName("totalcount", totalCount, "taxon", taxon, "value", data);
            sequence.setID("seq_" + taxon);
            alignment.sequenceInput.setValue(sequence, alignment);
        }
    }
    alignment.setID(ID);
    alignment.initAndValidate();
    return alignment;
} // parseBeast1XML
/**
 * Depth-first search for the first {@link Alignment} reachable from the given
 * object through {@code listActiveBEASTObjects()}.
 *
 * @param beastObject root of the search
 * @return {@code beastObject} itself if it is an alignment, otherwise the first
 *         alignment found below it, or {@code null} if there is none
 */
static BEASTInterface getAlignment(BEASTInterface beastObject) throws IllegalArgumentException, IllegalAccessException {
    if (beastObject instanceof Alignment) {
        return beastObject;
    }
    for (BEASTInterface child : beastObject.listActiveBEASTObjects()) {
        final BEASTInterface hit = getAlignment(child);
        if (hit != null) {
            return hit;
        }
    }
    return null;
}
/**
 * Importer for NEXUS files, backed by {@link NexusParser}.
 */
@Description("NEXUS file importer")
class NexusImporter implements AlignmentImporter {
    @Override
    public String[] getFileExtensions() {
        return new String[]{"nex","nxs","nexus"};
    }

    /**
     * Parses the file and returns its alignments followed by the parser's
     * calibrations (if any).
     * <p>
     * If the parser produced filtered alignments (partitions), those are
     * returned, each with its sequences sorted by taxon name, and the user is
     * warned about partitions that share sites. Otherwise the single parsed
     * alignment is returned.
     * <p>
     * On failure the stack trace is printed, an error dialog is shown and an
     * empty list is returned, like the other built-in importers, so callers
     * can pass the result to {@code addAll} without a null check.
     *
     * @param file NEXUS file to load
     * @return loaded objects; empty if loading failed
     */
    @Override
    public List<BEASTInterface> loadFile(File file) {
        List<BEASTInterface> selectedBEASTObjects = new ArrayList<>();
        NexusParser parser = new NexusParser();
        try {
            parser.parseFile(file);
            if (parser.filteredAlignments.size() > 0) {
                /**
                 * sanity check: make sure the filters do not
                 * overlap
                 **/
                // used[site] = number (1-based) of the first partition that claims the site
                int[] used = new int[parser.m_alignment.getSiteCount()];
                // each overlap is encoded as firstPartition * 10000 + laterPartition
                Set<Integer> overlap = new HashSet<>();
                int partitionNr = 1;
                for (Alignment data : parser.filteredAlignments) {
                    int[] indices = ((FilteredAlignment) data).indices();
                    for (int i : indices) {
                        if (used[i] > 0) {
                            overlap.add(used[i] * 10000 + partitionNr);
                        } else {
                            used[i] = partitionNr;
                        }
                    }
                    partitionNr++;
                }
                if (overlap.size() > 0) {
                    StringBuilder overlaps = new StringBuilder("<html>Warning: The following partitions overlap:<br/>");
                    for (int i : overlap) {
                        overlaps.append(parser.filteredAlignments.get(i / 10000 - 1).getID())
                                .append(" overlaps with ")
                                .append(parser.filteredAlignments.get(i % 10000 - 1).getID())
                                .append("<br/>");
                    }
                    overlaps.append("The first thing you might want to do is delete some of these partitions.</html>");
                    JOptionPane.showMessageDialog(null, overlaps.toString());
                }
                /** add alignments **/
                for (Alignment data : parser.filteredAlignments) {
                    sortByTaxonName(data.sequenceInput.get());
                    selectedBEASTObjects.add(data);
                }
            } else {
                selectedBEASTObjects.add(parser.m_alignment);
            }
            // calibrations follow the alignments in both cases
            if (parser.calibrations != null) {
                selectedBEASTObjects.addAll(parser.calibrations);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
            JOptionPane.showMessageDialog(null, "Loading of " + file.getPath() + " failed: " + ex.getMessage());
            // nothing usable was loaded; return an empty list rather than null
            return new ArrayList<>();
        }
        return selectedBEASTObjects;
    }
}
/**
 * Importer for BEAST XML files, backed by {@link #getXMLData(File)}.
 */
@Description("BEAST XML file importer")
class XMLImporter implements AlignmentImporter {
    @Override
    public String[] getFileExtensions() {
        return new String[]{"xml"};
    }

    /**
     * Loads the alignment contained in the given XML file.
     *
     * @param file XML file to load
     * @return a list holding the alignment, or an empty list when
     *         {@code getXMLData} could not load the file (it returns
     *         {@code null} in that case, after reporting the error itself)
     */
    @Override
    public List<BEASTInterface> loadFile(File file) {
        List<BEASTInterface> selectedBEASTObjects = new ArrayList<>();
        BEASTInterface alignment = getXMLData(file);
        // never put a null element into the result list
        if (alignment != null) {
            selectedBEASTObjects.add(alignment);
        }
        return selectedBEASTObjects;
    }
}
/**
 * How the FASTA importer decides on the data type of an alignment:
 * {@code userdefined} asks the user for each file, while {@code aminoacid}
 * and {@code nucleotide} apply that type without asking.
 */
enum dtype {
    userdefined,
    aminoacid,
    nucleotide
}
@Description("Fasta file importer")
class FastaImporter implements AlignmentImporter {
dtype datatype = dtype.userdefined;
/** Creates an importer that asks the user for the data type of each file. */
public FastaImporter() {
    this.datatype = dtype.userdefined;
}
/** @return the file extensions this importer lists for FASTA files */
@Override
public String[] getFileExtensions() {
    final String[] fastaExtensions = {"fa","fas","fst","fasta","fna","ffn","faa","frn"};
    return fastaExtensions;
}
/**
 * Loads a FASTA file as a single alignment.
 * <p>
 * Lines starting with ';' are comments, lines starting with '>' start a new
 * taxon (named by the text up to the first whitespace), and all other lines
 * are sequence data appended to the current taxon. All sequences must have
 * the same length once whitespace is removed. 'X'/'x' characters are stored
 * as '?'.
 * <p>
 * Files ending in .fna, .ffn or .frn are always treated as nucleotide data.
 * For all other files the type is guessed per sequence and then, depending on
 * this importer's {@code datatype} setting, either confirmed by the user in a
 * dialog or forced to the type chosen earlier via an "all are ..." option.
 * <p>
 * On failure the stack trace is printed, an error dialog is shown and an
 * empty list is returned.
 *
 * @param file FASTA file to load
 * @return a list holding the alignment; empty if loading failed
 */
@Override
public List<BEASTInterface> loadFile(File file) {
    List<BEASTInterface> selectedBEASTObjects = new ArrayList<>();
    try {
        // grab alignment data
        Map<String, StringBuilder> seqMap = new HashMap<>();
        List<String> taxa = new ArrayList<>();
        String currentTaxon = null;
        String missing = "?";
        String gap = "-";
        int totalCount = 4;
        String dataTypeName = "nucleotide";
        // According to http://en.wikipedia.org/wiki/FASTA_format lists file formats and their data content
        // .fna = nucleic acid
        // .ffn = nucleotide coding regions
        // .frn = non-coding RNA
        // .faa = amino acid
        String lowerCaseName = file.getName().toLowerCase();
        boolean mayBeAminoacid = !(lowerCaseName.endsWith(".fna") || lowerCaseName.endsWith(".ffn") || lowerCaseName.endsWith(".frn"));

        // try-with-resources closes the reader on every exit path
        try (BufferedReader fin = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = fin.readLine()) != null) {
                if (line.startsWith(";")) {
                    // it is a comment, ignore
                } else if (line.startsWith(">")) {
                    // it is a taxon
                    currentTaxon = line.substring(1).trim();
                    // only up to first space
                    currentTaxon = currentTaxon.replaceAll("\\s.*$", "");
                } else {
                    // it is a data line
                    if (currentTaxon == null) {
                        if (line.trim().isEmpty()) {
                            // tolerate blank lines before the first header
                            continue;
                        }
                        throw new RuntimeException("Expected taxon defined on first line");
                    }
                    StringBuilder sb = seqMap.get(currentTaxon);
                    if (sb == null) {
                        sb = new StringBuilder();
                        seqMap.put(currentTaxon, sb);
                        taxa.add(currentTaxon);
                    }
                    sb.append(line);
                }
            }
        }
        if (taxa.isEmpty()) {
            throw new IllegalArgumentException("No sequences found");
        }

        int charCount = -1;
        Alignment alignment = new Alignment();
        for (final String taxon : taxa) {
            String data = seqMap.get(taxon).toString().replaceAll("\\s", "");
            if (charCount < 0) {charCount = data.length();}
            if (data.length() != charCount) {
                throw new IllegalArgumentException("Expected sequence of length " + charCount + " instead of " + data.length() + " for taxon " + taxon);
            }
            // map to standard missing and gap chars
            data = data.replace(missing.charAt(0), DataType.MISSING_CHAR);
            data = data.replace(gap.charAt(0), DataType.GAP_CHAR);
            // once one sequence looks like protein, the whole alignment is treated
            // as amino acid, including the sequences that were already added
            if (mayBeAminoacid && dataTypeName.equals("nucleotide") &&
                    "aminoacid".equals(guessSequenceType(data))) {
                dataTypeName = "aminoacid";
                totalCount = 20;
                for (Sequence seq : alignment.sequenceInput.get()) {
                    seq.totalCountInput.setValue(totalCount, seq);
                }
            }
            final Sequence sequence = new Sequence();
            data = data.replaceAll("[Xx]", "?");
            sequence.init(totalCount, taxon, data);
            sequence.setID(NexusParser.generateSequenceID(taxon));
            alignment.sequenceInput.setValue(sequence, alignment);
        }

        // ID = file name up to its first dot; names without a dot are used whole
        String ID = file.getName();
        int lastDot = ID.lastIndexOf('.');
        ID = (lastDot < 0 ? ID : ID.substring(0, lastDot)).replaceAll("\\..*", "");
        alignment.setID(ID);

        if (mayBeAminoacid) {
            switch (this.datatype) {
            case userdefined: {
                // make user choose
                JComboBox<String> jcb = new JComboBox<>(new String[]{"aminoacid", "nucleotide", "all are aminoacid", "all are nucleotide"});
                jcb.setEditable(true);
                jcb.setSelectedItem(dataTypeName);
                JOptionPane.showMessageDialog(null, jcb, "Choose the datatype of alignment " + alignment.getID(), JOptionPane.QUESTION_MESSAGE);
                // the combo box is editable, so the selection may be null or free
                // text; anything unrecognised keeps the guessed type
                Object selected = jcb.getSelectedItem();
                switch (selected == null ? "" : selected.toString()) {
                case "aminoacid": dataTypeName = "aminoacid"; totalCount = 20; break;
                case "nucleotide": dataTypeName = "nucleotide"; totalCount = 4; break;
                case "all are aminoacid": dataTypeName = "aminoacid"; this.datatype = dtype.aminoacid; totalCount = 20; break;
                case "all are nucleotide": dataTypeName = "nucleotide"; this.datatype = dtype.nucleotide; totalCount = 4; break;
                default: break;
                }
                break;
            }
            case aminoacid:
                dataTypeName = "aminoacid";
                totalCount = 20;
                break;
            case nucleotide:
                dataTypeName = "nucleotide";
                totalCount = 4;
                break;
            }
            for (Sequence seq : alignment.sequenceInput.get()) {
                seq.totalCountInput.setValue(totalCount, seq);
            }
        }
        alignment.dataTypeInput.setValue(dataTypeName, alignment);
        alignment.initAndValidate();
        selectedBEASTObjects.add(alignment);
    } catch (Exception e) {
        e.printStackTrace();
        JOptionPane.showMessageDialog(null, "Loading of " + file.getName() + " failed: " + e.getMessage());
    }
    return selectedBEASTObjects;
}
/** Ported from jebl2.
 * Guesses the type of a sequence from its contents.
 * <p>
 * A sequence made up solely of the characters {@code ACGTUXN} (either case),
 * {@code ?}, {@code _} and {@code -} is reported as "nucleotide"; any other
 * character makes it "aminoacid". This implementation never returns
 * {@code null}.
 *
 * @param seq the sequence
 * @return "nucleotide" or "aminoacid"
 */
public String guessSequenceType(final String seq) {
    final int total = seq.length();
    // length excluding gap/missing characters
    int ungappedLength = total;
    int canonical = 0;   // A, C, G, T
    int ambiguous = 0;   // U, X, N
    boolean allNucleotide = true;

    // do not use toCharArray: it allocates an array the size of the sequence
    for (int pos = 0; pos < total; pos++) {
        final char ch = seq.charAt(pos);
        if ("ACGTacgt".indexOf(ch) > -1) {
            canonical++;
        } else if ("?_-".indexOf(ch) > -1) {
            ungappedLength--;
        } else if ("UXNuxn".indexOf(ch) > -1) {
            ambiguous++;
        } else {
            // not a nucleotide symbol: the counts are irrelevant from here on
            allNucleotide = false;
            break;
        }
    }

    if (!allNucleotide) {
        return "aminoacid";
    }
    // All sites are nucleotides (actual or ambiguous). If 100 sites or more, declare it a nuc
    if (ungappedLength >= 100) {
        return "nucleotide";
    }
    // if short, ask for 70% of ACGT or N
    // note: this implicitly assumes that every valid nucleotide symbol is
    // also a valid amino acid symbol
    final double threshold = 0.7;
    final int nucStates = canonical + ambiguous;
    if (nucStates >= ungappedLength * threshold) {
        return "nucleotide";
    }
    return "aminoacid";
}
}
}