package org.phenoscape.io; import java.io.File; import java.io.IOException; import java.io.Reader; import java.net.URI; import java.net.URISyntaxException; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.UUID; import javax.xml.namespace.QName; import org.apache.log4j.Logger; import org.apache.xmlbeans.XmlAnySimpleType; import org.apache.xmlbeans.XmlException; import org.bioontologies.obd.schema.pheno.PhenotypeDocument; import org.nexml.schema_2009.AbstractBlock; import org.nexml.schema_2009.AbstractChar; import org.nexml.schema_2009.AbstractMapping; import org.nexml.schema_2009.AbstractObs; import org.nexml.schema_2009.AbstractObsMatrix; import org.nexml.schema_2009.AbstractObsRow; import org.nexml.schema_2009.AbstractState; import org.nexml.schema_2009.AbstractStates; import org.nexml.schema_2009.AbstractUncertainStateSet; import org.nexml.schema_2009.Annotated; import org.nexml.schema_2009.Nexml; import org.nexml.schema_2009.NexmlDocument; import org.nexml.schema_2009.StandardCells; import org.nexml.schema_2009.StandardChar; import org.nexml.schema_2009.StandardFormat; import org.nexml.schema_2009.StandardStates; import org.nexml.schema_2009.Taxa; import org.obo.datamodel.DanglingObject; import org.obo.datamodel.IdentifiedObject; import org.obo.datamodel.OBOClass; import org.obo.datamodel.OBOSession; import org.obo.datamodel.ObsoletableObject; import org.obo.datamodel.impl.DanglingClassImpl; import org.phenoscape.io.NeXMLUtil.LiteralContents; import org.phenoscape.io.NeXMLUtil.OBOURISyntaxException; import org.phenoscape.model.Association; import org.phenoscape.model.AssociationSupport; import org.phenoscape.model.Character; import org.phenoscape.model.DataSet; import org.phenoscape.model.MultipleState; import org.phenoscape.model.MultipleState.MODE; import org.phenoscape.model.Phenotype; import org.phenoscape.model.Specimen; import org.phenoscape.model.State; import org.phenoscape.model.Taxon; import org.w3c.dom.Attr; import org.w3c.dom.Element; import org.w3c.dom.NodeList; public class NeXMLReader { private final DataSet data = new DataSet(); private final NexmlDocument xmlDoc; private final OBOSession session; private final Set<String> danglers = new HashSet<String>(); private final Set<String> secondaryIDs = new HashSet<String>(); private final Set<String> replacedIDs = new HashSet<String>(); private String charactersBlockID = UUID.randomUUID().toString(); private final Map<String, State> allStates = new HashMap<String, State>(); public NeXMLReader(File aFile, OBOSession session) throws XmlException, IOException { this.session = session; this.xmlDoc = NexmlDocument.Factory.parse(aFile); this.parseNeXML(); } public NeXMLReader(Reader aReader, OBOSession session) throws XmlException, IOException { this.session = session; this.xmlDoc = NexmlDocument.Factory.parse(aReader); this.parseNeXML(); } public DataSet getDataSet() { return this.data; } public NexmlDocument getXMLDoc() { return this.xmlDoc; } public String getCharactersBlockID() { return this.charactersBlockID; } /** * Returns true if the reader had to create dangling terms for referenced IDs not found in the OBOSession. */ public boolean didCreateDanglers() { return !this.danglers.isEmpty(); } /** * Returns the list of IDs referenced in the file that were not found in the OBOSession. */ public Collection<String> getDanglersList() { return this.danglers; } /** * Returns true if the reader had to find any referenced terms via a secondary ID. */ public boolean didMigrateSecondaryIDs() { return !this.secondaryIDs.isEmpty(); } /** * Returns the list of IDs referenced in the file that were found to be secondary IDs. */ public Collection<String> getMigratedSecondaryIDsList() { return this.secondaryIDs; } public boolean didReplaceObsoleteTerms() { return !this.replacedIDs.isEmpty(); } public Collection<String> getReplacedIDsList() { return this.replacedIDs; } private void parseNeXML() { this.parseMetadata(this.xmlDoc.getNexml()); for (AbstractBlock block : this.xmlDoc.getNexml().getCharactersArray()) { if (block instanceof StandardCells) { this.charactersBlockID = block.getId(); final StandardCells cells = (StandardCells)block; this.parseStandardCells(cells); final Taxa taxa = NeXMLUtil.findOrCreateTaxa(this.xmlDoc, cells.getOtus()); this.parseTaxa(taxa); final AbstractObsMatrix abstractMatrix = cells.getMatrix(); if (abstractMatrix != null) { this.parseMatrix(abstractMatrix); } break; } } } private void parseStandardCells(StandardCells standardCells) { if (!(standardCells.getFormat() instanceof StandardFormat)) return; final StandardFormat format = (StandardFormat)(standardCells.getFormat()); for (AbstractChar abstractChar : format.getCharArray()) { if (!(abstractChar instanceof StandardChar)) continue; final StandardChar standardChar = (StandardChar)abstractChar; final Character newCharacter; if (standardChar.getStates() != null) { newCharacter = new Character(standardChar.getId(), standardChar.getStates()); } else { newCharacter = new Character(standardChar.getId()); } newCharacter.setLabel(standardChar.getLabel()); newCharacter.setComment(this.getComment(standardChar)); newCharacter.setFigure(this.getFigure(standardChar)); newCharacter.setDiscussion(this.getDiscussion(standardChar)); newCharacter.setDenotes(this.getDenotes(standardChar)); final AbstractStates states = NeXMLUtil.findOrCreateStates(format, newCharacter.getStatesNexmlID()); if (states instanceof StandardStates) { for (AbstractState abstractState : states.getStateArray()) { final State newState = new State(abstractState.getId()); newState.setSymbol(this.readSymbol(abstractState)); newState.setLabel(abstractState.getLabel()); newState.setComment(this.getComment(abstractState)); newState.setFigure(this.getFigure(abstractState)); this.allStates.put(newState.getNexmlID(), newState); final Object phenotypeObj = NeXMLUtil.getFirstMetadataValue(abstractState, NeXMLUtil.PHENOTYPE_PREDICATE); if (phenotypeObj instanceof LiteralContents) { final LiteralContents literal = (LiteralContents)phenotypeObj; // we need to get the last PhenoXML element that is different from the first, due to a now fixed bug which caused // phenotypes to be appended in files, instead of replaced // this will allow it to read in the latest work before destroying the unnecessary elements upon save final NodeList phenotypeElements = literal.getElement().getElementsByTagNameNS(NeXMLUtil.PHENOXML_NAMESPACE, "phenotype"); boolean first = true; List<Phenotype> firstPhenotypeList = null; List<Phenotype> newestPhenotypeList = null; for (int i = 0; i < phenotypeElements.getLength(); i++) { final Element phenoXML = (Element)(phenotypeElements.item(i)); try { final PhenotypeDocument xmlPhen = org.bioontologies.obd.schema.pheno.PhenotypeDocument.Factory.parse(phenoXML); final PhenoXMLAdapter adapter = new PhenoXMLAdapter(this.session); final List<Phenotype> phenotypes = adapter.parsePhenotype(xmlPhen.getPhenotype()); if (first) { first = false; firstPhenotypeList = phenotypes; newestPhenotypeList = phenotypes; } else { if (!phenotypes.equals(firstPhenotypeList)) { newestPhenotypeList = phenotypes; } } this.danglers.addAll(adapter.getDanglersList()); this.secondaryIDs.addAll(adapter.getMigratedSecondaryIDsList()); this.replacedIDs.addAll(adapter.getReplacedIDsList()); } catch (XmlException e) { log().error("Failed to parse OBO phenotype", e); } } if (newestPhenotypeList != null) { for (Phenotype phenotype : newestPhenotypeList) { newState.addPhenotype(phenotype); } } } newCharacter.addState(newState); } for (AbstractUncertainStateSet set : states.getUncertainStateSetArray()) { this.createMultiState(set, MODE.UNCERTAIN); } for (AbstractUncertainStateSet set : states.getPolymorphicStateSetArray()) { this.createMultiState(set, MODE.POLYMORPHIC); } } this.data.addCharacter(newCharacter); } } private MultipleState createMultiState(AbstractUncertainStateSet set, MODE mode) { log().debug("Creating multistate: " + set); final Set<State> memberStates = new HashSet<State>(); for (AbstractMapping mapping : set.getMemberArray()) { memberStates.add(this.allStates.get(mapping.getState())); } final MultipleState state = new MultipleState(set.getId(), memberStates, mode); this.allStates.put(state.getNexmlID(), state); return state; } /** * The NeXML schema forces symbols to be an integer, but I * believe this is incorrect. This method should allow us * to read any kind of symbol. */ private String readSymbol(AbstractState state) { final XmlAnySimpleType symbol = state.getSymbol(); final Attr attribute = (Attr)(symbol.getDomNode()); return attribute.getValue(); } private void parseMatrix(AbstractObsMatrix matrix) { final Map<String, Map<String, State>> matrixMap = new HashMap<String, Map<String, State>>(); for (AbstractObsRow row : matrix.getRowArray()) { final String otuID = row.getOtu(); if (otuID != null) { final Map<String, State> currentTaxonMap = new HashMap<String, State>(); matrixMap.put(otuID, currentTaxonMap); for (AbstractObs cell : row.getCellArray()) { final String characterID = cell.getChar() != null ? cell.getChar().getStringValue() : null; final String stateID = cell.getState() != null ? cell.getState().getStringValue() : null; final State state = this.allStates.get(stateID); if (characterID != null && state != null) { currentTaxonMap.put(characterID, state); final List<Object> suppportMetas = NeXMLUtil.getMetadataValues(cell, NeXMLUtil.ENTAILED_BY_PREDICATE); for (Object supportMeta : suppportMetas) { if (supportMeta instanceof Map<?,?>) { @SuppressWarnings("unchecked") final Map<QName, List<Object>> map = (Map<QName, List<Object>>)supportMeta; if (map.containsKey(NeXMLUtil.DC_IDENTIFIER) && map.containsKey(NeXMLUtil.DC_DESCRIPTION_PREDICATE) && map.containsKey(NeXMLUtil.DC_SOURCE_PREDICATE)) { final String identifier = stringOrNull(NeXMLUtil.first(map.get(NeXMLUtil.DC_IDENTIFIER))); final String description = stringOrNull(NeXMLUtil.first(map.get(NeXMLUtil.DC_DESCRIPTION_PREDICATE))); final String source = stringOrNull(NeXMLUtil.first(map.get(NeXMLUtil.DC_SOURCE_PREDICATE))); final boolean direct = Boolean.parseBoolean(stringOrNull(NeXMLUtil.first(map.get(NeXMLUtil.IS_DIRECT_PREDICATE)))); final AssociationSupport associationSupport = new AssociationSupport(description, source, direct); final Association association = new Association(otuID, characterID, identifier); final Set<AssociationSupport> supports; if (this.data.getAssociationSupport().containsKey(association)) { supports = this.data.getAssociationSupport().get(association); } else { supports = new HashSet<AssociationSupport>(); this.data.getAssociationSupport().put(association, supports); } supports.add(associationSupport); } } } } } } } this.data.setMatrixData(matrixMap); } private void parseMetadata(Nexml nexml) { final Object curatorsObj = NeXMLUtil.getFirstMetadataValue(nexml, NeXMLUtil.CURATORS_PREDICATE); this.data.setCurators(stringOrNull(curatorsObj)); final Object pubSourceObj = NeXMLUtil.getFirstMetadataValue(nexml, NeXMLUtil.PUBLICATION_SOURCE_PREDICATE); if (pubSourceObj instanceof Map<?,?>) { @SuppressWarnings("unchecked") final Map<QName, List<Object>> map = (Map<QName, List<Object>>)pubSourceObj; if (map.containsKey(NeXMLUtil.PUBLICATION_LABEL_PREDICATE)) { final List<Object> labelList = map.get(NeXMLUtil.PUBLICATION_LABEL_PREDICATE); this.data.setPublicationLabel(stringOrNull(NeXMLUtil.first(labelList))); } if (map.containsKey(NeXMLUtil.PUBLICATION_CITATION_PREDICATE)) { final List<Object> citationList = map.get(NeXMLUtil.PUBLICATION_CITATION_PREDICATE); this.data.setPublicationCitation(stringOrNull(NeXMLUtil.first(citationList))); } if (map.containsKey(NeXMLUtil.PUBLICATION_URI_PREDICATE)) { final List<Object> uriList = map.get(NeXMLUtil.PUBLICATION_URI_PREDICATE); this.data.setPublicationURI(stringOrNull(NeXMLUtil.first(uriList))); } } final Object pubNotesObj = NeXMLUtil.getFirstMetadataValue(nexml, NeXMLUtil.DC_DESCRIPTION_PREDICATE); this.data.setPublicationNotes(stringOrNull(pubNotesObj)); } @SuppressWarnings("unchecked") private void parseTaxa(Taxa taxa) { for (org.nexml.schema_2009.Taxon xmlTaxon : taxa.getOtuArray()) { final Taxon newTaxon = new Taxon(xmlTaxon.getId()); newTaxon.setPublicationName((xmlTaxon.getLabel() == null || xmlTaxon.getLabel().equals("")) ? null : xmlTaxon.getLabel()); final Object validNameObj = NeXMLUtil.getFirstMetadataValue(xmlTaxon, NeXMLUtil.VALID_NAME_PREDICATE); if (validNameObj != null) { try { final String validNameID = NeXMLUtil.oboID(new URI(validNameObj.toString())); newTaxon.setValidName(this.getTerm(validNameID)); } catch (OBOURISyntaxException e) { log().error("Value for taxon ID is not a valid OBO URI", e); } catch (URISyntaxException e) { log().error("Value for taxon ID is not a valid URI", e); } } newTaxon.setComment(this.getComment(xmlTaxon)); newTaxon.setFigure(this.getFigure(xmlTaxon)); newTaxon.setMatrixTaxonName(this.getMatrixTaxon(xmlTaxon)); final List<Object> specimens = NeXMLUtil.getMetadataValues(xmlTaxon, NeXMLUtil.SPECIMEN_PREDICATE); for (Object specimenData : specimens) { if (specimenData instanceof Map<?,?>) { final Map<QName, List<Object>> map = (Map<QName, List<Object>>)specimenData; final Specimen newSpecimen = newTaxon.newSpecimen(); if (map.containsKey(NeXMLUtil.COLLECTION_PREDICATE)) { final List<Object> collectionIDList = map.get(NeXMLUtil.COLLECTION_PREDICATE); try { final String collectionIDURI = this.stringOrNull(NeXMLUtil.first(collectionIDList)); if (collectionIDURI != null) { final String collectionID = NeXMLUtil.oboID(new URI(collectionIDURI)); newSpecimen.setCollectionCode(this.getTerm(collectionID)); } } catch (OBOURISyntaxException e) { log().error("Value for collection ID is not a valid OBO URI", e); } catch (URISyntaxException e) { log().error("Value for collection ID is not a valid URI", e); } } if (map.containsKey(NeXMLUtil.ACCESSION_PREDICATE)) { final List<Object> accessionList = map.get(NeXMLUtil.ACCESSION_PREDICATE); newSpecimen.setCatalogID(stringOrNull(NeXMLUtil.first(accessionList))); } if (map.containsKey(NeXMLUtil.COMMENT_PREDICATE)) { final List<Object> commentList = map.get(NeXMLUtil.COMMENT_PREDICATE); newSpecimen.setComment(stringOrNull(NeXMLUtil.first(commentList))); } } } this.data.addTaxon(newTaxon); } } private String getComment(Annotated node) { final Object comment = NeXMLUtil.getFirstMetadataValue(node, NeXMLUtil.COMMENT_PREDICATE); return stringOrNull(comment); } private String getDiscussion(Annotated node) { final Object comment = NeXMLUtil.getFirstMetadataValue(node, NeXMLUtil.DISCUSSION_PREDICATE); return stringOrNull(comment); } private URI getDenotes(Annotated node) { final String term = stringOrNull(NeXMLUtil.getFirstMetadataValue(node, NeXMLUtil.DENOTES_PREDICATE)); if (term != null) { return URI.create(term); } else { return null; } } private String getFigure(Annotated node) { final Object figure = NeXMLUtil.getFirstMetadataValue(node, NeXMLUtil.FIGURE_PREDICATE); return stringOrNull(figure); } private String getMatrixTaxon(Annotated node) { final Object matrixName = NeXMLUtil.getFirstMetadataValue(node, NeXMLUtil.MATRIX_NAME_PREDICATE); return stringOrNull(matrixName); } private OBOClass getTerm(String id) { log().debug("Term id: " + id); if (id.equals("http://purl.bioontology.org/ontology/provisional/d0267b99-ff52-4a4c-bd31-ff6dbf4dafcc")) { log().debug("Provisional term"); } final IdentifiedObject term = this.session.getObject(id); if (term instanceof OBOClass) { final OBOClass oboClass = (OBOClass)term; if (oboClass.isObsolete()) { if (!oboClass.getReplacedBy().isEmpty()) { final ObsoletableObject replacement = oboClass.getReplacedBy().iterator().next(); if ((replacement instanceof OBOClass) && (!(replacement instanceof DanglingObject))) { this.replacedIDs.add(id); return (OBOClass)replacement; } else { return oboClass; } } else { return oboClass; } } else { return oboClass; } } else { final OBOClass altTerm = this.findTermByAltID(id); if (altTerm != null) { return altTerm; } else { log().warn("Term not found; creating dangler for " + id); this.danglers.add(id); final OBOClass dangler = new DanglingClassImpl(id.trim()); return dangler; } } } private OBOClass findTermByAltID(String id) { log().debug("Called alt_id search"); final Collection<IdentifiedObject> terms = this.session.getObjects(); for (IdentifiedObject object : terms) { if (object instanceof OBOClass) { final OBOClass term = (OBOClass)object; if (term.getSecondaryIDs().contains(id)) { this.secondaryIDs.add(id); return term; } } } return null; } private String stringOrNull(Object obj) { if (obj == null) { return null; } else { final String string = obj.toString(); if (string.trim().length() < 1) { return null; } else { return string; } } } private Logger log() { return Logger.getLogger(this.getClass()); } }