package uk.ac.ebi.ep.parser.parsers;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import org.apache.log4j.Logger;
import org.springframework.util.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
import uk.ac.ebi.ep.adapter.chembl.ChemblAdapterException;
import uk.ac.ebi.ep.adapter.chembl.ChemblBioactivities;
import uk.ac.ebi.ep.adapter.chembl.ChemblConfig;
import uk.ac.ebi.ep.adapter.chembl.ChemblWsAdapter;
import uk.ac.ebi.ep.adapter.chembl.IChemblAdapter;
import uk.ac.ebi.ep.centralservice.helper.CompoundUtil;
import uk.ac.ebi.ep.centralservice.helper.EbinocleParser;
import uk.ac.ebi.ep.centralservice.helper.MmDatabase;
import uk.ac.ebi.ep.centralservice.helper.Relationship;
import uk.ac.ebi.ep.data.domain.EnzymePortalCompound;
import uk.ac.ebi.ep.data.domain.UniprotEntry;
import uk.ac.ebi.ep.data.repositories.EnzymePortalCompoundRepository;
import uk.ac.ebi.ep.data.repositories.UniprotEntryRepository;
/**
 * Parser for the <code>chembl-target_component.xml</code> file (ebinocle
 * schema). It reads the UniProt accession of every entry and any target IDs. If
 * the accession is an enzyme (that is, it is found through the
 * {@link UniprotEntryRepository}) then the target IDs are used with the
 * chembl-adapter to query for bioactivities, of which only the most
 * significant will be kept and their ChEMBL compound IDs used for the
 * cross-references. $Author$
 * <p>
 * Instances keep per-document parsing state in fields, so a single instance
 * must not be used to parse several documents concurrently.
 *
 * @since 1.0.22
 */
public class ChemblSaxParser extends DefaultHandler implements EbinocleParser {

    private static final Logger LOGGER = Logger.getLogger(ChemblSaxParser.class);

    /**
     * The current element (tree path) being parsed. Iterated bottom-to-top by
     * {@link #getCurrentXpath()}.
     */
    protected Stack<String> currentContext = new Stack<>();

    /**
     * The text value of the current element being parsed. Only filled while
     * the accession field is being read.
     */
    protected StringBuilder currentChars = new StringBuilder();

    /**
     * The current entry (target component, i.e. UniProt entry) being processed.
     */
    private UniprotEntry entry;

    /**
     * The proxy used to extract bioactivities and compound names from ChEMBL.
     */
    private final IChemblAdapter chemblAdapter;

    /* Flags to mark the current element being processed: */
    private boolean isEntry;
    private boolean isRef;
    private boolean isField;
    private boolean isAccession;

    /* Values stored for the current entry being processed: */
    private String accession;
    private final List<String> targetIds = new ArrayList<>();

    private final EnzymePortalCompoundRepository repository;
    private final UniprotEntryRepository uniprotEntryRepository;

    /**
     * Creates a parser which stores the compounds it finds through the given
     * repository. The ChEMBL configuration is read from file; if that fails,
     * a default configuration is used instead.
     *
     * @param repository the repository used to save the compounds.
     * @param entryRepository the repository used to look up UniProt entries
     * by accession.
     */
    public ChemblSaxParser(EnzymePortalCompoundRepository repository, UniprotEntryRepository entryRepository) {
        ChemblConfig chemblConfig;
        try {
            chemblConfig = ChemblConfig.readFromFile();
        } catch (IOException e) {
            LOGGER.error("Unable to read config file, using defaults", e);
            chemblConfig = new ChemblConfig();
        }
        chemblAdapter = new ChemblWsAdapter(chemblConfig);
        this.repository = repository;
        this.uniprotEntryRepository = entryRepository;
    }

    /**
     * Pushes the element onto the current path, updates the element flags and
     * collects any ChEMBL target ID declared in a cross-reference element.
     */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        currentContext.push(localName);
        // Update flags:
        String currentXpath = getCurrentXpath();
        isEntry = DATABASE_ENTRIES_ENTRY.equals(currentXpath);
        isRef = DATABASE_ENTRIES_ENTRY_XREFS_REF.equals(currentXpath);
        isField = DATABASE_ENTRIES_ENTRY_FIELD.equals(currentXpath);
        if (isField) {
            isAccession = "accession".equals(attributes.getValue("", "name"));
        } else if (isRef) {
            if ("ChEMBL-Target".equals(attributes.getValue("", "dbname"))) {
                String targetId = attributes.getValue("", "dbkey");
                // A reference without a key is useless for querying the
                // web service, so it is skipped instead of storing null.
                if (targetId != null) {
                    targetIds.add(targetId);
                }
            }
        }
        // Clear placeholder:
        currentChars.setLength(0);
    }

    /**
     * Stores the accession when the accession field closes, and processes the
     * collected accession and target IDs when the entry closes.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        if (isAccession) {
            accession = currentChars.toString().trim();
        } else if (isEntry) {
            // isEntry is set either by the start of the entry itself or by
            // the end of one of its direct children, so at this point the
            // element being closed is the entry.
            if (accession != null && !targetIds.isEmpty()) {
                //check if accession is enzyme
                entry = uniprotEntryRepository.findByAccession(accession);
                if (entry != null) {
                    LOGGER.info(accession + " is enzyme.");
                    computeCompound(entry);
                    entry = null;
                }
            }
            // Reset the per-entry state for the next entry:
            accession = null;
            targetIds.clear();
        }
        currentContext.pop();
        // Update flags:
        String currentXpath = getCurrentXpath();
        isEntry = DATABASE_ENTRIES_ENTRY.equals(currentXpath);
        isRef = false;
        isField = false;
        isAccession = false;
    }

    /**
     * Accumulates the text of the accession field. The parser may deliver the
     * text of one element in several chunks, hence the append.
     */
    @Override
    public void characters(char[] ch, int start, int length) {
        if (isAccession) {
            currentChars.append(ch, start, length);
        }
    }

    /**
     * Builds one compound per significant ChEMBL compound ID found for the
     * current target IDs and saves them all through the repository. Compound
     * IDs without a preferred name are ignored. Errors from the ChEMBL
     * adapter are logged and nothing is saved.
     *
     * @param e the UniProt entry the compounds are related to.
     */
    private void computeCompound(UniprotEntry e) {
        try {
            final Collection<String> chemblCompoundIds = getFilteredChemblCompounds();
            if (chemblCompoundIds == null) {
                return;
            }
            Set<EnzymePortalCompound> compounds = new HashSet<>();
            for (String chemblCompoundId : chemblCompoundIds) {
                if (StringUtils.isEmpty(chemblCompoundId)) {
                    continue;
                }
                String compoundName = chemblAdapter.getPreferredName(chemblCompoundId);
                if (StringUtils.isEmpty(compoundName)) {
                    continue;
                }
                EnzymePortalCompound chemblEntry = new EnzymePortalCompound();
                chemblEntry.setCompoundSource(MmDatabase.ChEMBL.name());
                chemblEntry.setCompoundId(chemblCompoundId);
                chemblEntry.setCompoundName(compoundName);
                chemblEntry.setRelationship(Relationship.between(
                        MmDatabase.UniProt, MmDatabase.ChEMBL)
                        .name());
                chemblEntry.setUniprotAccession(e);
                chemblEntry = CompoundUtil.computeRole(chemblEntry, chemblEntry.getRelationship());
                compounds.add(chemblEntry);
            }
            repository.save(compounds);
        } catch (ChemblAdapterException ex) {
            LOGGER.error(targetIds, ex);
        }
    }

    /**
     * Gets bioactivities from WS for the target IDs stored in the field
     * <code>{@link #targetIds}</code> and filters them. The filtered compound
     * IDs of all the targets are merged, without duplicates.
     *
     * @return a collection of ChEMBL compound IDs which have significant
     * bioactivities, or <code>null</code> if no bioactivities are found for
     * any of the target IDs.
     * @throws ChemblAdapterException in case of problem retrieving the
     * bioactivities.
     */
    private Collection<String> getFilteredChemblCompounds()
            throws ChemblAdapterException {
        Collection<String> filteredBioactivities = null;
        for (String targetId : targetIds) {
            ChemblBioactivities bioactivities
                    = chemblAdapter.getTargetBioactivities(targetId);
            if (bioactivities == null) {
                continue;
            }
            Collection<String> filtered = bioactivities.filter(
                    chemblAdapter.getConfig().getMinAssays(),
                    chemblAdapter.getConfig().getMinConf4(),
                    chemblAdapter.getConfig().getMinConf9(),
                    chemblAdapter.getConfig().getMinFunc());
            // Accumulate instead of overwriting, so that every target of
            // the entry contributes its compounds, not only the last one.
            if (filteredBioactivities == null) {
                filteredBioactivities = new HashSet<>();
            }
            if (filtered != null) {
                LOGGER.debug(bioactivities.getMap().size()
                        + " bioactivities down to "
                        + filtered.size());
                filteredBioactivities.addAll(filtered);
            }
        }
        return filteredBioactivities;
    }

    /**
     * Builds the path of the element currently being parsed from the names
     * pushed onto {@link #currentContext}, root first. The result starts
     * with a double slash, e.g. <code>//a/b</code> for element b inside
     * root element a.
     *
     * @return the path of the current element.
     */
    protected String getCurrentXpath() {
        StringBuilder xpath = new StringBuilder("/");
        for (String string : currentContext) {
            xpath.append('/').append(string);
        }
        return xpath.toString();
    }

    /**
     * Parses a XML file and stores cross-references in a database.<br>
     * This method is not thread safe.
     *
     * @param xmlFilePath the XML file to parse
     * @throws java.io.FileNotFoundException if the XML file is not
     * found or not readable.
     * @throws org.xml.sax.SAXException if no default XMLReader can be found or
     * instantiated, or exception during parsing.
     * @throws java.io.IOException if the file cannot be opened,
     * or from the parser.
     */
    public void parse(String xmlFilePath) throws Exception {
        File xmlFile = new File(xmlFilePath);
        // try-with-resources: the stream is closed even if parsing fails.
        try (InputStream is = new FileInputStream(xmlFile)) {
            parse(is);
        }
    }

    /**
     * Parses the XML read from the given stream, using this object as
     * content and error handler. The stream is not closed by this method.
     *
     * @param is the stream to read the XML from.
     * @throws IOException from the parser; logged before being rethrown.
     * @throws SAXException if no XMLReader can be created or the parsing
     * fails; logged before being rethrown.
     */
    private void parse(InputStream is) throws IOException, SAXException {
        try {
            XMLReader xr = XMLReaderFactory.createXMLReader();
            xr.setContentHandler(this);
            xr.setErrorHandler(this);
            InputSource source = new InputSource(is);
            LOGGER.info("Parsing start");
            xr.parse(source);
            LOGGER.info("Parsing end");
        } catch (IOException | SAXException e) {
            LOGGER.error("During parsing", e);
            throw e;
        }
    }
}