/*
 * This file is part of ADDIS (Aggregate Data Drug Information System).
 * ADDIS is distributed from http://drugis.org/.
 * Copyright © 2009 Gert van Valkenhoef, Tommi Tervonen.
 * Copyright © 2010 Gert van Valkenhoef, Tommi Tervonen, Tijs Zwinkels,
 * Maarten Jacobs, Hanno Koeslag, Florin Schimbinschi, Ahmad Kamal, Daniel
 * Reid.
 * Copyright © 2011 Gert van Valkenhoef, Ahmad Kamal, Daniel Reid, Florin
 * Schimbinschi.
 * Copyright © 2012 Gert van Valkenhoef, Daniel Reid, Joël Kuiper, Wouter
 * Reckman.
 * Copyright © 2013 Gert van Valkenhoef, Joël Kuiper.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package org.drugis.addis.util;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;

import javax.xml.bind.JAXBException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.lang.StringUtils;
import org.drugis.addis.entities.Arm;
import org.drugis.addis.entities.BasicStudyCharacteristic;
import org.drugis.addis.entities.CategoricalVariableType;
import org.drugis.addis.entities.Domain;
import org.drugis.addis.entities.FrequencyMeasurement;
import org.drugis.addis.entities.Note;
import org.drugis.addis.entities.PopulationCharacteristic;
import org.drugis.addis.entities.PubMedIdList;
import org.drugis.addis.entities.Source;
import org.drugis.addis.entities.Study;
import org.drugis.addis.entities.StudyOutcomeMeasure;
import org.drugis.addis.entities.Variable;
import org.drugis.addis.entities.WhenTaken;
import org.drugis.addis.entities.data.AddisData;
import org.drugis.addis.imports.PubMedIDRetriever;
import org.drugis.addis.util.jaxb.JAXBConvertor;
import org.drugis.addis.util.jaxb.JAXBConvertor.ConversionException;
import org.drugis.addis.util.jaxb.JAXBHandler;
import org.drugis.common.beans.AffixedObservableList;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import com.jgoodies.binding.list.ObservableList;

/**
 * Used to clean up the Diabetes Dataset on http://mantis.drugis.org/file_download.php?file_id=29
 *
 * <p>The clean-up consists of three passes over the wrapped {@link Domain}, run in this order:
 * re-encoding each study's race population characteristic onto the "Race (Taxonomic)"
 * characteristic, dropping variables that are measured in fewer than two studies, and renaming
 * studies to an "author(s), year" title derived from their PubMed record.
 */
public class ConvertDiabetesDatasetUtil {
    /** Name of the population characteristic that race data is re-encoded to. */
    private static final String TARGET_RACE_CHARACTERISTIC = "Race (Taxonomic)";

    /** Category that receives counts of old categories without a matching new category. */
    private static final String FALLBACK_CATEGORY = "Other";

    /** Location of the year used in the generated study name. */
    // NOTE(review): if the fetched PubMed XML has no DateCreated element, no study is renamed;
    // this is now reported on stderr instead of being silently ignored.
    private static final String YEAR_XPATH =
            "/PubmedArticleSet/PubmedArticle[1]/MedlineCitation[1]/DateCreated[1]/Year[1]";

    /** Location of the author last names used in the generated study name. */
    private static final String AUTHOR_XPATH =
            "/PubmedArticleSet/PubmedArticle[1]/MedlineCitation[1]/Article[1]/AuthorList[1]/Author/LastName";

    private final Domain d_domain;

    /**
     * @param domain the domain that the clean-up passes will modify in place
     */
    public ConvertDiabetesDatasetUtil(Domain domain) {
        d_domain = domain;
    }

    /**
     * Reads {@code diabetes-cleaner.addis} from the working directory, applies the clean-up and
     * writes the result to {@code converted.addis}.
     *
     * @param args ignored
     * @throws JAXBException if the input cannot be unmarshalled or the output cannot be marshalled
     * @throws ConversionException if conversion between the XML data and the domain fails
     * @throws IOException if reading or writing the files fails
     */
    public static void main(String[] args) throws JAXBException, ConversionException, IOException {
        AddisData addisData;
        InputStream is = new FileInputStream("diabetes-cleaner.addis");
        try {
            addisData = JAXBHandler.unmarshallAddisData(is);
        } finally {
            // Close even when unmarshalling fails, so the file handle is not leaked.
            is.close();
        }

        Domain domain = JAXBConvertor.convertAddisDataToDomain(addisData);
        ConvertDiabetesDatasetUtil util = new ConvertDiabetesDatasetUtil(domain);
        util.run();

        final AddisData out = JAXBConvertor.convertDomainToAddisData(domain);
        OutputStream fileWrite = new FileOutputStream("converted.addis");
        try {
            JAXBHandler.marshallAddisData(out, fileWrite);
        } finally {
            fileWrite.close();
        }
    }

    /**
     * Runs the three clean-up passes in their fixed order.
     */
    private void run() throws IOException {
        changeRaceEndpoints();
        removeMeasuredOnceOutcomeMeasures();
        renameStudies();
    }

    /**
     * Renames every study that has a PubMed characteristic to a title built from the author
     * last names and year found in the PubMed record of its first PubMed ID.
     *
     * <p>This pass is best-effort: a study whose record cannot be fetched or parsed, or lacks a
     * year or authors, keeps its current name and the reason is printed to standard error.
     */
    private void renameStudies() throws IOException {
        XPath xpath = XPathFactory.newInstance().newXPath();
        for (Study study : d_domain.getStudies()) {
            PubMedIdList pubmed = (PubMedIdList) study.getCharacteristic(BasicStudyCharacteristic.PUBMED);
            if (pubmed == null) {
                continue; // nothing to look up for this study
            }
            try {
                Document doc = getPubMedXML(pubmed);

                XPathExpression yearExpr = xpath.compile(YEAR_XPATH);
                NodeList yearNodes = (NodeList) yearExpr.evaluate(doc, XPathConstants.NODESET);
                XPathExpression authorExpr = xpath.compile(AUTHOR_XPATH);
                NodeList authorNodes = (NodeList) authorExpr.evaluate(doc, XPathConstants.NODESET);

                if (yearNodes.getLength() == 0 || authorNodes.getLength() == 0) {
                    System.err.println("Not renaming study " + study
                            + ": PubMed record has no year or no authors");
                    continue;
                }

                String year = yearNodes.item(0).getTextContent();
                List<String> authors = new ArrayList<String>();
                for (int i = 0; i < authorNodes.getLength(); i++) {
                    authors.add(authorNodes.item(i).getTextContent());
                }
                study.setName(formatStudyName(authors, year));
            } catch (Exception e) {
                // Deliberately broad: one bad record (network, XML, empty ID list) must not
                // abort the whole batch. Report it instead of swallowing it silently.
                System.err.println("Not renaming study " + study + ": " + e);
            }
        }
    }

    /**
     * Builds the study title: "First et al, year" for more than two authors, otherwise all
     * author names joined by ", " followed by ", year".
     */
    private static String formatStudyName(List<String> authors, String year) {
        if (authors.size() > 2) {
            return authors.get(0) + " et al, " + year;
        }
        return StringUtils.join(authors, ", ") + ", " + year;
    }

    /**
     * Fetches and parses the PubMed XML record for the first ID in the given list.
     *
     * @param pubmed the PubMed IDs of a study; only the first entry is used
     * @return the parsed record
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws IOException if the record cannot be retrieved or read
     * @throws SAXException if the record is not well-formed XML
     */
    private Document getPubMedXML(PubMedIdList pubmed) throws ParserConfigurationException, IOException, SAXException {
        String id = pubmed.get(0).getId();
        String url = PubMedIDRetriever.PUBMED_API + "efetch.fcgi?db=pubmed&id=" + id + "&retmode=xml";

        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setValidating(false);
        dbf.setNamespaceAware(false);
        dbf.setIgnoringElementContentWhitespace(true);
        DocumentBuilder db = dbf.newDocumentBuilder();

        InputStream openStream = PubMedIDRetriever.openUrl(url);
        try {
            return db.parse(openStream);
        } finally {
            // One stream is opened per study; close it so the handles do not accumulate.
            openStream.close();
        }
    }

    /**
     * Removes every endpoint, adverse event and population characteristic that is measured in
     * fewer than two studies, both from those studies and from the domain.
     */
    private void removeMeasuredOnceOutcomeMeasures() {
        List<Variable> variables = new ArrayList<Variable>();
        variables.addAll(d_domain.getEndpoints());
        variables.addAll(d_domain.getAdverseEvents());
        variables.addAll(d_domain.getPopulationCharacteristics());

        for (Variable var : variables) {
            // Snapshot: the studies are modified below, and whether the domain's list is a
            // live view that reacts to such changes is not visible from here.
            List<Study> studies = new ArrayList<Study>(d_domain.getStudies(var));
            if (studies.size() >= 2) {
                continue;
            }
            for (Study study : studies) {
                StudyOutcomeMeasure<Variable> som = study.findStudyOutcomeMeasure(var);
                study.getStudyOutcomeMeasures().remove(som);
            }
            // The variable is in exactly one of these lists; removing from the others is a no-op.
            d_domain.getAdverseEvents().remove(var);
            d_domain.getEndpoints().remove(var);
            d_domain.getPopulationCharacteristics().remove(var);
        }
    }

    /**
     * Replaces each study's race population characteristic by the "Race (Taxonomic)"
     * characteristic, re-mapping the measured frequencies onto the new categories and leaving
     * a note that lists the old categories.
     *
     * @throws IllegalStateException if the domain has no "Race (Taxonomic)" characteristic
     */
    private void changeRaceEndpoints() {
        PopulationCharacteristic newChar =
                EntityUtil.findByName(d_domain.getPopulationCharacteristics(), TARGET_RACE_CHARACTERISTIC);
        if (newChar == null) {
            throw new IllegalStateException(
                    "Domain has no population characteristic named \"" + TARGET_RACE_CHARACTERISTIC + "\"");
        }
        ObservableList<String> newCats = ((CategoricalVariableType) newChar.getVariableType()).getCategories();

        for (Study study : d_domain.getStudies()) {
            StudyOutcomeMeasure<PopulationCharacteristic> oldSom = getPopulationChar(d_domain, study);
            if (oldSom == null) {
                continue;
            }
            if (newChar.equals(oldSom.getValue())) {
                // The race lookup also matches the target characteristic itself; a study that
                // already uses it needs no re-encoding (keeps repeated runs from piling up notes).
                continue;
            }
            ObservableList<String> oldCats =
                    ((CategoricalVariableType) oldSom.getValue().getVariableType()).getCategories();

            // Put the replacement at the position of the old measure; the old one is removed last.
            StudyOutcomeMeasure<PopulationCharacteristic> newSom = oldSom.clone();
            newSom.setValue(newChar);
            study.getStudyOutcomeMeasures().add(study.getStudyOutcomeMeasures().indexOf(oldSom), newSom);

            for (WhenTaken wt : oldSom.getWhenTaken()) {
                // The arms are visited with an extra trailing null entry.
                for (Arm arm : AffixedObservableList.createSuffixed(study.getArms(), (Arm) null)) {
                    FrequencyMeasurement m =
                            (FrequencyMeasurement) study.getMeasurement(oldSom.getValue(), arm, wt);
                    if (m == null) {
                        continue;
                    }
                    FrequencyMeasurement newFreq = new FrequencyMeasurement(newChar);
                    for (String oldCat : oldCats) {
                        setNewFreq(newCats, m, newFreq, oldCat);
                    }
                    study.setMeasurement(newSom, arm, wt, newFreq);
                }
            }

            newSom.getNotes().add(new Note(Source.MANUAL, "Re-encoded from: " + StringUtils.join(oldCats, ", ")));
            study.getStudyOutcomeMeasures().remove(oldSom);
        }
    }

    /**
     * Adds the count of one old category to the matching category of the new measurement.
     *
     * <p>An old category matches a new category when it starts with the new category's name,
     * ignoring case; if several match, the last one in {@code newCats} wins, and if none match
     * the count goes to "Other". An old category without a recorded count contributes nothing.
     *
     * @param newCats categories of the new characteristic
     * @param oldFreq measurement to read the old count from
     * @param newFreq measurement to accumulate into
     * @param oldCat category of the old characteristic to transfer
     */
    private void setNewFreq(ObservableList<String> newCats, FrequencyMeasurement oldFreq, FrequencyMeasurement newFreq, String oldCat) {
        // Look the count up under the category's real name; the lower-cased key is kept as a
        // fallback because that is the only key the previous implementation consulted.
        Integer oldCount = oldFreq.getFrequency(oldCat);
        if (oldCount == null) {
            oldCount = oldFreq.getFrequency(oldCat.toLowerCase());
        }
        if (oldCount == null) {
            return;
        }

        String target = FALLBACK_CATEGORY;
        for (String cat : newCats) {
            // Case-insensitive "starts with" that does not depend on the default locale.
            if (oldCat.regionMatches(true, 0, cat, 0, cat.length())) {
                target = cat;
            }
        }

        Integer current = newFreq.getFrequency(target);
        newFreq.setFrequency(target, (current == null ? 0 : current) + oldCount);
    }

    /**
     * Finds the first population characteristic of the study whose name starts with "race"
     * (case-insensitive).
     *
     * @param domainData unused
     * @param study the study to search
     * @return the matching study outcome measure, or {@code null} if the study has none
     */
    private StudyOutcomeMeasure<PopulationCharacteristic> getPopulationChar(Domain domainData, Study study) {
        for (StudyOutcomeMeasure<PopulationCharacteristic> popChar : study.getPopulationChars()) {
            if (popChar.getValue().getName().matches("(?i)race.*")) {
                return popChar;
            }
        }
        return null;
    }
}