/*******************************************************************************
* Copyright 2014 Virginia Polytechnic Institute and State University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package edu.vt.vbi.patric.common.xmlHandler;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that turns every {@code <DocSum>} element of the parsed document into one
 * {@link JSONObject} and collects the objects in a {@link JSONArray}.
 * <p>
 * Only {@code <Item Name="...">} children whose {@code Name} is one of the known field names are
 * captured. After a {@code DocSum} is closed, derived entries ({@code ID}, {@code dataType},
 * {@code expType}, {@code platform}, the {@code link_*} download URLs, {@code pubmed_id}) are
 * added to the object. The input is presumably an NCBI GEO eSummary response; that is inferred
 * from the element names and URLs only.
 * <p>
 * The handler keeps per-parse mutable state; it resets that state in {@link #startDocument()}.
 */
@SuppressWarnings("unchecked")
public class GEOHandler extends DefaultHandler {

    private static final String ELEMENT_DOCSUM = "DocSum";

    private static final String ELEMENT_ITEM = "Item";

    private static final String ATTR_NAME = "Name";

    private static final String ITEM_PUBMED_IDS = "PubMedIds";

    private static final String ITEM_INT = "int";

    private static final String FTP_BASE = "ftp://ftp.ncbi.nih.gov/pub/geo/DATA/";

    /** Items whose text is accumulated in a dedicated buffer and stored when the DocSum closes. */
    private static final String[] BUFFERED_FIELDS = { "title", "summary", "taxon", "gdsType", "PDAT" };

    /** Items whose text is stored in the DocSum object under the item's own name. */
    private static final String[] GENERIC_FIELDS = { "entryType", "GSE", "GPL", "GDS", "n_samples", "ptechType", "subsetInfo",
            "suppFile" };

    private JSONArray list = null;

    private JSONObject docsum = null;

    /** Name of the captured item currently open, or "" when text must be ignored. */
    private String currentElement = "";

    private boolean isReadingPubMedIds = false;

    /** Nesting depth of the currently open {@code <Item>} elements within the DocSum. */
    private int itemDepth = 0;

    /** Value of {@link #itemDepth} at which the open {@code PubMedIds} item lives. */
    private int pubMedIdsDepth = 0;

    private StringBuilder sbTitle = null;

    private StringBuilder sbTaxon = null;

    private StringBuilder sbType = null;

    private StringBuilder sbSummary = null;

    private StringBuilder sbPubMedID = null;

    private StringBuilder sbPDAT = null;

    /** Text collected so far for the open generic field or PubMed {@code int} item; null if none. */
    private StringBuilder sbValue = null;

    /**
     * Returns the array of parsed DocSum objects.
     *
     * @return the result array, or {@code null} if no document has been started yet
     */
    public JSONArray getParsedJSON() {
        return list;
    }

    /** Creates a fresh result array and clears any state left over from a previous parse. */
    @Override
    public void startDocument() throws SAXException {
        list = new JSONArray();
        docsum = null;
        resetItemState();
    }

    /**
     * Opens a new DocSum object, or selects which field (if any) the following character data
     * belongs to, based on the {@code Name} attribute of an {@code Item} element.
     */
    @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
        if (qName.equalsIgnoreCase(ELEMENT_DOCSUM)) {
            docsum = new JSONObject();
            sbTitle = new StringBuilder();
            sbTaxon = new StringBuilder();
            sbType = new StringBuilder();
            sbSummary = new StringBuilder();
            sbPubMedID = new StringBuilder();
            sbPDAT = new StringBuilder();
            // State must not leak from the previous DocSum into this one.
            resetItemState();
        }
        else if (qName.equalsIgnoreCase(ELEMENT_ITEM)) {
            // Text of an enclosing item that precedes this nested item is complete now.
            flushPendingValue();
            itemDepth++;

            // getValue() returns null when the attribute is absent; read it once and guard.
            String name = atts.getValue(ATTR_NAME);
            if (name == null) {
                currentElement = "";
            }
            else if (contains(BUFFERED_FIELDS, name)) {
                currentElement = name;
                isReadingPubMedIds = false;
            }
            else if (contains(GENERIC_FIELDS, name)) {
                currentElement = name;
                isReadingPubMedIds = false;
                sbValue = new StringBuilder();
            }
            else if (name.equals(ITEM_PUBMED_IDS)) {
                currentElement = "";
                isReadingPubMedIds = true;
                pubMedIdsDepth = itemDepth;
            }
            else if (name.equals(ITEM_INT)) {
                currentElement = name;
                // "int" items are only meaningful inside a PubMedIds list.
                if (isReadingPubMedIds) {
                    sbValue = new StringBuilder();
                }
            }
            else {
                currentElement = "";
            }
        }
    }

    /**
     * Commits the text of a closing {@code Item}, or finalizes the DocSum object and appends it to
     * the result array when a {@code DocSum} closes.
     */
    @Override
    public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
        if (qName.equalsIgnoreCase(ELEMENT_ITEM)) {
            flushPendingValue();
            // Text between this end tag and the next start tag belongs to no field.
            currentElement = "";
            // Stop collecting IDs once the PubMedIds item itself closes, so that "int" items of
            // any later list are not mistaken for PubMed IDs.
            if (isReadingPubMedIds && itemDepth == pubMedIdsDepth) {
                isReadingPubMedIds = false;
            }
            if (itemDepth > 0) {
                itemDepth--;
            }
        }
        else if (qName.equalsIgnoreCase(ELEMENT_DOCSUM)) {
            if (docsum != null) {
                finishDocSum();
            }
            resetItemState();
        }
    }

    /**
     * Appends character data to the buffer of the currently selected field. The parser may deliver
     * the text of one element in several calls, so values are accumulated here and only committed
     * when the element ends.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (docsum == null || currentElement.isEmpty()) {
            return;
        }
        if (currentElement.equals("title")) {
            sbTitle.append(ch, start, length);
        }
        else if (currentElement.equals("taxon")) {
            sbTaxon.append(ch, start, length);
        }
        else if (currentElement.equals("gdsType")) {
            sbType.append(ch, start, length);
        }
        else if (currentElement.equals("summary")) {
            sbSummary.append(ch, start, length);
        }
        else if (currentElement.equals("PDAT")) {
            sbPDAT.append(ch, start, length);
        }
        else if (sbValue != null) {
            // Generic field, or an "int" item inside PubMedIds.
            sbValue.append(ch, start, length);
        }
    }

    /** Clears everything that tracks the position inside a DocSum. */
    private void resetItemState() {
        currentElement = "";
        isReadingPubMedIds = false;
        itemDepth = 0;
        pubMedIdsDepth = 0;
        sbValue = null;
    }

    /**
     * Stores the text accumulated in {@link #sbValue}: a PubMed ID is appended (comma separated,
     * trimmed, blanks skipped) to the ID buffer; any other field is put into the DocSum object
     * under its own name, but only if some text was actually received.
     */
    private void flushPendingValue() {
        if (docsum != null && sbValue != null) {
            if (currentElement.equals(ITEM_INT)) {
                String id = sbValue.toString().trim();
                if (isReadingPubMedIds && !id.isEmpty()) {
                    if (sbPubMedID.length() > 0) {
                        sbPubMedID.append(",");
                    }
                    sbPubMedID.append(id);
                }
            }
            else if (!currentElement.isEmpty() && sbValue.length() > 0) {
                docsum.put(currentElement, sbValue.toString());
            }
        }
        sbValue = null;
    }

    /** Adds the buffered and derived entries to the open DocSum object and appends it to the list. */
    private void finishDocSum() {
        docsum.put("title", sbTitle.toString());
        docsum.put("taxon", sbTaxon.toString());
        docsum.put("summary", sbSummary.toString());

        // ID, data type, experiment type and download links depend on the entry type. Constant-first
        // equals() keeps a DocSum without an entryType item from raising a NullPointerException.
        Object entryType = docsum.get("entryType");
        boolean hasSuppFile = hasText(docsum.get("suppFile"));

        if ("GPL".equals(entryType)) {
            String id = "GPL" + docsum.get("GPL");
            docsum.put("ID", id);
            docsum.put("dataType", "Platform");
            docsum.put("expType", docsum.get("ptechType"));
            docsum.put("link_soft_format", FTP_BASE + "SOFT/by_platform/" + id + "/" + id + "_family.soft.gz");
            docsum.put("link_miniml_format", FTP_BASE + "MINiML/by_platform/" + id + "/" + id + "_family.xml.tgz");
            if (hasSuppFile) {
                docsum.put("link_supplementary", FTP_BASE + "supplementary/platforms/" + id + "/");
            }
        }
        else if ("GSE".equals(entryType)) {
            String id = "GSE" + docsum.get("GSE");
            docsum.put("ID", id);
            docsum.put("dataType", "Series");
            docsum.put("expType", sbType.toString());
            docsum.put("link_soft_format", FTP_BASE + "SOFT/by_series/" + id + "/" + id + "_family.soft.gz");
            docsum.put("link_miniml_format", FTP_BASE + "MINiML/by_series/" + id + "/" + id + "_family.xml.tgz");
            docsum.put("link_seriesmatrix_format", FTP_BASE + "SeriesMatrix/" + id + "/" + id + "_series_matrix.txt.gz");
            if (hasSuppFile) {
                docsum.put("link_supplementary", FTP_BASE + "supplementary/series/" + id + "/" + id + "_RAW.tar");
            }
        }
        else if ("GDS".equals(entryType)) {
            String id = "GDS" + docsum.get("GDS");
            docsum.put("ID", id);
            docsum.put("dataType", "Datasets");
            docsum.put("expType", sbType.toString());
            docsum.put("link_soft_format", FTP_BASE + "SOFT/GDS/" + id + ".soft.gz");
        }

        // "1;2" becomes "GPL1;GPL2"; a value without ';' simply gets the prefix.
        Object gpl = docsum.get("GPL");
        if (hasText(gpl)) {
            docsum.put("platform", "GPL" + gpl.toString().replace(";", ";GPL"));
        }

        docsum.put("pubmed_id", sbPubMedID.toString());
        docsum.put("PDAT", sbPDAT.toString());

        list.add(docsum);

        docsum = null;
        sbTitle = null;
        sbTaxon = null;
        sbType = null;
        sbSummary = null;
        sbPubMedID = null;
        sbPDAT = null;
    }

    /** Returns true if {@code value} is neither null nor equal to the empty string. */
    private static boolean hasText(Object value) {
        return value != null && !value.equals("");
    }

    /** Returns true if {@code names} contains {@code name} (case-sensitive). */
    private static boolean contains(String[] names, String name) {
        for (String candidate : names) {
            if (candidate.equals(name)) {
                return true;
            }
        }
        return false;
    }
}