/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package act.installer;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.events.XMLEvent;
import act.server.PubmedEntry;
/**
 * Base type for parsers that hand out their data one element at a time.
 */
abstract class IterativeParser {
/**
 * Returns the next element read from the underlying data source.
 * NOTE(review): the one subclass visible in this file returns {@code null} once all
 * data is consumed; presumably that is the general end-of-data contract -- confirm.
 */
abstract Object getNext(); // get data from the data source
}
/**
 * Iterates over the citations stored in a numbered series of Medline XML files named
 * {@code <DataDir> + "medline11n" + <4-digit index> + ".xml"}, producing one
 * {@link PubmedEntry} per {@code MedlineCitation} element.
 *
 * <p>Only one file is open at a time: the first is opened by the constructor and each
 * following one when the previous file's {@code MedlineCitationSet} element closes.
 * The reader and its underlying file stream are closed as soon as a file is exhausted.
 *
 * <p>Every I/O or format problem is reported on stderr and terminates the JVM through
 * {@code System.exit(-1)}; no exception escapes the public methods.
 *
 * <p>NOTE(review): the XML factory is used with its default configuration, so DTDs and
 * external entities are processed (the parser explicitly expects a DTD event). That is
 * only acceptable if the data files come from a trusted source -- confirm.
 */
public class PubmedParser extends IterativeParser {
  /** Smallest file number accepted by the constructors. */
  private static final int FIRST_FILE_INDEX = 1;
  /** Largest file number accepted by the constructors. */
  private static final int LAST_FILE_INDEX = 653;

  /** Prefix prepended verbatim to every file name; no path separator is inserted. */
  String DataDir;
  /** Number of the first file to read (inclusive). */
  int StartIndex;
  /** Number of the last file to read (inclusive). */
  int EndIndex;
  static String DataPrefix = "medline11n";
  static String DataSuffix = ".xml"; // the real medline data
  /** Number of the NEXT file to open; advanced every time a file is opened. */
  int currentIndex;
  /** Reader over the file currently being consumed, or null when all data has been read. */
  XMLStreamReader xml;
  /** Stream backing {@link #xml}; kept so it can be closed, which the reader itself does not do. */
  private FileInputStream currentStream;

  /**
   * Creates a parser over the complete file range [1, 653] and opens the first file.
   *
   * @param dataDir prefix of the data files; it is concatenated directly with the file
   *                name, so it has to end with a path separator if it names a directory
   */
  public PubmedParser(String dataDir) {
    this(dataDir, FIRST_FILE_INDEX, LAST_FILE_INDEX);
  }

  /**
   * Creates a parser over the files numbered {@code start} to {@code end} (inclusive)
   * and opens the first of them. An invalid range prints a message and exits the JVM.
   *
   * @param dataDir prefix of the data files (see {@link #PubmedParser(String)})
   * @param start   number of the first file, within [1, 653]
   * @param end     number of the last file, within [1, 653] and not below {@code start}
   */
  public PubmedParser(String dataDir, int start, int end) {
    if (start < FIRST_FILE_INDEX || start > LAST_FILE_INDEX
        || end < FIRST_FILE_INDEX || end > LAST_FILE_INDEX
        || start > end) {
      System.err.println("Start and end should be between [" + FIRST_FILE_INDEX + ", " + LAST_FILE_INDEX + "]");
      System.exit(-1);
    }
    this.StartIndex = start;
    this.EndIndex = end;
    this.currentIndex = StartIndex;
    this.xml = null;
    this.DataDir = dataDir;
    initCurrentXML();
  }

  /**
   * Returns the next citation, transparently moving on to the next data file when the
   * current one is exhausted.
   *
   * @return the next {@link PubmedEntry}, or {@code null} once every file in the range
   *         has been consumed (and on every later call)
   */
  @Override
  public Object getNext() {
    try {
      if (this.xml == null) {
        // Everything was consumed by an earlier call.
        return null;
      }

      if (!moveToNextEntry()) {
        // The current file is finished: release it before deciding what comes next.
        closeCurrentXML();
        if (this.currentIndex > this.EndIndex) {
          // That was the last file of the range.
          return null;
        }
        initCurrentXML();
        if (!moveToNextEntry()) {
          throw new CiderPubmedFormatException("Data file contains no entries?");
        }
      }

      // Positioned just after a <MedlineCitation> start tag: read the whole entry.
      return getNextDataElem();
    } catch (CiderPubmedFormatException ex) {
      ex.printStackTrace();
      System.err.println("Data elem parsing error.");
      System.exit(-1);
    } catch (XMLStreamException ex) {
      ex.printStackTrace();
      System.err.println("Could not step to next data piece.");
      System.exit(-1);
    }
    return null; // unreachable
  }

  /**
   * Advances the reader to the next event that is not whitespace.
   *
   * @return the event type the reader is now positioned on
   */
  private int nextNonWhitespaceEvent() throws XMLStreamException {
    int eventType = this.xml.next();
    while (this.xml.isWhiteSpace() || eventType == XMLEvent.SPACE) {
      eventType = this.xml.next();
    }
    return eventType;
  }

  /**
   * Steps onto the start tag of the next {@code MedlineCitation}.
   *
   * @return {@code true} if positioned on an entry start, {@code false} if the closing
   *         {@code MedlineCitationSet} tag was reached instead
   * @throws CiderPubmedFormatException if anything else is found
   */
  private boolean moveToNextEntry() throws XMLStreamException, CiderPubmedFormatException {
    int eventType = nextNonWhitespaceEvent();
    if (eventType == XMLEvent.END_ELEMENT && this.xml.getLocalName().equals("MedlineCitationSet")) {
      return false;
    }
    boolean atEntryStart =
        eventType == XMLEvent.START_ELEMENT && this.xml.getLocalName().equals("MedlineCitation");
    if (!atEntryStart) {
      throw new CiderPubmedFormatException("Expecting entry start");
    }
    return true;
  }

  /**
   * Opens the file numbered {@link #currentIndex}, consumes its DTD and the opening
   * {@code MedlineCitationSet} tag, and advances {@link #currentIndex}. Any failure is
   * reported on stderr and exits the JVM.
   */
  private void initCurrentXML() {
    if (this.xml != null) {
      System.err.println("init XML called when the previous XML was not finished yet.");
      System.exit(-1);
    }
    String currentfile = this.DataDir + PubmedParser.DataPrefix + String.format("%04d", this.currentIndex) + PubmedParser.DataSuffix;
    try {
      // The stream is kept in a field: closing the XMLStreamReader does not close it.
      this.currentStream = new FileInputStream(currentfile);
      this.xml = XMLInputFactory.newInstance().createXMLStreamReader(this.currentStream);
      expect(XMLEvent.DTD, null);
      expect(XMLEvent.START_ELEMENT, "MedlineCitationSet");
    } catch (XMLStreamException ex) {
      ex.printStackTrace();
      System.err.println("XML stream error on file:" + currentfile);
      System.exit(-1);
    } catch (FileNotFoundException e) {
      System.err.println("Could not find XML file: " + currentfile);
      System.exit(-1);
    } catch (CiderPubmedFormatException ex) {
      System.err.println("XML file does not start with DTD, MedlineCitationSet: " + currentfile);
      System.exit(-1);
    }
    this.currentIndex++;
  }

  /**
   * Closes the current reader and its file stream (best effort) and clears both fields.
   * Close failures are only reported, since the data has already been read.
   */
  private void closeCurrentXML() {
    if (this.xml != null) {
      try {
        this.xml.close();
      } catch (XMLStreamException ex) {
        System.err.println("Could not close XML reader: " + ex.getMessage());
      }
      this.xml = null;
    }
    if (this.currentStream != null) {
      try {
        this.currentStream.close();
      } catch (IOException ex) {
        System.err.println("Could not close XML file: " + ex.getMessage());
      }
      this.currentStream = null;
    }
  }

  /**
   * Reads one complete {@code MedlineCitation} element, whose start tag has already
   * been consumed, and converts it into a {@link PubmedEntry}.
   *
   * @return the entry built from a map of element path to the text found under it
   */
  private PubmedEntry getNextDataElem() throws XMLStreamException, CiderPubmedFormatException {
    List<String> tagStack = new ArrayList<String>();
    tagStack.add("MedlineCitation"); // the start_element has already been read through, so artificially add it
    HashMap<String, Object> data = new HashMap<String, Object>();
    // The stack runs empty exactly when the closing </MedlineCitation> has been read.
    do {
      readExceptedTags(data, tagStack);
    } while (!tagStack.isEmpty());
    HashMap<String, List<String>> allXML = makeStrLists(data);
    return new PubmedEntry(allXML);
  }

  /**
   * Consumes the next non-whitespace event and checks it against the expectation.
   *
   * @param expType required event type (an {@code XMLEvent} constant)
   * @param expTag  required local name, or {@code null} to skip the name check
   * @throws CiderPubmedFormatException if the type or name does not match
   * @throws XMLStreamException         if the reader fails; a hint is printed first
   */
  private void expect(int expType, String expTag) throws XMLStreamException, CiderPubmedFormatException {
    try {
      int eventType = nextNonWhitespaceEvent();
      if (eventType != expType) {
        throw new CiderPubmedFormatException(expType + " entry expected. Not found.");
      }
      if (expTag != null && !this.xml.getLocalName().equals(expTag)) {
        throw new CiderPubmedFormatException(expTag + " entry expected. Not found.");
      }
    } catch (XMLStreamException ex) {
      System.err.println("NOTE: This error could be because you don't have a net connection that can fetch the right DTD schema.");
      throw ex;
    }
  }

  /**
   * Collapses a value that is either a String or a list of Strings into one String.
   *
   * @param obj   a String, a {@code List<String>}, or {@code null}
   * @param delim separator placed between list elements
   * @return the joined text; {@code null} if {@code obj} is {@code null} or a list that
   *         yields no leading non-null element
   */
  private String flatten(Object obj, String delim) {
    if (obj == null) {
      // sometimes no elements are present: return null
      return null;
    }
    if (!(obj instanceof List)) {
      return (String) obj;
    }
    // Lists stored in the data map are only ever created with String elements.
    @SuppressWarnings("unchecked")
    List<String> pieces = (List<String>) obj;
    StringBuilder joined = null;
    for (String piece : pieces) {
      if (joined == null) {
        // Nothing accumulated yet; a null piece leaves the result unset.
        if (piece != null) {
          joined = new StringBuilder(piece);
        }
      } else {
        joined.append(delim).append(piece);
      }
    }
    return joined == null ? null : joined.toString();
  }

  /**
   * Applies {@link #flatten(Object, String)} to every value of the map.
   *
   * @return a new map with the same keys and flattened values
   */
  private HashMap<String, String> flattenAll(HashMap<String, Object> data, String delim) {
    HashMap<String, String> flat = new HashMap<String, String>();
    for (String k : data.keySet()) {
      flat.put(k, flatten(data.get(k), delim));
    }
    return flat;
  }

  /**
   * Normalizes the parsed data so that every value is a list of Strings: single Strings
   * are wrapped, lists and nulls are passed through. A value of any other type prints
   * an error and exits the JVM.
   */
  private HashMap<String, List<String>> makeStrLists(HashMap<String, Object> data) {
    HashMap<String, List<String>> typeCasted = new HashMap<String, List<String>>();
    for (String k : data.keySet()) {
      Object obj = data.get(k);
      if (obj == null) { // sometimes no elements are present
        typeCasted.put(k, null);
      } else if (obj instanceof List) {
        // Lists stored in the data map are only ever created with String elements.
        @SuppressWarnings("unchecked")
        List<String> existing = (List<String>) obj;
        typeCasted.put(k, existing);
      } else if (obj instanceof String) {
        List<String> single = new ArrayList<String>();
        single.add((String) obj);
        typeCasted.put(k, single);
      } else {
        System.err.println("Found data element of unknown type: " + obj);
        System.exit(-1);
      }
    }
    return typeCasted;
  }

  /**
   * Processes one non-whitespace event of the current entry: start tags are pushed on
   * {@code tagStack} (top at index 0), end tags must match the top and pop it, and text
   * is recorded in {@code data} under the path described by the stack.
   *
   * @return the tag name for start/end events, {@code null} for text
   * @throws CiderPubmedFormatException on a mismatched end tag or any other event type
   */
  private String readExceptedTags(HashMap<String, Object> data, List<String> tagStack) throws XMLStreamException, CiderPubmedFormatException {
    String tag;
    int eventType = nextNonWhitespaceEvent();
    switch (eventType) {
      case XMLEvent.START_ELEMENT:
        // push onto stack
        tag = this.xml.getLocalName();
        tagStack.add(0, tag);
        return tag;
      case XMLEvent.END_ELEMENT:
        // pop stack
        tag = this.xml.getLocalName();
        if (tagStack.isEmpty()) {
          throw new CiderPubmedFormatException("Tag end mismatch: (empty) " + " vs " + tag);
        }
        if (!tagStack.get(0).equals(tag)) {
          throw new CiderPubmedFormatException("Tag end mismatch: " + tagStack.get(0) + " vs " + tag);
        }
        tagStack.remove(0);
        return tag;
      case XMLEvent.CHARACTERS:
        String txt = this.xml.getText();
        addToMap(data, txt, pathFromStack(tagStack));
        return null; // do not return a tag indicator for text
      case XMLEvent.COMMENT:
        throw new CiderPubmedFormatException("Pubmed contains XML comments? Not expected here.");
      case XMLEvent.START_DOCUMENT:
      case XMLEvent.END_DOCUMENT:
        throw new CiderPubmedFormatException("Start/End_Document not expected here.");
      case XMLEvent.ENTITY_REFERENCE:
      case XMLEvent.ATTRIBUTE:
      case XMLEvent.PROCESSING_INSTRUCTION:
        throw new CiderPubmedFormatException("Processing instr/Entity reference/Attribute not expected here.");
      case XMLEvent.DTD:
        throw new CiderPubmedFormatException("DTD not expected here.");
      case XMLEvent.CDATA:
        throw new CiderPubmedFormatException("CDATA not expected here.");
      case XMLEvent.SPACE:
        throw new CiderPubmedFormatException("SPACE not expected here.");
    }
    throw new CiderPubmedFormatException("Unknown tag seen!");
  }

  /**
   * Records {@code txt} under {@code path}. The first text for a path is stored as a
   * plain String; a second one promotes the value to a list, and later ones are
   * appended to that list.
   *
   * @throws CiderPubmedFormatException if the existing value is neither String nor List
   */
  private void addToMap(HashMap<String, Object> map, String txt, String path) throws CiderPubmedFormatException {
    if (!map.containsKey(path)) {
      // simple case, when unique string for path.
      map.put(path, txt);
      return;
    }

    Object old = map.get(path);
    if (old instanceof List) {
      // Already a list: it is stored by reference, so appending is enough.
      @SuppressWarnings("unchecked")
      List<String> existing = (List<String>) old;
      existing.add(txt);
    } else if (old instanceof String) {
      // Second value for this path: promote the single String to a list.
      List<String> promoted = new ArrayList<String>();
      promoted.add((String) old);
      promoted.add(txt);
      map.put(path, promoted);
    } else {
      throw new CiderPubmedFormatException("Something other than a String or [String], not possible");
    }
  }

  /**
   * Renders the tag stack (top at index 0) as a slash-separated path from the outermost
   * tag to the innermost, e.g. {@code MedlineCitation/Article/ArticleTitle}.
   *
   * @param stk non-empty tag stack; an empty one raises IndexOutOfBoundsException
   */
  private String pathFromStack(List<String> stk) {
    // The outermost tag sits at the end of the list, so walk it backwards.
    StringBuilder path = new StringBuilder(stk.get(stk.size() - 1));
    for (int i = stk.size() - 2; i >= 0; i--) {
      path.append('/').append(stk.get(i));
    }
    return path.toString();
  }

  /**
   * Debug helper: advances the reader by one event and prints a description of it to
   * stdout. Names are printed only for the event types on which the reader allows
   * {@code getLocalName()} (elements and entity references); calling it on any other
   * event raises IllegalStateException.
   */
  private void getNextTagData() throws XMLStreamException {
    int eventType = this.xml.next();
    switch (eventType) {
      case XMLEvent.START_ELEMENT:
        System.out.println("START_ELEMENT: " + this.xml.getLocalName());
        return;
      case XMLEvent.END_ELEMENT:
        System.out.println("END_ELEMENT: " + this.xml.getLocalName());
        return;
      case XMLEvent.PROCESSING_INSTRUCTION:
        // A processing instruction has a target rather than a local name.
        System.out.println("PROCESSING_INSTRUCTION: " + this.xml.getPITarget());
        return;
      case XMLEvent.CHARACTERS:
        System.out.println("CHARACTERS: " + this.xml.getText());
        return;
      case XMLEvent.COMMENT:
        System.out.println("COMMENT: " + this.xml.getText());
        return;
      case XMLEvent.START_DOCUMENT:
        System.out.println("START_DOCUMENT");
        return;
      case XMLEvent.END_DOCUMENT:
        System.out.println("END_DOCUMENT");
        return;
      case XMLEvent.ENTITY_REFERENCE:
        System.out.println("ENTITY_REFERENCE: " + this.xml.getLocalName());
        return;
      case XMLEvent.ATTRIBUTE:
        System.out.println("ATTRIBUTE");
        return;
      case XMLEvent.DTD:
        System.out.println("DTD");
        return;
      case XMLEvent.CDATA:
        System.out.println("CDATA");
        return;
      case XMLEvent.SPACE:
        System.out.println("SPACE");
        return;
    }
    System.out.println("Something else...");
  }

  /**
   * Debug helper: prints every remaining event of the current file to stdout.
   *
   * @return always {@code null}
   */
  private PubmedEntry readEntireFileToScreen() throws XMLStreamException {
    while (this.xml.hasNext()) {
      getNextTagData();
    }
    return null;
  }
}
/**
 * Formats textual references to entries of external databases.
 */
class KnownDBs {
  /** The databases a reference can point into. */
  public enum Names {PMID, GenBank, RefSeq}

  /**
   * Builds the reference string for one database entry, in the form
   * {@code #<database name>(<id>)}, e.g. {@code #PMID(12345)}.
   *
   * @param db database the entry belongs to
   * @param id numeric identifier of the entry within that database
   * @return the formatted reference
   */
  static String GetDBRef(Names db, int id) {
    StringBuilder ref = new StringBuilder("#");
    ref.append(db.name());
    ref.append('(').append(id).append(')');
    return ref.toString();
  }
}