/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
/**
* The main handler for the medline documents. Handles the top level node
* <MedlineCitation>
*
* @author Ariel Schwartz
* @author Gaurav Bhalotia
*
*/
package org.erasmusmc.dataimport.Medline.xmlparsers.medline;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Types;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.erasmusmc.dataimport.Medline.xmlparsers.GenericXMLParser;
import org.erasmusmc.dataimport.Medline.xmlparsers.NodeHandler;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
public class MedlineCitation extends NodeHandler {
/* PMID of the citation being parsed; read lazily from the "MedlineCitation.PMID" column value in startElement */
private String pmid;
/* Running position handed to each Author child handler; incremented once per Author element */
private int authorPlace = 0;
/* When true, a duplicate-key failure on insert is answered by deleting the old row and inserting again */
private static boolean isIncremental = true;
/* Three-letter English month abbreviations, index 0 = "Jan" ... index 11 = "Dec" */
private List<String> months = getMonths();

/**
 * Builds the list of three-letter English month abbreviations in calendar order.
 *
 * @return a new mutable list holding "Jan" through "Dec"
 */
private static List<String> getMonths(){
    String[] abbreviations = {
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
    };
    List<String> result = new ArrayList<String>(12);
    for (String abbreviation : abbreviations) {
        result.add(abbreviation);
    }
    return result;
}
/**
 * Creates the handler for the top level {@code <MedlineCitation>} node and
 * configures the table name, the column names, the XML element paths and the
 * SQL types used to store a citation.
 *
 * @param xmlFileName The name of the file that is being parsed, to be stored in the
 * record for medline_citation
 * @throws Exception propagated from the initialisation of the handler
 */
public MedlineCitation(String xmlFileName) throws Exception {
    ignoreDuplicateKeyError = true;

    /* The table in which the parsed entries are to be entered from this node */
    tableName = "medline_citation";

    /* The node name being handled by this class */
    xmlNodeName = "MedlineCitation";

    //Note: If anything is added to the MedlineCitation node, you have to change four things:
    //1. Add a field to the table in the database
    //2-4. Change the three arrays below (they are matched up by position)

    /* The various column names in the table */
    columnName = new String[] {
        "pmid", "date_created", "date_completed", "date_revised",
        "issn", "volume", "issue",
        "pub_date_year", "pub_date_month", "pub_date_day", "pub_date_season", "medline_date",
        "journal_print_yn", "coden", "journal_title", "iso_abbreviation",
        "article_title", "start_page", "end_page", "medline_pgn",
        "abstract_text", "copyright_info", "article_affiliation",
        "article_author_list_comp_yn", "data_bank_list_comp_yn", "grantlist_complete_yn",
        "vernacular_title", "date_of_electronic_publication", "elec_pub_official_date_yn",
        "country", "medline_ta", "nlm_unique_id", "xml_file_name",
        "number_of_references", "keyword_list_owner", "citation_owner",
        "article_date_day", "article_date_month", "article_date_year",
        "cited_medium", "issn_type", "pub_model", "article_date_type", "citation_status",
        "elocationid", "elocationid_eidtype", "elocationid_validyn",
        "pub_date", "issn_linking"
    };

    /* The corresponding XML tags, The tags names starts from the current xmlnode */
    xmlElementName = new String[] {
        "MedlineCitation.PMID",
        "MedlineCitation.DateCreated",
        "MedlineCitation.DateCompleted",
        "MedlineCitation.DateRevised",
        "MedlineCitation.Article.Journal.ISSN",
        "MedlineCitation.Article.Journal.JournalIssue.Volume",
        "MedlineCitation.Article.Journal.JournalIssue.Issue",
        "MedlineCitation.Article.Journal.JournalIssue.PubDate.Year",
        "MedlineCitation.Article.Journal.JournalIssue.PubDate.Month",
        "MedlineCitation.Article.Journal.JournalIssue.PubDate.Day",
        "MedlineCitation.Article.Journal.JournalIssue.PubDate.Season",
        "MedlineCitation.Article.Journal.JournalIssue.PubDate.MedlineDate",
        "MedlineCitation.Article.Journal.JournalIssue.PrintYN",
        "MedlineCitation.Article.Journal.Coden",
        "MedlineCitation.Article.Journal.Title",
        "MedlineCitation.Article.Journal.ISOAbbreviation",
        "MedlineCitation.Article.ArticleTitle",
        "MedlineCitation.Article.Pagination.StartPage",
        "MedlineCitation.Article.Pagination.EndPage",
        "MedlineCitation.Article.Pagination.MedlinePgn",
        "MedlineCitation.Article.Abstract.AbstractText",
        "MedlineCitation.Article.Abstract.CopyrightInformation",
        "MedlineCitation.Article.Affiliation",
        "MedlineCitation.Article.AuthorList.CompleteYN",
        "MedlineCitation.Article.DataBankList.CompleteYN",
        "MedlineCitation.Article.GrantList.CompleteYN",
        "MedlineCitation.Article.VernacularTitle",
        "MedlineCitation.Article.ElectronicPubDate",
        "MedlineCitation.Article.ElectronicPubDate.OfficialDateYN",
        "MedlineCitation.MedlineJournalInfo.Country",
        "MedlineCitation.MedlineJournalInfo.MedlineTA",
        "MedlineCitation.MedlineJournalInfo.NlmUniqueID",
        "XmlFileName",
        "MedlineCitation.NumberOfReferences",
        "MedlineCitation.KeywordList.Owner",
        "MedlineCitation.Owner",
        "MedlineCitation.Article.ArticleDate.Day",
        "MedlineCitation.Article.ArticleDate.Month",
        "MedlineCitation.Article.ArticleDate.Year",
        "MedlineCitation.Article.Journal.JournalIssue.CitedMedium",
        "MedlineCitation.Article.Journal.ISSN.IssnType",
        "MedlineCitation.Article.PubModel",
        "MedlineCitation.Article.ArticleDate.DateType",
        "MedlineCitation.Status",
        "MedlineCitation.Article.ELocationID",
        "MedlineCitation.Article.ELocationID.EIdType",
        "MedlineCitation.Article.ELocationID.ValidYN",
        "MedlineCitation.Article.Journal.JournalIssue.PubDate",
        "MedlineCitation.MedlineJournalInfo.ISSNLinking"
    };

    /* The SQL types for the various columns above */
    columnType = new int[] {
        /* pmid .. date_revised */
        Types.INTEGER, Types.DATE, Types.DATE, Types.DATE,
        /* issn, volume, issue */
        Types.CHAR, Types.VARCHAR, Types.VARCHAR,
        /* pub_date_year .. medline_date */
        Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR,
        /* journal_print_yn .. iso_abbreviation */
        Types.CHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR,
        /* article_title .. medline_pgn */
        Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR,
        /* abstract_text, copyright_info, article_affiliation */
        Types.CLOB, Types.VARCHAR, Types.VARCHAR,
        /* the three list-complete flags */
        Types.CHAR, Types.CHAR, Types.CHAR,
        /* vernacular_title, date_of_electronic_publication, elec_pub_official_date_yn */
        Types.VARCHAR, Types.DATE, Types.CHAR,
        /* country .. xml_file_name */
        Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR,
        /* number_of_references, keyword_list_owner, citation_owner */
        Types.INTEGER, Types.VARCHAR, Types.VARCHAR,
        /* article_date_day .. article_date_year */
        Types.VARCHAR, Types.VARCHAR, Types.VARCHAR,
        /* cited_medium .. citation_status */
        Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR,
        /* elocationid, elocationid_eidtype, elocationid_validyn */
        Types.VARCHAR, Types.VARCHAR, Types.VARCHAR,
        /* pub_date, issn_linking */
        Types.VARCHAR, Types.VARCHAR
    };

    initialize();

    /* Add any data that does not come through XML to the hashtable */
    putColumnValue("XmlFileName", xmlFileName);
}
/**
 * The method to handle the event when a new element is found, this is overwriting
 * the method defined in the super class NodeHandler.java
 *
 * While a current element is set, the method
 * <ol>
 * <li>reads the PMID once from the "MedlineCitation.PMID" column value,</li>
 * <li>deletes the rows already stored for this PMID when one of the list elements
 * (AuthorList, ChemicalList, ...) starts, so that an update replaces them,</li>
 * <li>hands the element over to a dedicated child handler when one exists, and
 * otherwise lets the super class deal with it.</li>
 * </ol>
 *
 * @throws SAXException if anything fails while deleting old rows or creating the
 * child handler; the original exception is attached as the cause
 */
@Override
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
    try {
        if (currentElement == null) {
            super.startElement(namespaceURI, localName, qName, atts);
            return;
        }

        if (pmid == null)
            pmid = getColumnValue("MedlineCitation.PMID");

        /* DELETE data for PMID on update if exists */
        String listDeleteSql = deleteSqlForList(qName);
        if (listDeleteSql != null)
            deleteExistingRows(listDeleteSql);

        /* Take decisions based on the element found, if it needs to be handled
         * by a child handler then instantiate an object for the same and set the handler
         * else call the handler from the super class
         */
        NodeHandler handler = null;
        if (currentElement.equals("AuthorList") && qName.equals("Author")) {
            handler = new Author(pmid, authorPlace++);
        }
        else if (currentElement.equals("AccessionNumberList") && qName.equals("AccessionNumber")) {
            String dataBankName = getColumnValue("MedlineCitation.Article.DataBankList.DataBank.DataBankName");
            handler = new DataBank(pmid, dataBankName);
        }
        else if (currentElement.equals("ChemicalList") && qName.equals("Chemical")) {
            handler = new Chemical(pmid);
        }
        else if (currentElement.equals("GeneSymbolList") && qName.equals("GeneSymbol")) {
            handler = new GeneSymbol(pmid);
        }
        else if (currentElement.equals("KeywordList") && qName.equals("Keyword")) {
            handler = new Keyword(pmid);
        }
        else if (currentElement.equals("PublicationTypeList") && qName.equals("PublicationType")) {
            handler = new ArticlePublicationType(pmid);
        }
        else if (currentElement.equals("GrantList") && qName.equals("Grant")) {
            handler = new Grant(pmid);
        }
        else if (currentElement.equals("MeshHeadingList") && qName.equals("MeshHeading")) {
            handler = new MeshHeading(pmid);
        }
        else if (currentElement.equals("CommentsCorrectionsList")) {
            /* DELETE Corrections for PMID on update if exists */
            deleteExistingRows("DELETE FROM medline_comments_corrections where pmid = ?");
            handler = new CommentsCorrections(pmid, qName);
        }
        else if (qName.equals("CitationSubset")) {
            handler = new CitationSubset(pmid);
        }
        else if (qName.equals("Language")) {
            handler = new ArticleLanguage(pmid);
        }
        else if (currentElement.equals("PersonalNameSubjectList") && qName.equals("PersonalNameSubject")) {
            handler = new PersonalNameSubject(pmid);
        }
        else if (qName.equals("OtherID")) {
            handler = new OtherID(pmid);
        }
        else if (qName.equals("OtherAbstract")) {
            handler = new OtherAbstract(pmid);
        }
        else if (qName.equals("SpaceFlightMission")) {
            handler = new SpaceFlightMission(pmid);
        }
        else if (currentElement.equals("InvestigatorList") && qName.equals("Investigator")) {
            handler = new Investigator(pmid);
        }
        else if (qName.equals("GeneralNote")) {
            handler = new GeneralNote(pmid);
        }

        if (handler != null)
            setContentHandler(handler, namespaceURI, localName, qName, atts);
        else
            super.startElement(namespaceURI, localName, qName, atts);
    } catch (Exception e) {
        e.printStackTrace();
        /* Keep the original exception as the cause so the real failure is not lost */
        throw new SAXException("Problem creating Child. PMID: " + pmid + " for element " + qName, e);
    }
}

/**
 * Maps a list element name to the statement that removes the rows previously
 * stored for the current PMID in the table belonging to that list.
 *
 * @param qName the qualified name of the element that has just started
 * @return the parameterised DELETE statement, or null if the element is not one of the handled lists
 */
private static String deleteSqlForList(String qName) {
    if (qName.equals("AuthorList"))
        return "DELETE FROM medline_author where pmid = ?";
    if (qName.equals("ChemicalList"))
        return "DELETE FROM medline_chemical_list where pmid = ?";
    if (qName.equals("DataBankList"))
        return "DELETE FROM medline_data_bank where pmid = ?";
    if (qName.equals("GrantList"))
        return "DELETE FROM medline_grant where pmid = ?";
    if (qName.equals("KeywordList"))
        return "DELETE FROM medline_keyword_list where pmid = ?";
    if (qName.equals("MeshHeadingList"))
        return "DELETE FROM medline_mesh_heading where pmid = ?";
    if (qName.equals("PersonalNameSubjectList"))
        return "DELETE FROM medline_personal_name_subject where pmid = ?";
    return null;
}

/**
 * Runs a DELETE statement whose single parameter is the current PMID.
 * The statement is closed even when binding or execution fails.
 *
 * @param deleteSql a DELETE statement with exactly one "?" placeholder for the PMID
 * @throws SQLException if the statement cannot be prepared, executed or closed
 */
private void deleteExistingRows(String deleteSql) throws SQLException {
    PreparedStatement deleteStatement = GenericXMLParser.getDbConnection().prepareStatement(deleteSql);
    try {
        deleteStatement.setInt(1, Integer.parseInt(pmid));
        deleteStatement.executeUpdate();
    } finally {
        deleteStatement.close();
    }
}
/**
 * Extends the endElement method from the super class NodeHandler.
 * When a PubDate element ends, the combined publication date is computed and stored first.
 * When the medline citation itself has ended, the values found, including all the
 * child handlers, are flushed to the database.
 *
 * @throws SAXException if updating the database fails; the original exception is
 * attached as the cause
 */
@Override
public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
    if (qName.equals("PubDate"))
        storePubDate();
    super.endElement(namespaceURI, localName, qName);
    if (qName.equals(xmlNodeName)) {
        try {
            updateDB();
        } catch (Exception e) {
            e.printStackTrace();
            /* Keep the original exception as the cause so the real failure is not lost */
            throw new SAXException("problem updating the database", e);
        }
    }
}
/**
 * Combines the Year, Month, Day and MedlineDate values collected for the PubDate
 * element into a single date string and stores it under the key
 * "MedlineCitation.Article.Journal.JournalIssue.PubDate" in the column values.
 * If no date can be derived, a message is written to standard error and nothing is stored.
 */
private void storePubDate() {
    String pubYear = (String)columnValues.get("MedlineCitation.Article.Journal.JournalIssue.PubDate.Year");
    String pubMonth = (String)columnValues.get("MedlineCitation.Article.Journal.JournalIssue.PubDate.Month");
    String pubDay = (String)columnValues.get("MedlineCitation.Article.Journal.JournalIssue.PubDate.Day");
    String medlineDate = (String)columnValues.get("MedlineCitation.Article.Journal.JournalIssue.PubDate.MedlineDate");

    String combined = parseDate(pubYear, pubMonth, pubDay, medlineDate);
    if (combined != null) {
        columnValues.put("MedlineCitation.Article.Journal.JournalIssue.PubDate", combined);
        return;
    }
    System.err.println("No valid publication date for PMID " + pmid);
}
/* A four digit year in the 1900s or 2000s */
private static final Pattern yearPattern = Pattern.compile("(19|20)[0-9][0-9]");

/**
 * Builds a "year-month-day" string (no zero padding is added) from the parts of a
 * publication date.
 *
 * The year is yearString if present, otherwise the first four digit year found in
 * medlineString. The month is taken from monthString if present, otherwise from the
 * first month abbreviation found in medlineString; when no month can be determined
 * it defaults to 1. The day is dayString if present, otherwise 1.
 *
 * @param yearString the explicit year, may be null
 * @param monthString the explicit month, either a three-letter abbreviation such as
 * "Mar" or a number from 1 to 12, may be null
 * @param dayString the explicit day, may be null
 * @param medlineString free text date such as "1998 Dec-1999 Jan", may be null
 * @return the date string, or null if no year can be determined
 */
private String parseDate(String yearString, String monthString, String dayString, String medlineString) {
    String year = yearString;
    if (year == null) {
        if (medlineString == null)
            return null;
        Matcher matcher = yearPattern.matcher(medlineString);
        /* Without a year there is no usable date; do not emit "null-1-1" */
        if (!matcher.find())
            return null;
        year = matcher.group();
    }

    int month = 0;
    if (monthString != null)
        month = monthNumber(monthString);
    else if (medlineString != null)
        month = firstMonthIn(medlineString);
    /* Unknown month: fall back to January, as is done when no month is given at all */
    if (month == 0)
        month = 1;

    String day = dayString == null ? "1" : dayString;
    return year + "-" + month + "-" + day;
}

/**
 * Translates an explicit month value into a month number.
 *
 * @param monthString a month abbreviation, a number from 1 to 12, or text containing an abbreviation
 * @return the month number from 1 to 12, or 0 if the value is not recognised
 */
private int monthNumber(String monthString) {
    int index = months.indexOf(monthString);
    if (index >= 0)
        return index + 1;
    try {
        int numeric = Integer.parseInt(monthString.trim());
        if (numeric >= 1 && numeric <= 12)
            return numeric;
    } catch (NumberFormatException ignored) {
        /* Not a number; try to find an abbreviation inside the text below */
    }
    return firstMonthIn(monthString);
}

/**
 * Finds the month abbreviation that occurs earliest in the given text, so that a
 * range such as "1998 Dec-1999 Jan" yields December, which belongs to the first year.
 *
 * @param text the text to search
 * @return the month number from 1 to 12, or 0 if the text contains no month abbreviation
 */
private int firstMonthIn(String text) {
    int bestMonth = 0;
    int bestPosition = Integer.MAX_VALUE;
    for (int i = 0; i < months.size(); i++) {
        int position = text.indexOf(months.get(i));
        if (position >= 0 && position < bestPosition) {
            bestPosition = position;
            bestMonth = i + 1;
        }
    }
    return bestMonth;
}
/**
 * Handles SQLException. Should be overloaded by inheriting classes to handle special cases
 *
 * A duplicate key error is treated as handled when duplicate key errors are to be
 * ignored. In incremental mode the existing medline_citation row for the current PMID
 * is deleted and the insert is executed once more. If that is not possible, or when
 * not in incremental mode, the citation is counted as not inserted and its children
 * are not updated.
 *
 * @param e the exception raised by the insert
 * @return true if the exception has been handled, false otherwise
 */
@Override
protected boolean handleSQLException(SQLException e) {
    if (!ignoreDuplicateKeyError || e.getErrorCode() != DB2_DUPLICATE_ERROR) {
        System.err.println("ERROR CODE == " + e.getErrorCode());
        return false;
    }

    if (isIncremental) {
        try {
            PreparedStatement deleteStatement = GenericXMLParser.getDbConnection().prepareStatement("DELETE FROM medline_citation where pmid = ?");
            try {
                deleteStatement.setInt(1, Integer.parseInt(pmid));
                deleteStatement.executeUpdate();
            } finally {
                /* Close the statement even when the delete itself fails */
                deleteStatement.close();
            }
            /* Execute the insert again. Note this could cause problems in multithreaded implementation */
            pstmt.executeUpdate();
            return true;
        } catch (SQLException retryFailure) {
            /* Doesn't work again so give up and report below */
        }
    }

    /* Either the retry failed, or the tuple for this primary key has already been inserted */
    recordSkippedCitation();
    return true;
}

/**
 * Counts a citation that was not inserted, reports the running total on every
 * 500th occurrence and prevents the child handlers from being updated.
 */
private void recordSkippedCitation() {
    MedlineParser.eCount++;
    if (MedlineParser.eCount % 500 == 0) {
        System.out.println("Total " + MedlineParser.eCount + " Values not inserted");
    }
    updateChildren = false;
}
}