/* Copyright (C) 2002-2003 Morten O. Alver & Nizar N. Batada All programs in this directory and subdirectories are published under the GNU General Public License as described below. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Further information about the GNU GPL is available at: http://www.gnu.org/copyleft/gpl.ja.html */ package net.sf.jabref.imports; import java.util.ArrayList; import java.util.Iterator; import java.util.TreeSet; import net.sf.jabref.BibtexEntry; import net.sf.jabref.Globals; import net.sf.jabref.Util; import org.xml.sax.Attributes; import org.xml.sax.helpers.DefaultHandler; public class MedlineHandler extends DefaultHandler { ArrayList<BibtexEntry> bibitems= new ArrayList<BibtexEntry>(); boolean inTitle=false, inYear = false, inJournal = false, inMonth = false, inVolume = false, inAuthorList = false, inAuthor =false, inLastName = false, inSuffix = false, inInitials = false, inMedlinePgn = false, inMedlineID = false, inURL=false, inIssue = false, inPubDate = false, inUrl=false, inForename=false, inAbstractText=false, inMedlineDate=false, inPubMedID=false, inDescriptorName=false,inDoi=false,inPii=false, inAffiliation=false, inMeshHeader=false, inQualifierName=false, inLanguage=false, inPst=false; String title="", journal="", keywords ="",author="", lastName="",suffix="",year="",forename="", abstractText="", affiliation=""; String month="",volume="",lastname="",initials="",number="",page="",medlineID="",url="",MedlineDate=""; String series="",editor="",booktitle="",type="article",key="",address="", pubmedid="",doi="",pii="", majorTopic = "", minorTopics = "", language = "", pst= ""; ArrayList<String> authors=new ArrayList<String>(); TreeSet<String> descriptors = new TreeSet<String>(); // To gather keywords int rowNum=0; private static final String KEYWORD_SEPARATOR = "; "; public ArrayList<BibtexEntry> getItems(){ return bibitems;} public MedlineHandler(){ super(); } public void startElement(String uri, String localName, String qName, Attributes atts) { // public void startElement(String localName, Attributes atts) { // Get the number of attribute if(localName.equals("PubmedArticle")){} else if(localName.equals("ArticleTitle")){ inTitle=true; title="";} else if(localName.equals("PubDate")){inPubDate=true;} else if(localName.equals("Year") && inPubDate==true){inYear=true;} else if( localName.equals("MedlineDate") && inPubDate==true){inMedlineDate=true;} // medline date does not have 4 digit dates instead it has multiyear etc else if(localName.equals("MedlineTA")){inJournal=true;journal="";} //journal name else if(localName.equals("Month") && inPubDate==true){inMonth=true;} else if(localName.equals("Volume")){inVolume=true;} else if(localName.equals("Language")){inLanguage=true;} else if(localName.equals("PublicationStatus")){inPst=true;} else if(localName.equals("AuthorList")){ inAuthorList=true; authors.clear();} else if (localName.equals("MeshHeading")) { inMeshHeader = true; majorTopic = ""; minorTopics = ""; } else if(localName.equals("DescriptorName")){ inDescriptorName=true; } else if (localName.equals("QualifierName")) { inQualifierName=true; } else if(localName.equals("Author")){inAuthor=true;author="";} else if(localName.equals("CollectiveName")){inForename=true;forename="";} // Morten A. 20040513. else if(localName.equals("PMID")){ // Set PMID only once, because there can be <CommentIn> tags later on that // contain IDs of different articles. if (pubmedid.length() == 0) { inPubMedID=true; pubmedid=""; } } else if(localName.equals("LastName")){inLastName=true; lastName="";} else if(localName.equals("ForeName") || localName.equals("FirstName")) { inForename=true; forename=""; } else if (localName.equals("Suffix")) { inSuffix = true; suffix = ""; } else if(localName.equals("Issue")){inIssue=true;} else if(localName.equals("MedlinePgn")){inMedlinePgn=true; }//pagenumber else if(localName.equals("URL")){inUrl=true;} else if(localName.equals("Initials")){inInitials=true;} else if(localName.equals("AbstractText")){ inAbstractText=true;} else if(localName.equals("ArticleId")){ for (int i = 0; i < atts.getLength(); i++) { String value = atts.getValue(i); if(value.equals("doi")) inDoi=true; else if(value.equals("pii")) inPii=true; } } else if(localName.equals("Affiliation")){ inAffiliation=true; } return; } String join(Object[] sa,String delim){ StringBuffer sb=new StringBuffer(); sb.append( sa[0].toString() ); for(int i=1; i<sa.length; i++) { sb.append( delim ); sb.append( sa[i].toString() ); } return sb.toString(); } String makeBibtexString(){ String out = ""; // PENDING jeffrey.kuhn@yale.edu 2005-05-27 : added call to fixPageRange out= "article{,\n" + " author = { " + author + " },\n title = { " + title + "},\n journal ={ " + journal + "},\n year = " + year + "},\n volume = { " + volume + "},\n number = { "+ number + "},\n pages = { " + fixPageRange(page) + "},\n abstract = { " + abstractText + "},\n}"; return out; } public void endElement( String uri, String localName, String qName ) { if(localName.equals("PubmedArticle")){ //bibitems.add( new Bibitem(null, makeBibtexString(), Globals.nextKey(),"-1" ) ); // check if year ="" then give medline date instead if(year.equals("")){ if(!MedlineDate.equals("")) { // multi-year date format //System.out.println(MedlineDate); year = MedlineDate.substring(0,4); //Matcher m = Pattern.compile("\\b[0-9]{4}\\b").matcher(MedlineDate); //if(m.matches()) //year = m.group(); } } // Build a string from the collected keywords: StringBuffer sb = new StringBuffer(); for (Iterator<String> iterator = descriptors.iterator(); iterator.hasNext();) { String s = iterator.next(); sb.append(s); if (iterator.hasNext()) sb.append(KEYWORD_SEPARATOR); } keywords = sb.toString(); BibtexEntry b=new BibtexEntry(Util.createNeutralId(),//Globals.DEFAULT_BIBTEXENTRY_ID, Globals.getEntryType("article")); // id assumes an existing database so don't create one here if (!author.equals("")) { b.setField("author",ImportFormatReader.expandAuthorInitials(author)); author = ""; } if (!title.equals("")) b.setField("title",title); if (!journal.equals("")) b.setField("journal",journal); if (!year.equals("")) b.setField("year",year); // PENDING jeffrey.kuhn@yale.edu 2005-05-27 : added call to fixPageRange if (!page.equals("")) b.setField("pages",fixPageRange(page)); if (!volume.equals("")) b.setField("volume",volume); if (!language.equals("")) b.setField("language",language); if (!pst.equals("")) b.setField("medline-pst", pst); if (!abstractText.equals("")) b.setField("abstract",abstractText.replaceAll("%","\\\\%")); if (!keywords.equals("")) b.setField("keywords",keywords); if (!month.equals("")) b.setField("month",month); //if (!url.equals("")) b.setField("url",url); if (!number.equals("")) b.setField("number",number); if(!doi.equals("")){ b.setField("doi",doi); b.setField("url","http://dx.doi.org/"+doi); } if(!pii.equals("")) b.setField("pii",pii); if(!affiliation.equals("")) { b.setField("institution",affiliation.replaceAll("#", "\\\\#")); } // PENDING jeffrey.kuhn@yale.edu 2005-05-27 : added "pmid" bibtex field // Older references do not have doi entries, but every // medline entry has a unique pubmed ID (aka primary ID). // Add a bibtex field for the pubmed ID for future use. if (!pubmedid.equals("")) b.setField("pmid",pubmedid); bibitems.add( b ); abstractText = ""; author = ""; title=""; journal=""; keywords =""; doi=""; pii=""; year=""; forename=""; lastName=""; suffix = ""; abstractText=""; affiliation=""; pubmedid=""; majorTopic = ""; minorTopics = ""; month="";volume="";language="";pst="";lastname=""; suffix=""; initials="";number="";page="";medlineID="";url=""; MedlineDate=""; descriptors.clear(); } else if(localName.equals("ArticleTitle")){inTitle=false;} else if(localName.equals("PubDate")){inPubDate=false;} else if(localName.equals("Year")){inYear=false;} else if(localName.equals("PMID")){inPubMedID=false;} else if(localName.equals("MedlineDate")){inMedlineDate=false;} else if(localName.equals("MedlineTA")){inJournal=false;} //journal name else if(localName.equals("Month")){inMonth=false;} else if(localName.equals("Volume")){inVolume=false;} else if(localName.equals("Language")){inLanguage=false;} else if(localName.equals("PublicationStatus")){inPst=false;} else if(localName.equals("AuthorList")){ author = join( authors.toArray(), " and " ); inAuthorList = false; } else if(localName.equals("Author")){ // forename sometimes has initials with " " in middle: is pattern [A-Z] [A-Z] // when above is the case replace it with initials if(forename.length()==3 && forename.charAt(1)==' '){ forename=initials; } // Put together name with last name first, and enter suffix in between if present: if (lastname.indexOf(" ") > 0) author = "{"+lastname+"}"; else author = lastname; if (suffix.length() > 0) author = author+", "+suffix; if (forename.length() > 0) author = author+", "+forename; //author = initials + " " + lastname; authors.add(author); inAuthor=false; forename = ""; initials = ""; lastname = ""; suffix = ""; } else if(localName.equals("DescriptorName")) inDescriptorName=false; else if(localName.equals("QualifierName")) inQualifierName=false; else if(localName.equals("MeshHeading")) { inMeshHeader = false; if (minorTopics.equals("")) descriptors.add(majorTopic); else descriptors.add(majorTopic+", "+minorTopics); } else if(localName.equals("LastName")){inLastName=false;} else if(localName.equals("Suffix")){inSuffix=false;} else if(localName.equals("ForeName")||localName.equals("FirstName")){ inForename=false;} else if(localName.equals("Issue")){ inIssue = false;} else if(localName.equals("MedlinePgn")){inMedlinePgn=false;}//pagenumber else if(localName.equals("URL")){ inUrl=false;} else if(localName.equals("Initials")){ //initials= '.' + initials + '.'; inInitials=false; } else if(localName.equals("AbstractText")){ inAbstractText=false;} else if(localName.equals("Affiliation")){ inAffiliation=false; } else if(localName.equals("ArticleId")){ if(inDoi) inDoi=false; else if(inPii) inPii=false;} } public void characters( char[] data, int start, int length ) { // if stack is not ready, data is not content of recognized element if( inTitle ){ title += new String( data, start, length);} else if(inYear){ year+=new String(data,start,length);} else if(inJournal){journal += new String(data,start,length);} else if(inMonth){month += new String(data,start,length);} else if(inVolume){volume += new String(data,start,length);} else if(inLanguage){language += new String(data,start,length).toLowerCase();} else if(inPst){pst += new String(data,start,length);} else if(inLastName){lastname += new String(data,start,length);} else if(inSuffix){suffix += new String(data,start,length);} else if(inInitials){initials += new String(data,start,length);} else if(inIssue){number += new String(data,start,length);} else if(inMedlinePgn){ page += new String(data,start,length);} else if(inMedlineID){medlineID += new String(data,start,length);} else if(inURL){url += new String(data,start,length);} else if(inPubMedID){pubmedid = new String(data,start,length);} else if(inQualifierName) { if (!minorTopics.equals("")) minorTopics = minorTopics+"/"; minorTopics = minorTopics + new String(data,start,length); } else if(inDescriptorName) { majorTopic = new String(data,start,length); } //keywords += new String(data,start,length) + ", "; else if(inForename){ forename += new String(data,start,length); //System.out.println("IN FORENAME: " + forename); } else if(inAbstractText){ abstractText += new String(data,start,length);} else if(inMedlineDate){ MedlineDate += new String(data,start,length);} else if(inDoi){ doi=new String(data,start,length);} else if(inPii){ pii=new String(data,start,length);} else if(inAffiliation){ affiliation = new String(data,start,length);} } // PENDING jeffrey.kuhn@yale.edu 2005-05-27 : added fixPageRange method // Convert medline page ranges from short form to full form. // Medline reports page ranges in a shorthand format. // The last page is reported using only the digits which // differ from the first page. // i.e. 12345-51 refers to the actual range 12345-12351 public String fixPageRange(String pageRange) { int minusPos = pageRange.indexOf('-'); if (minusPos < 0) { return pageRange; } String first = pageRange.substring(0, minusPos).trim(); String last = pageRange.substring(minusPos+1).trim(); int llast = last.length(), lfirst = first.length(); if (llast < lfirst) { last = first.substring(0, lfirst-llast) + last; } return first + "--" + last; } }