/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.medline;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.erasmusmc.utilities.StringUtilities;
/**
 * Retrieves records from the online PubMed database through the NCBI E-utilities
 * <code>efetch</code> service and converts the returned XML into {@link MedlineRecord}
 * objects.
 *
 * <p>PMIDs are requested in batches of {@link #BATCH_SIZE}, and consecutive queries are
 * spaced at least {@link #MIN_WAIT_TIME_MS} milliseconds apart. The XML is scanned line by
 * line rather than with an XML parser, so only elements that start on their own line and
 * (except for <code>PMID</code> and <code>DescriptorName</code>) carry no attributes are
 * recognised.
 *
 * <p>The throttling state is a plain static field; the class makes no attempt at
 * thread-safety.
 */
public class FetchRecordsFromOnlinePubmed {
  /** Maximum number of PMIDs sent in a single efetch request. */
  private static final int BATCH_SIZE = 100;

  /** Minimum time in milliseconds between two consecutive queries. */
  private static final int MIN_WAIT_TIME_MS = 5000;

  /** Base URL of the efetch request; the comma-separated PMIDs are appended to it. */
  // NOTE(review): the E-utilities documentation names the format parameter "retmode";
  // confirm that "mode=xml" is still honoured by the service.
  private static final String EFETCH_URL =
      "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&mode=xml&id=";

  /** Time (epoch milliseconds) at which the most recent query finished; 0 before the first. */
  private static long lastQueryTime = 0;

  /**
   * Fetches the records for the given PMIDs, querying PubMed in batches.
   *
   * @param pmids the PubMed identifiers to retrieve; <code>null</code> is treated as empty
   * @return the records that could be retrieved and parsed, in the order they appear in the
   *         responses; never <code>null</code>. Batches that fail are reported on
   *         <code>System.err</code> and contribute no records. If the calling thread is
   *         interrupted, the remaining batches are skipped and the interrupt status is
   *         left set.
   */
  public static List<MedlineRecord> getRecords(List<Integer> pmids) {
    List<MedlineRecord> result = new ArrayList<MedlineRecord>();
    if (pmids == null) {
      return result;
    }
    for (int offset = 0;
        offset < pmids.size() && !Thread.currentThread().isInterrupted();
        offset += BATCH_SIZE) {
      int end = Math.min(offset + BATCH_SIZE, pmids.size());
      // Copy the window so the request works on an independent list, not a view.
      List<Integer> subset = new ArrayList<Integer>(pmids.subList(offset, end));
      result.addAll(fetchSubset(subset));
    }
    return result;
  }

  /**
   * Sends one efetch request for the given PMIDs and parses the response.
   *
   * @param pmids the identifiers of a single batch
   * @return the parsed records; empty when the batch is empty, the wait was interrupted,
   *         the server did not answer with HTTP 200, or a transport error occurred
   */
  private static List<MedlineRecord> fetchSubset(List<Integer> pmids) {
    List<MedlineRecord> result = new ArrayList<MedlineRecord>();
    if (pmids.isEmpty() || !waitForQuerySlot()) {
      return result;
    }

    HttpClient client = new HttpClient();
    GetMethod method = new GetMethod(buildFetchUrl(pmids));
    method.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
    method.getParams().setParameter(
        HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler(3, false));
    // The charset name used to read "ISO-8559-1", which is not an existing charset.
    method.addRequestHeader(
        "Content-Type", "application/x-www-form-urlencoded; charset=ISO-8859-1");

    try {
      int statusCode = client.executeMethod(method);
      if (statusCode != HttpStatus.SC_OK) {
        // An error page is not PubMed XML, so do not try to parse it.
        System.err.println("Method failed: " + method.getStatusLine());
        return result;
      }
      String responseBody = method.getResponseBodyAsString();
      if (responseBody != null) {
        result = parseRecords(responseBody);
      }
    } catch (HttpException e) {
      System.err.println("Fatal protocol violation: " + e.getMessage());
      e.printStackTrace();
    } catch (IOException e) {
      System.err.println("Fatal transport error: " + e.getMessage());
      e.printStackTrace();
    } finally {
      method.releaseConnection();
      // Recorded here so the next query is throttled even if parsing threw.
      lastQueryTime = System.currentTimeMillis();
    }
    return result;
  }

  /**
   * Blocks until at least {@link #MIN_WAIT_TIME_MS} milliseconds have passed since the
   * previous query.
   *
   * @return <code>true</code> when the query may be sent; <code>false</code> when the
   *         thread was interrupted while waiting, in which case the interrupt status is
   *         restored for the caller
   */
  private static boolean waitForQuerySlot() {
    long timePassed = System.currentTimeMillis() - lastQueryTime;
    if (timePassed >= MIN_WAIT_TIME_MS) {
      return true;
    }
    try {
      System.out.println("Waiting to send next query");
      Thread.sleep(MIN_WAIT_TIME_MS - timePassed);
      return true;
    } catch (InterruptedException e) {
      // Preserve the interrupt instead of swallowing it and querying too early.
      Thread.currentThread().interrupt();
      return false;
    }
  }

  /** Builds the efetch URL: the base URL followed by the PMIDs separated by commas. */
  private static String buildFetchUrl(List<Integer> pmids) {
    StringBuilder url = new StringBuilder(EFETCH_URL);
    boolean first = true;
    for (Integer pmid : pmids) {
      if (!first) {
        url.append(",");
      }
      url.append(pmid.toString());
      first = false;
    }
    return url.toString();
  }

  /**
   * Extracts records from an efetch XML response by scanning it line by line.
   *
   * <p>A record is started by the first <code>PMID</code> line following a
   * <code>&lt;PubmedArticle&gt;</code> line. Field lines encountered while no record is
   * open are ignored, so they can neither cause a null dereference nor leak into the
   * previously parsed record.
   *
   * @param responseBody the complete response text
   * @return the records found, in document order
   */
  private static List<MedlineRecord> parseRecords(String responseBody) {
    List<MedlineRecord> result = new ArrayList<MedlineRecord>();
    MedlineRecord newRecord = null;
    String lastName = "";
    String firstName = "";
    String collectiveName = "";
    boolean primaryPMID = false;

    for (String line : responseBody.split("\n")) {
      String trimLine = line.trim();
      if (trimLine.startsWith("<PubmedArticle>")) {
        primaryPMID = true;
        newRecord = null;
      } else if (trimLine.startsWith("</PubmedArticle>")) {
        newRecord = null;
      } else if (primaryPMID
          && (trimLine.startsWith("<PMID>") || trimLine.startsWith("<PMID "))) {
        // Only the first PMID of an article identifies it; later ones are references.
        primaryPMID = false;
        newRecord = createRecord(trimLine);
        if (newRecord != null) {
          result.add(newRecord);
        }
      } else if (newRecord == null) {
        // No open record to attach this line to.
        continue;
      } else if (trimLine.startsWith("<ArticleTitle>")) {
        newRecord.title =
            StringUtilities.findBetween(trimLine, "<ArticleTitle>", "</ArticleTitle>");
      } else if (trimLine.startsWith("<AbstractText>")) {
        newRecord.abstractTexts.add(
            StringUtilities.findBetween(trimLine, "<AbstractText>", "</AbstractText>"));
        // A record can hold several abstract texts (e.g. PMID 9459395);
        // the first one is used as the main abstract.
        if (newRecord.abstractTexts.size() == 1) {
          newRecord.abstractText = newRecord.abstractTexts.get(0);
        }
      } else if (trimLine.startsWith("<Title>")) {
        newRecord.journal = StringUtilities.findBetween(trimLine, "<Title>", "</Title>");
      } else if (trimLine.startsWith("<PublicationType>")) {
        newRecord.publicationType.add(
            StringUtilities.findBetween(trimLine, "<PublicationType>", "</PublicationType>"));
      } else if (trimLine.startsWith("<MedlineTA>")) {
        newRecord.journalShortForm =
            StringUtilities.findBetween(trimLine, "<MedlineTA>", "</MedlineTA>");
      } else if (trimLine.startsWith("<Language>")) {
        newRecord.language =
            StringUtilities.findBetween(trimLine, "<Language>", "</Language>").toLowerCase();
      } else if (trimLine.startsWith("<LastName>")) {
        lastName = StringUtilities.findBetween(trimLine, "<LastName>", "</LastName>");
      } else if (trimLine.startsWith("<ForeName>")) {
        firstName = StringUtilities.findBetween(trimLine, "<ForeName>", "</ForeName>");
      } else if (trimLine.startsWith("<CollectiveName>")) {
        collectiveName =
            StringUtilities.findBetween(trimLine, "<CollectiveName>", "</CollectiveName>");
      } else if (trimLine.startsWith("</Author>")) {
        // Format as "<initial>. <last name>"; fall back to the collective name
        // when neither a first nor a last name was given.
        StringBuilder author = new StringBuilder();
        if (!firstName.equals("")) {
          author.append(firstName.charAt(0));
          author.append(". ");
        }
        author.append(lastName);
        if (author.length() != 0) {
          newRecord.authors.add(author.toString());
        } else {
          newRecord.authors.add(collectiveName);
        }
        lastName = "";
        firstName = "";
        collectiveName = "";
      } else if (trimLine.startsWith("<DescriptorName ")) {
        MeSHHeader header = new MeSHHeader();
        header.descriptor = StringUtilities.findBetween(trimLine, ">", "</DescriptorName>");
        newRecord.meshHeaders.add(header);
      }
    }
    return result;
  }

  /**
   * Creates a record from a <code>PMID</code> line, with or without attributes on the tag.
   *
   * @param pmidLine a trimmed line starting with <code>&lt;PMID&gt;</code> or
   *        <code>&lt;PMID </code>
   * @return the new record, or <code>null</code> when the identifier is not a valid integer
   */
  private static MedlineRecord createRecord(String pmidLine) {
    // For a tag with attributes, take the text after the first '>' in the same way the
    // DescriptorName element is read.
    String pmidText = pmidLine.startsWith("<PMID>")
        ? StringUtilities.findBetween(pmidLine, "<PMID>", "</PMID>")
        : StringUtilities.findBetween(pmidLine, ">", "</PMID>");
    try {
      return new MedlineRecord(Integer.parseInt(pmidText == null ? null : pmidText.trim()));
    } catch (NumberFormatException e) {
      // One malformed identifier must not abort the whole batch.
      System.err.println("Skipping record with unparsable PMID: " + pmidLine);
      return null;
    }
  }
}