/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.medline;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.WriteTextFile;
/**
 * Retrieves the PMIDs matching a PubMed query through the NCBI E-utilities
 * ESearch service. Results are fetched in batches of {@link #batchSize} and
 * consecutive requests are spaced at least {@code minWaitTime} milliseconds apart.
 *
 * All configuration and bookkeeping lives in unsynchronized static fields, so
 * the class offers no protection against concurrent use from several threads.
 */
public class FetchPMIDsFromOnlinePubmed {
    /** Maximum number of times a single URL is requested before giving up. */
    public static int maxAttempts = 10;
    /** Body of the most recent HTTP response; the empty string when nothing was received. */
    public static String response;
    /** Time (ms since epoch) at which the last response was received. */
    private static long lastQueryTime = 0;
    /** One-off additional delay (ms) applied before the next request after a service error. */
    private static long extraDelay = 0;
    private static int minWaitTime = 1000; // Wait for 1 second to do next query
    /** Number of PMIDs requested per ESearch call (sent as {@code retmax}). */
    public static int batchSize = 100000; // Seems like bug is finally fixed. (Temporarily set to 1000, but should be 100000 when NLM fixes bug in their service)
    /** Retrieval stops once at least this many PMIDs have been collected. */
    public static int maxReturn = Integer.MAX_VALUE;

    /**
     * Command-line entry point. With three arguments (query, output file,
     * e-mail address) it saves the PMIDs for that query; with fewer arguments
     * it runs the built-in test query.
     *
     * @param args optional: query, output filename, e-mail address
     */
    public static void main(String[] args) {
        minWaitTime = 1000;
        batchSize = 100000;
        if (args.length >= 3) {
            savePMIDs(args[0], args[1], args[2]);
        } else {
            // Test:
            savePMIDs("breast cancer", "/Users/mulligen/Desktop/resultPMIDS.txt", "e.vanmulligen@erasmusmc.nl");
        }
    }

    /**
     * Helper function. Gets PMIDs from PubMed, then saves them to the file,
     * one PMID per line.
     *
     * @param query    the PubMed query
     * @param filename the file the PMIDs are written to
     * @param email    the e-mail address sent along with every request
     */
    public static void savePMIDs(String query, String filename, String email) {
        List<Integer> pmids = getPMIDs(query, email);
        WriteTextFile out = new WriteTextFile(filename);
        try {
            for (Integer pmid : pmids) {
                out.writeln(pmid);
            }
        } finally {
            // Close the file even when writing fails half-way.
            out.close();
        }
    }

    /**
     * Fetches all PMIDs matching a query, requesting batch after batch until
     * the reported total (or {@link #maxReturn}) is reached, the service
     * reports a problem, or a batch yields no new PMIDs.
     *
     * @param query The Pubmed query (e.g. 'Schuemie MJ[Author]')
     * @param email The e-mail address sent along with every request
     * @return A list of PMIDs matching your searching criteria
     */
    public static List<Integer> getPMIDs(String query, String email) {
        List<Integer> pmids = new ArrayList<Integer>();
        boolean done = false;
        while (!done) {
            int sizeBefore = pmids.size();
            String url = generateUrl(sizeBefore, query, email);
            System.out.println("Sending query to Pubmed: " + url);
            String body = getHTML(url);
            done = parseResponse(body, pmids);
            if (!done && pmids.size() == sizeBefore) {
                // Without progress the next request would be identical to this
                // one, so continuing would loop forever (e.g. after a failed request).
                System.err.println("No new PMIDs received at offset " + sizeBefore + "; stopping retrieval");
                done = true;
            }
        }
        return pmids;
    }

    /**
     * Builds the ESearch URL for one batch.
     *
     * @param offset index of the first result to return; omitted from the URL when 0
     * @param query  the raw (unencoded) PubMed query
     * @param email  the raw e-mail address; the parameter is omitted when null or empty
     * @return the request URL with query and e-mail URL-encoded
     */
    private static String generateUrl(int offset, String query, String email) {
        StringBuilder url = new StringBuilder();
        url.append("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=");
        url.append(batchSize);
        if (offset != 0) {
            url.append("&retstart=");
            url.append(offset);
        }
        url.append("&term=");
        // Encodes spaces as '+' and escapes characters such as '[', ']', '&' and '#'.
        url.append(encode(query));
        if (email != null && email.length() != 0) {
            url.append("&email=");
            url.append(encode(email));
        }
        return url.toString();
    }

    /** URL-encodes a single query-string value as UTF-8. */
    private static String encode(String value) {
        try {
            return URLEncoder.encode(value, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // UTF-8 support is mandatory for every Java platform.
            throw new IllegalStateException("UTF-8 is not supported", e);
        }
    }

    /**
     * Scans one ESearch response line by line, appending every PMID found
     * between the {@code <IdList>} and {@code </IdList>} lines to {@code pmids}.
     *
     * @param response the response body
     * @param pmids    the list the parsed PMIDs are appended to
     * @return true when retrieval should stop: the service sent an output
     *         message, a number could not be parsed, or the collected number
     *         of PMIDs equals the reported count or has reached {@link #maxReturn}
     */
    private static boolean parseResponse(String response, List<Integer> pmids) {
        boolean inIdList = false;
        boolean haveCount = false;
        int count = 0;
        for (String line : response.split("\n")) {
            if (line.contains("<OutputMessage>")) {
                System.err.println(line.trim());
                return true;
            } else if (!haveCount && line.contains("<Count>")) {
                // Only the first <Count> is used; later ones are ignored.
                try {
                    count = Integer.parseInt(StringUtilities.findBetween(line, "<Count>", "</Count>"));
                } catch (NumberFormatException e) {
                    System.err.println(e.getMessage() + ", Problem parsing count: " + line);
                    return true;
                }
                haveCount = true;
            }
            if (inIdList) {
                if (line.contains("</IdList>")) {
                    inIdList = false;
                } else {
                    String pmid = StringUtilities.findBetween(line, "<Id>", "</Id>");
                    try {
                        pmids.add(Integer.parseInt(pmid));
                    } catch (NumberFormatException e) {
                        System.err.println(e.getMessage() + ", Problem parsing PMID: " + line);
                        return true;
                    }
                }
            }
            // Checked last so that the <IdList> line itself is not parsed as an id.
            if (line.contains("<IdList>")) {
                inIdList = true;
            }
        }
        return (pmids.size() == count || pmids.size() >= maxReturn);
    }

    /**
     * Performs a GET on the URL, retrying up to {@link #maxAttempts} times
     * while the status is not 200 or the body contains {@code <ERROR>}.
     * The body of the last response is also stored in {@link #response}.
     *
     * @param url the fully encoded request URL
     * @return the body of the last response, or the empty string when none was received
     */
    private static String getHTML(String url) {
        HttpClient client = new HttpClient();
        GetMethod method = new GetMethod(url);
        method.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler(3, false));
        method.addRequestHeader("Content-Type", "application/x-www-form-urlencoded; charset=ISO-8859-1");
        response = "";
        try {
            int statusCode;
            int attempts = 0;
            do {
                checkWaitTime();
                statusCode = client.executeMethod(method);
                String body = method.getResponseBodyAsString();
                // The body may be absent; normalise to "" so callers never see null.
                response = (body == null) ? "" : body;
                resetWaitTime();
                if (statusCode != HttpStatus.SC_OK) {
                    System.err.println("Method failed: " + method.getStatusLine());
                } else if (response.contains("<ERROR>")) {
                    System.out.println(response);
                    // Any non-OK value works here; it only forces another attempt.
                    statusCode = HttpStatus.SC_PARTIAL_CONTENT;
                    extraDelay = 60000; // wait minute extra
                }
                attempts++;
            } while (statusCode != HttpStatus.SC_OK && attempts < maxAttempts);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Failed after " + attempts + " attempts on URL: " + url);
            }
        } catch (HttpException e) {
            System.err.println("Fatal protocol violation: " + e.getMessage());
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("Fatal transport error: " + e.getMessage());
            e.printStackTrace();
        } finally {
            method.releaseConnection();
        }
        return response;
    }

    /** Records the current time as the moment the last response was received. */
    private static void resetWaitTime() {
        lastQueryTime = System.currentTimeMillis();
    }

    /**
     * Sleeps until at least {@code minWaitTime + extraDelay} milliseconds have
     * passed since the last response, then clears the one-off extra delay.
     */
    private static void checkWaitTime() {
        long timePassed = System.currentTimeMillis() - lastQueryTime;
        long remaining = minWaitTime + extraDelay - timePassed;
        if (remaining > 0) {
            try {
                System.out.println("Waiting to send next query");
                Thread.sleep(remaining);
            } catch (InterruptedException e) {
                // Preserve the interrupt for the caller instead of swallowing it.
                Thread.currentThread().interrupt();
            }
        }
        // The extra delay applies to one request only, whether or not we had to sleep.
        extraDelay = 0;
    }
}