package org.mindinformatics.services.connector.pubmed.dataaccess;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import javax.xml.XMLConstants;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.sax.SAXSource;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.mindinformatics.services.connector.pubmed.fetch.PubmedArticle;
import org.mindinformatics.services.connector.pubmed.fetch.PubmedArticleSet;
import org.mindinformatics.services.connector.pubmed.search.ESearchResult;
import org.mindinformatics.services.connector.pubmed.search.Id;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
/**
 * Client for the NCBI E-utilities PubMed endpoints: ESearch resolves a query
 * to PubMed identifiers and EFetch retrieves the matching records. Responses
 * are unmarshalled through JAXB into the generated search/fetch classes.
 * <p>
 * The agent is obtained through {@link #getInstance()} and can optionally
 * route its requests through an HTTP proxy (see {@link #setProxyIp(String)}
 * and {@link #setProxyPort(String)}).
 *
 * @author Paolo Ciccarese <paolo.ciccarese@gmail.com>
 */
public class PubmedSearchAgent {

    private static final Log logger = LogFactory
            .getLog(PubmedSearchAgent.class);

    // E-utilities are addressed over HTTPS; HttpURLConnection does not follow
    // a redirect from http to https, so the secure scheme is used directly.
    private static final String BASE_SEARCH_URL =
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=";
    private static final String BASE_FETCH_URL =
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=";

    private static final String SEARCH_PACKAGE_NAME =
            "org.mindinformatics.services.connector.pubmed.search";
    private static final String FETCH_PACKAGE_NAME =
            "org.mindinformatics.services.connector.pubmed.fetch";

    /** Character encoding used to decode responses on the direct (no proxy) path. */
    private static final String RESPONSE_ENCODING = "utf-8";

    private static PubmedSearchAgent instance = null;

    // Only the contexts are kept: a JAXBContext can be shared, whereas an
    // Unmarshaller must not be used by several threads at once, so one is
    // created per request in unmarshall().
    private final JAXBContext searchJaxbContext;
    private final JAXBContext fetchJaxbContext;

    private String proxyIp;
    private String proxyPort;

    // ------------------------------------------------------------------------
    // Singleton and Initialization
    // ------------------------------------------------------------------------

    /**
     * Builds the JAXB contexts for the search and fetch packages.
     *
     * @throws RuntimeException wrapping the {@link JAXBException} if a context
     *         cannot be created
     */
    private PubmedSearchAgent() {
        try {
            searchJaxbContext = JAXBContext.newInstance(SEARCH_PACKAGE_NAME);
            fetchJaxbContext = JAXBContext.newInstance(FETCH_PACKAGE_NAME);
        } catch (JAXBException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Sets the host of the HTTP proxy. The proxy is only used when both the
     * host and the port are set (host longer than 3 characters, port longer
     * than 1 character, after trimming).
     *
     * @param proxyIp The proxy host name or address
     */
    public void setProxyIp(String proxyIp) {
        this.proxyIp = proxyIp;
    }

    /**
     * Sets the port of the HTTP proxy.
     *
     * @param proxyPort The proxy port, as a decimal string
     */
    public void setProxyPort(String proxyPort) {
        this.proxyPort = proxyPort;
    }

    /**
     * Returns the shared agent, creating it on first use.
     *
     * @return The singleton instance
     */
    public static synchronized PubmedSearchAgent getInstance() {
        if (instance == null) {
            instance = new PubmedSearchAgent();
        }
        return instance;
    }

    // ------------------------------------------------------------------------

    /**
     * Returns the PubMed records corresponding to the list of requested PubMed
     * identifiers
     * @param pmids The list of PubMed identifiers
     * @return The correspondent PubMed records
     * @throws RuntimeException if the request or the parsing of the response fails
     */
    public PubmedArticleSet fetchPubmedDocuments(List<String> pmids) {
        String url = BASE_FETCH_URL + join(",", pmids) + "&retmode=xml";
        logger.info("fetchurl = " + url);
        return (PubmedArticleSet) unmarshall(url, fetchJaxbContext);
    }

    /**
     * Fetching metadata from the PubMed service.
     * @param query The query
     * @param range The number of results to return
     * @param offset The offset
     * @return The metadata of the PubMed entries, or null when the search
     *         produced no result
     * @throws RuntimeException if a request or the parsing of a response fails
     */
    public PubmedArticleSet fetch(String query, int range, int offset) {
        ESearchResult esResult = search(query, range, offset);
        if (esResult == null || "0".equals(esResult.getCount().getContent())) {
            return null;
        }
        return fetchPubmedDocuments(collectPmids(esResult));
    }

    /**
     * Fetching metadata from the PubMed service and returns also stats that can
     * be used for pagination
     * @param query The query
     * @param range The number of results to return
     * @param offset The offset
     * @return A single-entry map whose key is the total number of hits
     *         reported by the search and whose value holds the requested page
     *         of PubMed entries; null when the search produced no result
     * @throws RuntimeException if a request or the parsing of a response fails
     */
    public Map<Integer,PubmedArticleSet> fetchWithStats(String query, int range, int offset) {
        logger.info("Query: " + query);
        ESearchResult esResult = search(query, range, offset);
        // The null check has to come before the result is touched for logging.
        if (esResult == null) {
            return null;
        }
        String totalHits = esResult.getCount().getContent();
        logger.info("Results count: " + totalHits);
        if ("0".equals(totalHits)) {
            return null;
        }
        logger.info("Results #ids: " + esResult.getIdList().getId().size());

        Map<Integer,PubmedArticleSet> map = new HashMap<Integer,PubmedArticleSet>();
        map.put(Integer.parseInt(totalHits), fetchPubmedDocuments(collectPmids(esResult)));
        return map;
    }

    /**
     * Runs an ESearch request and returns the unmarshalled result.
     * <p>
     * Spaces in the query are sent as {@code +}; any other character is passed
     * through untouched, so callers remain responsible for encoding characters
     * that are not legal in a URL.
     *
     * @param query The query
     * @param maxResults The maximum number of identifiers to return
     * @param start The index of the first identifier to return
     * @return The search result
     * @throws IllegalArgumentException if the query is null
     * @throws RuntimeException if the request or the parsing of the response fails
     */
    public ESearchResult search(String query, int maxResults, int start) {
        if (query == null) {
            throw new IllegalArgumentException("query must not be null");
        }
        // A literal space would produce an invalid request line.
        String term = query.replace(' ', '+');
        String url = BASE_SEARCH_URL + term + "&retmax=" + maxResults + "&retstart=" + start;
        logger.info("Search url = " + url);
        return (ESearchResult) this.unmarshall(url, searchJaxbContext);
    }

    /**
     * Extracts the PubMed identifiers listed in a search result.
     */
    private static List<String> collectPmids(ESearchResult esResult) {
        List<String> pmids = new ArrayList<String>();
        for (Id currentId : esResult.getIdList().getId()) {
            pmids.add(currentId.getContent());
        }
        return pmids;
    }

    /**
     * Requests the given URL (through the proxy when one is configured) and
     * unmarshals the XML response with an unmarshaller created from the given
     * context. The response stream is always closed before returning.
     *
     * @param url The URL to request
     * @param context The JAXB context matching the expected response
     * @return The unmarshalled response
     * @throws RuntimeException wrapping any failure of the request or parsing
     */
    private Object unmarshall(String url, JAXBContext context) {
        // Snapshot the settings so the check and the use see the same values.
        String host = proxyIp;
        String port = proxyPort;
        boolean useProxy = host != null && host.trim().length() > 3
                && port != null && port.trim().length() > 1;

        InputStream response = null;
        try {
            Unmarshaller unmarshaller = context.createUnmarshaller();
            if (useProxy) {
                // Use the trimmed values that were actually validated above.
                String proxyHost = host.trim();
                int proxyPortNumber = Integer.parseInt(port.trim());
                logger.info("proxy: " + proxyHost + "-" + proxyPortNumber);

                Proxy proxy = new Proxy(Proxy.Type.HTTP,
                        new InetSocketAddress(proxyHost, proxyPortNumber));
                HttpURLConnection connectionWithProxy =
                        (HttpURLConnection) new URL(url).openConnection(proxy);
                connectionWithProxy.setRequestProperty("Content-type", "text/xml");
                connectionWithProxy.setRequestProperty("Accept", "text/xml, application/xml");
                connectionWithProxy.setRequestMethod("GET");
                connectionWithProxy.connect();

                // Parser with secure processing on and without schema
                // validation or external DTD loading.
                SAXParserFactory spf = SAXParserFactory.newInstance();
                spf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
                spf.setFeature("http://apache.org/xml/features/validation/schema", false);
                spf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
                XMLReader xmlReader = spf.newSAXParser().getXMLReader();

                response = connectionWithProxy.getInputStream();
                InputSource theInputSource = new InputSource(response);
                SAXSource source = new SAXSource(xmlReader, theInputSource);
                logger.info("InputSource: " + theInputSource);
                return unmarshaller.unmarshal(source);
            }

            logger.info("No proxy detected");
            // NOTE(review): unlike the proxy path, this path relies on the
            // unmarshaller's default parser configuration - confirm whether
            // the same parser hardening should apply here.
            response = new URL(url).openStream();
            BufferedReader theReader = new BufferedReader(
                    new InputStreamReader(response, RESPONSE_ENCODING));
            return unmarshaller.unmarshal(new InputSource(theReader));
        } catch (Exception e) {
            // Log with the exception so the stack trace is not lost.
            logger.error("Unable to retrieve or parse " + url, e);
            throw new RuntimeException(e);
        } finally {
            closeQuietly(response);
        }
    }

    /**
     * Closes the stream if it was opened; a failure to close is logged and
     * otherwise ignored so that it cannot mask the outcome of the request.
     */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            logger.warn("Unable to close the response stream", e);
        }
    }

    /**
     * Concatenates the items of the list, separated by the delimiter.
     */
    private static String join(String delim, java.util.List<String> idList) {
        StringBuilder builder = new StringBuilder();
        Iterator<String> iter = idList.iterator();
        while (iter.hasNext()) {
            builder.append(iter.next());
            if (iter.hasNext()) {
                builder.append(delim);
            }
        }
        return builder.toString();
    }

    // TODO This is for testing and it has to be sanitized
    /**
     * Manual smoke test: runs a sample query and logs the abstract of each
     * returned record.
     */
    public static void main(String[] args) {
        String query = "semantic web";
        try {
            PubmedSearchAgent agent = PubmedSearchAgent.getInstance();
            PubmedArticleSet articleSet = agent.fetch(query, 20, 0);
            if (articleSet == null) {
                // fetch() returns null when the search has no hits.
                logger.info("count = 0");
                return;
            }
            List<PubmedArticle> docs = articleSet.getPubmedArticle();
            logger.info("count = " + docs.size());
            for (PubmedArticle pubmedArticle : docs) {
                String abst = "";
                try {
                    abst = pubmedArticle.getMedlineCitation().getArticle()
                            .getAbstract().getAbstractText().getContent();
                } catch (NullPointerException ignored) {
                    // Best effort: a missing link in the accessor chain is
                    // reported as an empty abstract.
                }
                logger.info(abst);
            }
        } catch (Exception e) {
            logger.error("PubMed test query failed", e);
        }
    }
}