/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.source.pubmed; import java.io.IOException; import java.util.List; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParserFactory; import org.apache.http.HttpStatus; import org.carrot2.core.Document; import org.carrot2.core.LanguageCode; import org.carrot2.core.attribute.Init; import org.carrot2.core.attribute.Internal; import org.carrot2.core.attribute.Processing; import org.carrot2.source.SearchEngineResponse; import org.carrot2.source.SimpleSearchEngine; import org.carrot2.util.StringUtils; import org.carrot2.util.attribute.Attribute; import org.carrot2.util.attribute.AttributeLevel; import org.carrot2.util.attribute.Bindable; import org.carrot2.util.attribute.DefaultGroups; import org.carrot2.util.attribute.Group; import org.carrot2.util.attribute.Input; import org.carrot2.util.attribute.Label; import org.carrot2.util.attribute.Level; import org.carrot2.util.attribute.constraint.IntRange; import org.carrot2.util.httpclient.HttpClientFactory; import org.carrot2.util.httpclient.HttpRedirectStrategy; import org.carrot2.util.httpclient.HttpUtils; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; /** * Performs searches on the PubMed database using its on-line e-utilities: * http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html */ @Bindable(prefix = "PubMedDocumentSource") public class PubMedDocumentSource extends SimpleSearchEngine { /** PubMed search service URL */ public static final String E_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"; /** PubMed fetch service URL */ public static final String E_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"; /** HTTP timeout for pubmed services.*/ public static final int PUBMED_TIMEOUT = HttpClientFactory.DEFAULT_TIMEOUT * 3; /** * Tool name, if registered. * @see "http://www.ncbi.nlm.nih.gov" */ @Init @Input @Attribute @Label("EUtils Registered Tool Name") @Level(AttributeLevel.ADVANCED) @Group(DefaultGroups.QUERY) public String toolName = "Carrot Search"; /** * Maximum results to fetch. No more than the specified number of results * will be fetched from PubMed, regardless of the requested number of results. */ @Processing @Input @Attribute @IntRange(min = 1) @Internal(configuration = true) @Label("Maximum results") @Level(AttributeLevel.ADVANCED) @Group(DefaultGroups.QUERY) public int maxResults = 150; /** * HTTP redirect response strategy (follow or throw an error). */ @Input @Processing @Attribute @Label("HTTP redirect strategy") @Level(AttributeLevel.MEDIUM) @Group(SimpleSearchEngine.SERVICE) @Internal public HttpRedirectStrategy redirectStrategy = HttpRedirectStrategy.NO_REDIRECTS; @Override protected SearchEngineResponse fetchSearchResponse() throws Exception { PubMedIdSearchHandler idResponse = getPubMedIds(query, results); SearchEngineResponse response = getPubMedAbstracts(idResponse.getPubMedPrimaryIds()); response.metadata.put(SearchEngineResponse.RESULTS_TOTAL_KEY, idResponse.getMatchCount()); return response; } @Override protected void afterFetch(SearchEngineResponse response) { for (Document document : response.results) { document.setLanguage(LanguageCode.ENGLISH); } } /** * Gets PubMed entry ids matching the query. */ private PubMedIdSearchHandler getPubMedIds(final String query, final int requestedResults) throws Exception { final XMLReader reader = newXmlReader(); PubMedIdSearchHandler searchHandler = new PubMedIdSearchHandler(); reader.setContentHandler(searchHandler); final String url = E_SEARCH_URL + "?db=pubmed" + "&usehistory=n&" + "&term=" + StringUtils.urlEncodeWrapException(query, "UTF-8") + "&retmax=" + Integer.toString(Math.min(requestedResults, maxResults)) + "&tool=" + StringUtils.urlEncodeWrapException(toolName, "UTF-8"); final HttpUtils.Response response = HttpUtils.doGET( url, null, null, null, null, PUBMED_TIMEOUT, redirectStrategy.value()); // Get document IDs if (response.status == HttpStatus.SC_OK) { reader.parse(new InputSource(response.getPayloadAsStream())); } else { throw new IOException("PubMed returned HTTP Error: " + response.status + ", HTTP payload: " + new String(response.payload, "iso8859-1")); } return searchHandler; } /** * Gets PubMed abstracts corresponding to the provided ids. */ private SearchEngineResponse getPubMedAbstracts(List<String> ids) throws Exception { if (ids.isEmpty()) { return new SearchEngineResponse(); } final XMLReader reader = newXmlReader(); final PubMedContentHandler fetchHandler = new PubMedContentHandler(); reader.setContentHandler(fetchHandler); final String url = E_FETCH_URL + "?db=pubmed" + "&retmode=xml" + "&rettype=abstract" + "&id=" + getIdsString(ids) + "&tool=" + StringUtils.urlEncodeWrapException(toolName, "UTF-8"); final HttpUtils.Response response = HttpUtils.doGET( url, null, null, null, null, PUBMED_TIMEOUT, redirectStrategy.value()); // Get document contents // No URL logging here, as the url can get really long if (response.status == HttpStatus.SC_OK) { reader.parse(new InputSource(response.getPayloadAsStream())); } else { throw new IOException("PubMed returned HTTP Error: " + response.status + ", HTTP payload: " + new String(response.payload, "iso8859-1")); } return fetchHandler.getResponse(); } static XMLReader newXmlReader() throws SAXException, ParserConfigurationException { XMLReader reader = SAXParserFactory.newInstance() .newSAXParser() .getXMLReader(); reader.setFeature("http://xml.org/sax/features/validation", false); reader.setFeature("http://xml.org/sax/features/namespaces", true); reader.setEntityResolver(new EmptyEntityResolver()); return reader; } private String getIdsString(List<String> ids) { final StringBuilder buf = new StringBuilder(); for (String id : ids) { buf.append(id); buf.append(","); } if (buf.length() > 0) { return buf.substring(0, buf.length() - 1); } else { return ""; } } }