package org.jabref.logic.importer.fetcher;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.jabref.logic.formatter.bibtexfields.ClearFormatter;
import org.jabref.logic.formatter.bibtexfields.NormalizeMonthFormatter;
import org.jabref.logic.help.HelpFile;
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.IdBasedParserFetcher;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.SearchBasedFetcher;
import org.jabref.logic.importer.fileformat.MedlineImporter;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.cleanup.FieldFormatterCleanup;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.FieldName;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.utils.URIBuilder;

/**
 * Fetch or search from PubMed <a href="http://www.ncbi.nlm.nih.gov/sites/entrez/">www.ncbi.nlm.nih.gov</a>
 * The MedlineFetcher fetches the entries from the PubMed database.
 * See <a href="http://help.jabref.org/en/MedlineRIS">help.jabref.org</a> for a detailed documentation of the available fields.
 */
public class MedlineFetcher implements IdBasedParserFetcher, SearchBasedFetcher {

    private static final Log LOGGER = LogFactory.getLog(MedlineFetcher.class);

    /** Maximum number of IDs requested from esearch (sent as the {@code retmax} parameter). */
    private static final int NUMBER_TO_FETCH = 50;
    private static final String ID_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi";
    private static final String SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi";

    /** Value of the first {@code Count} element of the most recent search response; 0 if none was seen. */
    private int numberOfResultsFound;

    /**
     * Replaces every comma in the given string (with or without a following space) by " AND ".
     *
     * @param query input whose commas should be replaced
     * @return the input with all commas replaced by " AND "
     */
    private static String replaceCommaWithAND(String query) {
        // Plain literal replacement; no regular expression is needed here.
        return query.replace(", ", " AND ").replace(",", " AND ");
    }

    /**
     * When using 'esearch.fcgi?db=&lt;database&gt;&amp;term=&lt;query&gt;' we will get a list of IDs matching the query.
     * Input: Any text query (&amp;term)
     * Output: List of UIDs matching the query
     *
     * @param query the search term; commas are turned into " AND " when the URL is built
     * @return the IDs listed in the response's {@code IdList} element (possibly empty)
     * @throws FetcherException if the URL cannot be built, the request fails, or the response cannot be parsed
     * @see <a href="https://www.ncbi.nlm.nih.gov/books/NBK25500/">www.ncbi.nlm.nih.gov/books/NBK25500/</a>
     */
    private List<String> getPubMedIdsFromQuery(String query) throws FetcherException {
        // Do not carry over the count of a previous search if this response has no Count element.
        numberOfResultsFound = 0;
        try {
            URL searchUrl = createSearchUrl(query);

            XMLInputFactory inputFactory = XMLInputFactory.newFactory();
            // The document comes from the network: never resolve external entities (XXE).
            inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

            // XMLStreamReader.close() does not close the underlying stream, so the stream is managed here.
            try (InputStream response = searchUrl.openStream()) {
                XMLStreamReader streamReader = inputFactory.createXMLStreamReader(response);
                try {
                    return readIdList(streamReader);
                } finally {
                    streamReader.close();
                }
            }
        } catch (IOException | URISyntaxException e) {
            throw new FetcherException("Unable to get PubMed IDs", Localization.lang("Unable to get PubMed IDs"), e);
        } catch (XMLStreamException | NumberFormatException e) {
            // A non-numeric Count is a malformed response and is reported like any other parse failure.
            throw new FetcherException("Error while parsing ID list", Localization.lang("Error while parsing ID list"), e);
        }
    }

    /**
     * Reads the result count and the IDs from an esearch response.
     * Stores the first {@code Count} value in {@link #numberOfResultsFound}.
     *
     * @param streamReader reader positioned at the start of the response document
     * @return the non-empty texts of all {@code Id} elements inside {@code IdList}
     * @throws XMLStreamException    if the document is not well-formed
     * @throws NumberFormatException if the {@code Count} element does not contain an integer
     */
    private List<String> readIdList(XMLStreamReader streamReader) throws XMLStreamException {
        List<String> idList = new ArrayList<>();
        boolean insideIdList = false;
        boolean countRead = false;

        while (streamReader.hasNext()) {
            int event = streamReader.next();
            if (event == XMLStreamConstants.START_ELEMENT) {
                String element = streamReader.getLocalName();
                if (!countRead && "Count".equals(element)) {
                    // getElementText() joins text that the parser may deliver in several chunks.
                    numberOfResultsFound = Integer.parseInt(streamReader.getElementText().trim());
                    countRead = true;
                } else if ("IdList".equals(element)) {
                    insideIdList = true;
                } else if (insideIdList && "Id".equals(element)) {
                    // Only the Id elements are collected, so whitespace between them never becomes an ID.
                    String id = streamReader.getElementText().trim();
                    if (!id.isEmpty()) {
                        idList.add(id);
                    }
                }
            } else if ((event == XMLStreamConstants.END_ELEMENT) && "IdList".equals(streamReader.getLocalName())) {
                // Everything relevant is listed before the IdList.
                // So we stop reading right after the IdList tag closes.
                break;
            }
        }
        return idList;
    }

    @Override
    public String getName() {
        return "Medline/PubMed";
    }

    @Override
    public HelpFile getHelpPage() {
        return HelpFile.FETCHER_MEDLINE;
    }

    /**
     * Builds the efetch URL that returns the given record(s) as XML.
     *
     * @param identifier a PubMed ID, or several IDs separated by commas
     */
    @Override
    public URL getURLForID(String identifier) throws URISyntaxException, MalformedURLException, FetcherException {
        URIBuilder uriBuilder = new URIBuilder(ID_URL);
        uriBuilder.addParameter("db", "pubmed");
        uriBuilder.addParameter("retmode", "xml");
        uriBuilder.addParameter("id", identifier);
        return uriBuilder.build().toURL();
    }

    @Override
    public Parser getParser() {
        return new MedlineImporter();
    }

    /**
     * Clears the "journal-abbreviation", "status" and "copyright" fields and normalizes the month field.
     */
    @Override
    public void doPostCleanup(BibEntry entry) {
        new FieldFormatterCleanup("journal-abbreviation", new ClearFormatter()).cleanup(entry);
        new FieldFormatterCleanup("status", new ClearFormatter()).cleanup(entry);
        new FieldFormatterCleanup("copyright", new ClearFormatter()).cleanup(entry);
        new FieldFormatterCleanup(FieldName.MONTH, new NormalizeMonthFormatter()).cleanup(entry);
    }

    /**
     * Searches PubMed for the query and downloads the matching entries.
     *
     * @param query free-text query; commas are interpreted as " AND "
     * @return the fetched entries; an empty list if the query is null or blank or nothing matched
     * @throws FetcherException if searching or fetching fails
     */
    @Override
    public List<BibEntry> performSearch(String query) throws FetcherException {
        if ((query == null) || query.trim().isEmpty()) {
            return Collections.emptyList();
        }

        // searching for pubmed ids matching the query (commas are replaced while the search URL is built)
        List<String> idList = getPubMedIdsFromQuery(query);
        if (idList.isEmpty()) {
            LOGGER.info("No results found.");
            return Collections.emptyList();
        }
        if (numberOfResultsFound > NUMBER_TO_FETCH) {
            LOGGER.info(numberOfResultsFound + " results found. Only " + NUMBER_TO_FETCH
                    + " relevant results will be fetched by default.");
        }

        // pass the list of ids to fetchMedline to download them, like an id fetcher for multiple ids
        return fetchMedline(idList);
    }

    /**
     * Builds the esearch URL for the given term: PubMed database, sorted by relevance,
     * limited to {@link #NUMBER_TO_FETCH} results.
     */
    private URL createSearchUrl(String term) throws URISyntaxException, MalformedURLException {
        URIBuilder uriBuilder = new URIBuilder(SEARCH_URL);
        uriBuilder.addParameter("db", "pubmed");
        uriBuilder.addParameter("sort", "relevance");
        uriBuilder.addParameter("retmax", String.valueOf(NUMBER_TO_FETCH));
        uriBuilder.addParameter("term", replaceCommaWithAND(term));
        return uriBuilder.build().toURL();
    }

    /**
     * Fetch and parse medline items from eutils.ncbi.nlm.nih.gov.
     * The E-utilities generate one XML document containing all entries for the ids.
     *
     * @param ids A list of IDs to fetch.
     * @return the parsed entries, after {@link #doPostCleanup(BibEntry)} has been applied to each
     * @throws FetcherException if the fetch URL cannot be built or the download fails
     */
    private List<BibEntry> fetchMedline(List<String> ids) throws FetcherException {
        try {
            // Separate the IDs with a comma to fetch multiple entries with one request
            URL fetchURL = getURLForID(String.join(",", ids));
            URLConnection data = fetchURL.openConnection();

            ParserResult result;
            // Close the response stream once the importer is done with it.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(data.getInputStream(), StandardCharsets.UTF_8))) {
                result = new MedlineImporter().importDatabase(reader);
            }
            if (result.hasWarnings()) {
                LOGGER.warn(result.getErrorMessage());
            }

            List<BibEntry> resultList = result.getDatabase().getEntries();
            resultList.forEach(this::doPostCleanup);
            return resultList;
        } catch (URISyntaxException | MalformedURLException e) {
            throw new FetcherException("Error while generating fetch URL",
                    Localization.lang("Error while generating fetch URL"), e);
        } catch (IOException e) {
            throw new FetcherException("Error while fetching from Medline",
                    Localization.lang("Error while fetching from %0", "Medline"), e);
        }
    }
}