package org.jabref.logic.importer.fetcher;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.jabref.logic.formatter.bibtexfields.ClearFormatter;
import org.jabref.logic.formatter.bibtexfields.NormalizeMonthFormatter;
import org.jabref.logic.help.HelpFile;
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.IdBasedParserFetcher;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.SearchBasedFetcher;
import org.jabref.logic.importer.fileformat.MedlineImporter;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.cleanup.FieldFormatterCleanup;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.FieldName;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.utils.URIBuilder;
/**
* Fetch or search from PubMed <a href="http://www.ncbi.nlm.nih.gov/sites/entrez/">www.ncbi.nlm.nih.gov</a>
* The MedlineFetcher fetches the entries from the PubMed database.
* See <a href="http://help.jabref.org/en/MedlineRIS">help.jabref.org</a> for a detailed documentation of the available fields.
*/
public class MedlineFetcher implements IdBasedParserFetcher, SearchBasedFetcher {
private static final Log LOGGER = LogFactory.getLog(MedlineFetcher.class);
private static final int NUMBER_TO_FETCH = 50;
private static final String ID_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi";
private static final String SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi";
private int numberOfResultsFound;
/**
* Replaces all commas in a given string with " AND "
*
* @param query input to remove commas
* @return input without commas
*/
private static String replaceCommaWithAND(String query) {
return query.replaceAll(", ", " AND ").replaceAll(",", " AND ");
}
/**
* When using 'esearch.fcgi?db=<database>&term=<query>' we will get a list of IDs matching the query.
* Input: Any text query (&term)
* Output: List of UIDs matching the query
*
* @see <a href="https://www.ncbi.nlm.nih.gov/books/NBK25500/">www.ncbi.nlm.nih.gov/books/NBK25500/</a>
*/
private List<String> getPubMedIdsFromQuery(String query) throws FetcherException {
boolean fetchIDs = false;
boolean firstOccurrenceOfCount = false;
List<String> idList = new ArrayList<>();
try {
URL ncbi = createSearchUrl(query);
XMLInputFactory inputFactory = XMLInputFactory.newFactory();
XMLStreamReader streamReader = inputFactory.createXMLStreamReader(ncbi.openStream());
fetchLoop: while (streamReader.hasNext()) {
int event = streamReader.getEventType();
switch (event) {
case XMLStreamConstants.START_ELEMENT:
if (streamReader.getName().toString().equals("Count")) {
firstOccurrenceOfCount = true;
}
if (streamReader.getName().toString().equals("IdList")) {
fetchIDs = true;
}
break;
case XMLStreamConstants.CHARACTERS:
if (firstOccurrenceOfCount) {
numberOfResultsFound = Integer.parseInt(streamReader.getText());
firstOccurrenceOfCount = false;
}
if (fetchIDs) {
idList.add(streamReader.getText());
}
break;
case XMLStreamConstants.END_ELEMENT:
//Everything relevant is listed before the IdList. So we break the loop right after the IdList tag closes.
if (streamReader.getName().toString().equals("IdList")) {
break fetchLoop;
}
}
streamReader.next();
}
streamReader.close();
return idList;
} catch (IOException | URISyntaxException e) {
throw new FetcherException("Unable to get PubMed IDs", Localization.lang("Unable to get PubMed IDs"), e);
} catch (XMLStreamException e) {
throw new FetcherException("Error while parsing ID list", Localization.lang("Error while parsing ID list"),
e);
}
}
@Override
public String getName() {
return "Medline/PubMed";
}
@Override
public HelpFile getHelpPage() {
return HelpFile.FETCHER_MEDLINE;
}
@Override
public URL getURLForID(String identifier) throws URISyntaxException, MalformedURLException, FetcherException {
URIBuilder uriBuilder = new URIBuilder(ID_URL);
uriBuilder.addParameter("db", "pubmed");
uriBuilder.addParameter("retmode", "xml");
uriBuilder.addParameter("id", identifier);
return uriBuilder.build().toURL();
}
@Override
public Parser getParser() {
return new MedlineImporter();
}
@Override
public void doPostCleanup(BibEntry entry) {
new FieldFormatterCleanup("journal-abbreviation", new ClearFormatter()).cleanup(entry);
new FieldFormatterCleanup("status", new ClearFormatter()).cleanup(entry);
new FieldFormatterCleanup("copyright", new ClearFormatter()).cleanup(entry);
new FieldFormatterCleanup(FieldName.MONTH, new NormalizeMonthFormatter()).cleanup(entry);
}
@Override
public List<BibEntry> performSearch(String query) throws FetcherException {
List<BibEntry> entryList = new LinkedList<>();
if (query.isEmpty()) {
return Collections.emptyList();
} else {
String searchTerm = replaceCommaWithAND(query);
//searching for pubmed ids matching the query
List<String> idList = getPubMedIdsFromQuery(searchTerm);
if (idList.isEmpty()) {
LOGGER.info("No results found.");
return Collections.emptyList();
}
if (numberOfResultsFound > NUMBER_TO_FETCH) {
LOGGER.info(
numberOfResultsFound + " results found. Only 50 relevant results will be fetched by default.");
}
//pass the list of ids to fetchMedline to download them. like a id fetcher for mutliple ids
entryList = fetchMedline(idList);
return entryList;
}
}
private URL createSearchUrl(String term) throws URISyntaxException, MalformedURLException {
term = replaceCommaWithAND(term);
URIBuilder uriBuilder = new URIBuilder(SEARCH_URL);
uriBuilder.addParameter("db", "pubmed");
uriBuilder.addParameter("sort", "relevance");
uriBuilder.addParameter("retmax", String.valueOf(NUMBER_TO_FETCH));
uriBuilder.addParameter("term", term);
return uriBuilder.build().toURL();
}
/**
* Fetch and parse an medline item from eutils.ncbi.nlm.nih.gov.
* The E-utilities generate a huge XML file containing all entries for the ids
*
* @param ids A list of IDs to search for.
* @return Will return an empty list on error.
*/
private List<BibEntry> fetchMedline(List<String> ids) throws FetcherException {
try {
//Separate the IDs with a comma to search multiple entries
URL fetchURL = getURLForID(String.join(",", ids));
URLConnection data = fetchURL.openConnection();
ParserResult result = new MedlineImporter().importDatabase(
new BufferedReader(new InputStreamReader(data.getInputStream(), StandardCharsets.UTF_8)));
if (result.hasWarnings()) {
LOGGER.warn(result.getErrorMessage());
}
List<BibEntry> resultList = result.getDatabase().getEntries();
resultList.forEach(this::doPostCleanup);
return resultList;
} catch (URISyntaxException | MalformedURLException e) {
throw new FetcherException("Error while generating fetch URL",
Localization.lang("Error while generating fetch URL"), e);
} catch (IOException e) {
throw new FetcherException("Error while fetching from Medline",
Localization.lang("Error while fetching from %0", "Medline"), e);
}
}
}