package org.molgenis.ontology.ic;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Looks up how often a term occurs in PubMed by calling the NCBI E-utilities {@code esearch}
 * endpoint and reading the {@code <Count>} element of the response.
 */
public class PubMedTermFrequencyService {
  private static final Logger LOG = LoggerFactory.getLogger(PubMedTermFrequencyService.class);

  // HTTPS on purpose: HttpURLConnection does not follow a redirect that switches protocol,
  // so a plain-http request that is redirected to https would yield no usable response.
  private static final String BASE_URL =
      "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?term=";

  /** Extracts the digits of the first {@code <Count>} element in the response. */
  private static final Pattern PATTERN_REGEX = Pattern.compile("<Count>(\\d*)</Count>");

  /** Connect and read timeout in milliseconds. */
  private static final int TIME_OUT = 20000;

  // Denominator of the frequency ratio; presumably the approximate number of publications
  // indexed by PubMed -- TODO confirm and refresh, the value is hard-coded.
  private static final long TOTAL_NUMBER_PUBLICATION = 24000000;

  /**
   * Queries PubMed for the given term (as an exact, double-quoted phrase) and returns its
   * occurrence count and derived frequency.
   *
   * @param term the search term; it is URL-encoded before being sent
   * @return the term frequency, or {@code null} when the term is blank, the request failed, or
   *     the response reports no (or zero) occurrences
   */
  public PubMedTFEntity getTermFrequency(String term) {
    if (StringUtils.isBlank(term)) {
      // Nothing meaningful to search for; skip the network round trip.
      return null;
    }
    String response = httpGet(BASE_URL + encodeQueryValue("\"" + term + "\""));
    return parseResponse(response);
  }

  /**
   * Parses an {@code esearch} response and converts the reported count into a frequency,
   * computed as {@code |log10(count / TOTAL_NUMBER_PUBLICATION)|}.
   *
   * @param response the raw response text; may be {@code null} or empty
   * @return the parsed entity, or {@code null} when no positive, int-sized count is present
   */
  public PubMedTFEntity parseResponse(String response) {
    if (response == null) {
      return null;
    }
    Matcher matcher = PATTERN_REGEX.matcher(response);
    if (!matcher.find()) {
      return null;
    }
    String countString = matcher.group(1);
    if (StringUtils.isEmpty(countString)) {
      return null;
    }

    int occurrence;
    try {
      occurrence = Integer.parseInt(countString);
    } catch (NumberFormatException e) {
      // The pattern accepts any run of digits, which may exceed the int range.
      LOG.error("Cannot parse PubMed count '{}'", countString, e);
      return null;
    }
    if (occurrence == 0) {
      // log10(0) is -Infinity; a zero count is reported as "no result".
      return null;
    }

    double frequency = Math.abs(Math.log10((double) occurrence / TOTAL_NUMBER_PUBLICATION));
    return new PubMedTFEntity(occurrence, frequency);
  }

  /**
   * Performs an HTTP GET and returns the response body decoded as UTF-8, with every line
   * terminated by {@code '\r'}.
   *
   * @param targetURL the complete, already-encoded URL to fetch
   * @return the response body, or an empty string when the request fails for any reason
   */
  public String httpGet(String targetURL) {
    HttpURLConnection connection = null;
    try {
      URL url = new URL(targetURL);
      connection = (HttpURLConnection) url.openConnection();
      connection.setRequestMethod("GET");
      connection.setUseCaches(false);
      connection.setConnectTimeout(TIME_OUT);
      // Without a read timeout a stalled server would block the caller indefinitely.
      connection.setReadTimeout(TIME_OUT);
      // No setDoOutput/getOutputStream here: opening the output stream makes the JDK's
      // HttpURLConnection send the request as a POST instead of the intended GET.

      StringBuilder response = new StringBuilder();
      try (InputStream is = connection.getInputStream();
          BufferedReader rd = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
        String line;
        while ((line = rd.readLine()) != null) {
          response.append(line);
          response.append('\r');
        }
      }
      return response.toString();
    } catch (IOException | RuntimeException e) {
      // Best-effort contract: callers get an empty body instead of an exception.
      LOG.error("Failed to retrieve '{}'", targetURL, e);
      return StringUtils.EMPTY;
    } finally {
      if (connection != null) {
        connection.disconnect();
      }
    }
  }

  /** URL-encodes a query-parameter value as UTF-8. */
  private static String encodeQueryValue(String value) {
    try {
      return URLEncoder.encode(value, StandardCharsets.UTF_8.name());
    } catch (UnsupportedEncodingException e) {
      // Every Java platform is required to support UTF-8.
      throw new IllegalStateException("UTF-8 encoding is not supported", e);
    }
  }
}