package uk.ac.ebi.ep.centralservice.helper; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Comparator; import java.util.LinkedHashSet; import java.util.List; import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Utility class for the Enzyme Portal. Its main use is text processing. * $Author$ */ public class EPUtil { private EPUtil() { } private static final String REGEXP_HOW = "(?: with low affinity| competitively)?"; private static final String REGEXP_PROHIB = "(?!(?:auto|de)?phosphorylation|(?:de)?acetylation|substrate|the " + "|[\\w\\s-]+?(?:binding|cleavage|phosphorylation))"; private static final String REGEXP_REL_CONC = "(?:(?:.+?) (?:concentration|level)s? of |\\d+% )?"; private static final String REGEXP_COMP_GRP = "(?:thiol-specific compounds |reductants such as |.*?antibiotics " + "|.*?drugs? |.*?ions, such as " + "|various compounds, including )?"; // "(?:.*?(?:compound|reductant|antibiotic|drug|ion)s?" + // "(?: |,? such as |,? including ))?"; private static final String REGEXP_EXPLANATION = "(?:,? an? .+?|,? whose .+?|,? which .+?|,? irrespective .+?" + "| as .+?| to .+?| with .+?| at .+?| concentrations .+?" + "|, this [^\\.]+| during .+?|(?: while)? in .+?| binding)?"; private static final String INH_REGEXP_OTHERS_LESS = ",? (?:and(?! activated by)|as well as)" + "(?:, to a (?:lower|lesser) extent, by| slightly inhibited by| by)?"; private static final String ACT_REGEXP_OTHERS_LESS = ",? (?:and(?! in(?:hibited|activated) by)|as well as)" + "(?:, to a (?:lower|lesser) extent, by| slightly activated by| by)?"; private static final String INH_REGEXP_DONT_CONTINUE = "(?! (?:and )?activated by| but not | the )"; private static final String ACT_REGEXP_DONT_CONTINUE = "(?! (?:and )?in(?:hibited|activated) by| but not | the )"; private static final String INH_REGEXP_END = "(?:\\.?$|[\\.;:] |,? but not " + "|,?(?: and)?(?: \\w+ly)? activated by)"; private static final String ACT_REGEXP_END = "(?:\\.?$|[\\.;:] |,? but not " + "|,?(?: and)?(?: \\w+ly)? in(?:hibited|activated) by)"; /** * Regular expression for inhibitor compounds. This regexp has got 2 * capturing groups. */ private static final String INHIBITOR_REGEXP = "(.+?) is a(?:.*?)? inhibitor" + "|(?<![Nn]ot )\\b[Ii]n(?:hibited|activated)" + REGEXP_HOW + " by " + REGEXP_PROHIB + REGEXP_REL_CONC + REGEXP_COMP_GRP + "(.+?)" + REGEXP_EXPLANATION + "(?:" + "(?: or|"+INH_REGEXP_OTHERS_LESS + "|,"+INH_REGEXP_DONT_CONTINUE+") " + "(.+?)" + REGEXP_EXPLANATION + ")?" + INH_REGEXP_END; /** * Regular expression for activator compounds. This regexp has got 2 * capturing groups. */ private static final String ACTIVATOR_REGEXP = "(?<![Nn]ot )\\b[Aa]ctivated" + REGEXP_HOW + " by " + REGEXP_PROHIB + REGEXP_REL_CONC + REGEXP_COMP_GRP + "(.+?)" + REGEXP_EXPLANATION + "(?:" + "(?: or|"+ACT_REGEXP_OTHERS_LESS + "|,"+ACT_REGEXP_DONT_CONTINUE+") " + "(.+?)" + REGEXP_EXPLANATION + ")?" + ACT_REGEXP_END; private static final Pattern INHIBITOR_PATTERN = Pattern.compile(INHIBITOR_REGEXP); private static final Pattern ACTIVATOR_PATTERN = Pattern.compile(ACTIVATOR_REGEXP); /** * Extracts the prefix from a UniProt ID (strips the species suffix). * @param id a UniProt ID. * @return an ID without the species suffix. */ public static String getIdPrefix(String id) { return id.split("_")[0]; } /** * Extracts the prefixes from UniProt IDs (strips the species suffixes). * @param ids a collection of UniProt IDs. * @return a list of distinct UniProt IDs without the species suffix. */ public static List<String> getIdPrefixes(Collection<String> ids) { Set<String> prefixes = new LinkedHashSet<>(); for (String id : ids) { prefixes.add(getIdPrefix(id)); } return new ArrayList<>(prefixes); } /** * Converts the UniProt IDs to prefixes with a wildcard (<code>_*</code>) * at the end, which should match orthologs. * @param ids a collection of UniProt IDs, either complete (ex. ALR1_YEAST) * or just prefixes without wildcard (ex. ALR1). * @return a list of distinct UniProt IDs with the species suffix replaced * with a wildcard. */ public static List<String> getWildcardIds(Collection<String> ids){ Set<String> wIds = new LinkedHashSet<>(); for (String id : ids) { String idPrefix = id.indexOf('_') > -1? getIdPrefix(id) : id; wIds.add(idPrefix+"_*"); } return new ArrayList<>(wIds); } /** * Parses a text to get a list of molecules acting as inhibitors. * @param text an enzyme regulation comment from UniProt. * @return A list of molecules with just a name (no ID). It can be empty, * but never <code>null</code>. */ public static Set<String> parseTextForInhibitors(String text){ return getMolecules(INHIBITOR_PATTERN.matcher(text)); } /** * Parses a text to get a list of molecules acting as activators. * @param text an enzyme regulation comment from UniProt. * @return A list of molecules with just a name (no ID). It can be empty, * but never <code>null</code>. */ public static Set<String> parseTextForActivators(String text){ return getMolecules(ACTIVATOR_PATTERN.matcher(text)); } /** * Extracts and cleans names from a matcher, building a molecule for each * of them.<br/> * <i>Cleaning</i> includes removing conjunctions from the text and also * chemical concentrations.<br/> * Please note that any synonyms in parentheses - ex. <code>"AIM-100 * (4-amino-5,6-biaryl-furo[2,3-d]pyrimidine)"</code> will be included in * the molecule name. * @param m a matcher for activators/inhibitors patterns. * @return a list of molecules with just a name. The list can be empty, but * never <code>null</code>. */ private static Set<String> getMolecules(Matcher m) { Set<String> names = new TreeSet<>(new Comparator<String>() { @Override public int compare(String s1, String s2) { return s1.compareToIgnoreCase(s2); } }); //List<Molecule> molecules = new ArrayList<Molecule>(); while (m.find()){ for (int i = 1; i <= m.groupCount(); i++){ if (m.group(i) == null) continue; String[] namesArray = m.group(i) // remove concentrations (ex: 1 mM, 0.5 nM) .replaceAll("\\d+(\\.\\d+)? .M ", "") // remove stop words .replaceAll(",? and|,? or", ",") .split(", "); names.addAll(Arrays.asList(namesArray)); } } return names; } }