/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.wikipedia;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.Arrays;
import java.util.Set;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
 * This class parses Wikipedia data dumps to extract important chemicals. When called from the command line, it exports
 * an important chemicals wikipedia file to be used by the Installer database.
 * Usage:
 *    sbt 'runMain act.installer.wikipedia.ImportantChemicalsWikipedia
 *          -i data/enwiki-20160501-pages-articles.xml
 *          -o MNT_SHARED_DATA/Thomas/imp_chemicals_wikipedia.txt
 *          -t'
 *
 * The dump is scanned line by line: a line containing a {@code <title>} element sets the current article, and
 * subsequent lines mentioning "inchi" are searched for an InChI string that is then validated through Chemaxon.
 * Note that the collected chemicals are held in a static set, i.e. they are shared by all instances of this class.
 */
public class ImportantChemicalsWikipedia {

  private static final Logger LOGGER = LogManager.getFormatterLogger(ImportantChemicalsWikipedia.class);

  public static final CSVFormat TSV_FORMAT = CSVFormat.newFormat('\t').
      withRecordSeparator('\n').withIgnoreEmptyLines(true).withCommentMarker('#');

  public static final String OPTION_WIKIPEDIA_DUMP_FULL_PATH = "i";
  public static final String OPTION_OUTPUT_PATH = "o";
  public static final String OPTION_TSV_OUTPUT = "t";

  public static final String HELP_MESSAGE = StringUtils.join(new String[]{
      "This class parses Wikipedia data dumps to extract important chemicals."
  }, "");

  public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
    add(Option.builder(OPTION_WIKIPEDIA_DUMP_FULL_PATH)
        .argName("WIKIPEDIA_DUMP_PATH")
        .desc("The full path to the Wikipedia XML dump to parse. It should be located on the NAS " +
            "(data/enwiki-20160501-pages-articles.xml) but can also be obtained from " +
            "https://dumps.wikimedia.org/enwiki/")
        .hasArg().required()
        .longOpt("wikipedia_dump_path")
        .type(String.class)
    );
    add(Option.builder(OPTION_OUTPUT_PATH)
        .argName("OUTPUT_PATH")
        .desc("The full path to write the output data.")
        .hasArg().required()
        .longOpt("output_path")
        .type(String.class)
    );
    add(Option.builder(OPTION_TSV_OUTPUT)
        .argName("TSV_OUTPUT")
        .desc("Whether the output should be written in TSV format.")
        .longOpt("tsv")
        .type(boolean.class)
    );
    add(Option.builder("h")
        .argName("help")
        .desc("Prints this help message")
        .longOpt("help")
    );
  }};

  public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

  static {
    HELP_FORMATTER.setWidth(100);
  }

  private static final String DATABASE_TYPE = "WIKIPEDIA";

  // Some Wikipedia pages contain InChI strings but are not about a specific chemical.
  // A good heuristic to exclude them is to list words that appear in their titles.
  // A title is considered "valid" if it does not include any of these strings.
  private static final String[] EXCLUDE_TITLES_WITH_WORDS_LIST =
      new String[] {"Identifier", "Wikipedia", "InChI", "Template", "testcase"};
  private static final Set<String> EXCLUDE_TITLES_WITH_WORDS = new HashSet<>(
      Arrays.asList(EXCLUDE_TITLES_WITH_WORDS_LIST));

  // Precompiled patterns used to format a matched InChI. They are declared before the exclusion sets below
  // because the static initializer of those sets relies on them.
  private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
  private static final Pattern INCHI_PREFIX_PATTERN = Pattern.compile("InChI[0-9]?");

  // Some InChIs cause fatal Java errors when trying to validate them through Chemaxon's library. Ignore them.
  // There does not seem to exist a more elegant way to do this.
  private static final String[] EXCLUDE_INCHIS_LIST =
      new String[] {"InChI = 1/C12H10AsCl/c14/h1-10H"};
  private static final Set<String> EXCLUDE_INCHIS = new HashSet<>(
      Arrays.asList(EXCLUDE_INCHIS_LIST));
  // Formatted counterparts of the excluded InChIs, so that a whitespace variant of an excluded InChI
  // (e.g. with a trailing space captured by the regex) is never handed to Chemaxon either.
  private static final Set<String> EXCLUDE_FORMATTED_INCHIS = new HashSet<>();

  static {
    for (String excludedInchi : EXCLUDE_INCHIS_LIST) {
      EXCLUDE_FORMATTED_INCHIS.add(normalizeInchi(excludedInchi));
    }
  }

  // These patterns allow to identify Wikipedia titles and InChIs.
  private static final Pattern TITLE_PATTERN = Pattern.compile(".*<title>([^<>]+)</title>.*");
  private static final Pattern INCHI_PATTERN =
      Pattern.compile(".*(?i)(InChI[0-9]?\\p{Space}*=\\p{Space}*1S?/[\\p{Space}0-9a-z+\\-\\(\\)/.,\\?;\\*]+).*");

  private static final ObjectMapper MAPPER = new ObjectMapper();

  // Title of the most recently seen article, and whether it passed the title exclusion heuristic.
  private String lastTitle;
  private boolean isLastTitleValid;

  // Insertion-ordered so that the output follows the order of the dump and is reproducible between runs.
  private static final Set<ImportantChemical> importantChemicalsWikipedia = new LinkedHashSet<>();

  public ImportantChemicalsWikipedia() {}

  /**
   * One extracted chemical: database type, database id (the Wikipedia URL), InChI and metadata.
   * Two instances are equal when all four components are equal, which lets the result set drop duplicates.
   * NOTE(review): kept as a (non-static) inner class so that any existing {@code outer.new ImportantChemical(...)}
   * call sites keep compiling; consider making it static if there are none.
   */
  public class ImportantChemical implements Serializable {

    @JsonProperty("type")
    private String type;

    @JsonProperty("dbid")
    private String dbid;

    @JsonProperty("inchi")
    private String inchi;

    @JsonProperty("metadata")
    private WikipediaMetadata metadata;

    public ImportantChemical(String type, String dbid, String inchi, WikipediaMetadata metadata) {
      this.type = type;
      this.dbid = dbid;
      this.inchi = inchi;
      this.metadata = metadata;
    }

    public String getType() {
      return type;
    }

    public String getDbid() {
      return dbid;
    }

    public String getInchi() {
      return inchi;
    }

    public WikipediaMetadata getMetadata() {
      return metadata;
    }

    @Override
    public boolean equals(Object other) {
      if (this == other) {
        return true;
      }
      if (!(other instanceof ImportantChemical)) {
        return false;
      }
      ImportantChemical that = (ImportantChemical) other;
      return Objects.equals(type, that.type)
          && Objects.equals(dbid, that.dbid)
          && Objects.equals(inchi, that.inchi)
          && Objects.equals(metadata, that.metadata);
    }

    @Override
    public int hashCode() {
      return Objects.hash(type, dbid, inchi, metadata);
    }
  }

  /**
   * Metadata attached to an extracted chemical: the article title and whether the InChI is a standard InChI.
   * Two instances are equal when both fields are equal.
   */
  public class WikipediaMetadata {

    @JsonProperty("article")
    private String article;

    @JsonProperty("std_inchi")
    private boolean stdInChI;

    public WikipediaMetadata(String article, boolean stdInChI) {
      this.article = article;
      this.stdInChI = stdInChI;
    }

    @Override
    public boolean equals(Object other) {
      if (this == other) {
        return true;
      }
      if (!(other instanceof WikipediaMetadata)) {
        return false;
      }
      WikipediaMetadata that = (WikipediaMetadata) other;
      return stdInChI == that.stdInChI && Objects.equals(article, that.article);
    }

    @Override
    public int hashCode() {
      return Objects.hash(article, stdInChI);
    }
  }

  /**
   * Removes all whitespace from an InChI and rewrites any "InChI<digit>" occurrence to "InChI".
   * @param inchi a String representing the molecule's InChI, as matched in the dump
   * @return the formatted InChI
   */
  private static String normalizeInchi(String inchi) {
    String withoutWhitespace = WHITESPACE_PATTERN.matcher(inchi).replaceAll("");
    return INCHI_PREFIX_PATTERN.matcher(withoutWhitespace).replaceAll("InChI");
  }

  /**
   * Tells whether an article title passes the exclusion heuristic.
   * @param title the article title
   * @return false if the title contains any of the excluded words, true otherwise
   */
  private static boolean isValidTitle(String title) {
    for (String excludedWord : EXCLUDE_TITLES_WITH_WORDS) {
      if (title.contains(excludedWord)) {
        return false;
      }
    }
    return true;
  }

  /**
   * This function extracts an InChI string from a candidate line.
   * @param line a String from the raw XML data source file
   * @return a String representing the molecule's InChI, or null if the line does not match the InChI pattern
   */
  public String extractInchiFromLine(String line) {
    Matcher inchiMatcher = INCHI_PATTERN.matcher(line);
    if (inchiMatcher.matches()) {
      return inchiMatcher.group(1);
    }
    return null;
  }

  /**
   * This function formats a matched InChI to make it canonical: all whitespace is removed and, because some InChIs
   * start with "InChI1" or "InChI2", the digit suffix is dropped to allow Chemaxon validation.
   * @param inchi a String representing the molecule's InChI
   * @return a formatted string representing the corresponding canonical InChI
   */
  public String formatInchiString(String inchi) {
    return normalizeInchi(inchi);
  }

  /**
   * This function tries to import a molecule in Chemaxon and returns a boolean indicating whether or not it succeeded.
   * @param inchi a string representing the molecule's canonical InChI
   * @return a boolean indicating success or failure to import the molecule in Chemaxon
   */
  public boolean isChemaxonValidInchi(String inchi) {
    try {
      MolImporter.importMol(inchi);
    } catch (MolFormatException e) {
      return false;
    }
    return true;
  }

  /**
   * This function processes a line found to contain a candidate InChI and adds potential candidate molecules to the
   * important chemicals set. Lines without an InChI match and excluded InChIs are ignored; InChIs rejected by
   * Chemaxon are logged and ignored.
   * @param line a String from the raw XML data source file
   * @throws IOException kept in the signature for compatibility with existing callers
   */
  public void processInchiLine(String line) throws IOException {
    // Extract a potential InChI from the line. Nothing to do if there is none.
    String inchi = extractInchiFromLine(line);
    if (inchi == null) {
      return;
    }

    String formattedInchi = formatInchiString(inchi);

    // The exclusion must happen before Chemaxon sees the InChI. Check both the raw and the formatted forms.
    if (EXCLUDE_INCHIS.contains(inchi) || EXCLUDE_FORMATTED_INCHIS.contains(formattedInchi)) {
      return;
    }
    LOGGER.trace(formattedInchi);

    // InChI validation through Chemaxon library
    if (!isChemaxonValidInchi(formattedInchi)) {
      LOGGER.info("~~~~~~~~~~~~~~~~~~~~~~~~~");
      LOGGER.info("Chemaxon validation failed");
      LOGGER.info("Last title      : %s", lastTitle);
      LOGGER.info("Extracted line  : %s", line);
      LOGGER.info("Matched InChI   : %s", inchi);
      LOGGER.info("Formatted InChI : %s", formattedInchi);
      return;
    }

    // Only reachable when this method is called directly, before any title line went through processLine.
    if (lastTitle == null) {
      LOGGER.info("Ignoring InChI found before any article title: %s", formattedInchi);
      return;
    }

    boolean isStandardInchi = formattedInchi.startsWith("InChI=1S");
    String wikipediaURL = "https://en.wikipedia.org/wiki/" + lastTitle.replace(" ", "_");
    WikipediaMetadata metadata = new WikipediaMetadata(lastTitle, isStandardInchi);
    importantChemicalsWikipedia.add(
        new ImportantChemical(DATABASE_TYPE, wikipediaURL, formattedInchi, metadata));
  }

  /**
   * This function processes a line from the data source to find titles or InChIs. A title line updates the current
   * article; any other line is searched for an InChI only if the current article title is valid and the line
   * mentions "inchi" but neither "inchikey" nor "inchi_ref" (case-insensitively).
   * @param line a String from the raw XML data source file
   * @throws IOException propagated from {@link #processInchiLine(String)}
   */
  public void processLine(String line) throws IOException {
    Matcher titleMatcher = TITLE_PATTERN.matcher(line);
    if (titleMatcher.matches()) {
      lastTitle = titleMatcher.group(1);
      isLastTitleValid = isValidTitle(lastTitle);
      return;
    }

    if (!isLastTitleValid) {
      return;
    }

    // Locale.ROOT keeps the keyword test independent of the JVM default locale (e.g. Turkish dotless i).
    String lowerCaseLine = line.toLowerCase(Locale.ROOT);
    if (lowerCaseLine.contains("inchi")
        && !lowerCaseLine.contains("inchikey")
        && !lowerCaseLine.contains("inchi_ref")) {
      processInchiLine(line);
    }
  }

  /**
   * This function writes the important chemicals set to a TSV file, encoded in UTF-8. The file starts with two
   * comment lines, followed by one record per chemical: type, Wikipedia URL, InChI and JSON-serialized metadata.
   * @param outputPath a String indicating where the file should be written (including its name)
   * @throws RuntimeException wrapping the IOException if the file cannot be written
   */
  public void writeToTSV(String outputPath) {
    // try-with-resources closes the printer (and the underlying writer) even when a write fails.
    try (BufferedWriter writer = new BufferedWriter(
             new OutputStreamWriter(new FileOutputStream(outputPath), StandardCharsets.UTF_8));
         CSVPrinter printer = new CSVPrinter(writer, TSV_FORMAT)) {
      printer.printComment("This file has been generated by the ImportantChemicalsWikipedia.java script.");
      printer.printComment("Format: WIKIPEDIA<tab><wikipedia url><tab><inchi><tab><metadata>");
      for (ImportantChemical importantChemical : importantChemicalsWikipedia) {
        List<String> nextLine = new ArrayList<>();
        nextLine.add(importantChemical.getType());
        nextLine.add(importantChemical.getDbid());
        nextLine.add(importantChemical.getInchi());
        nextLine.add(MAPPER.writeValueAsString(importantChemical.getMetadata()));
        printer.printRecord(nextLine);
      }
      printer.flush();
    } catch (IOException e) {
      throw new RuntimeException("Failed to write TSV output to " + outputPath, e);
    }
  }

  /**
   * This function writes the important chemicals set to a JSON file.
   * @param outputPath a String indicating where the file should be written (including its name)
   * @throws IOException if the file cannot be written
   */
  public void writeToJSON(String outputPath) throws IOException {
    File file = new File(outputPath);
    MAPPER.writeValue(file, importantChemicalsWikipedia);
  }

  /**
   * Command line entry point: parses the options, scans the Wikipedia dump line by line and writes the extracted
   * chemicals to the output path, as TSV if the TSV option is set and as JSON otherwise.
   * @param args the command line arguments
   * @throws IOException if the dump cannot be read or the JSON output cannot be written
   */
  public static void main(final String[] args) throws IOException {
    Options opts = new Options();
    for (Option.Builder b : OPTION_BUILDERS) {
      opts.addOption(b.build());
    }

    CommandLine cl = null;
    try {
      CommandLineParser parser = new DefaultParser();
      cl = parser.parse(opts, args);
    } catch (ParseException e) {
      System.err.format("Argument parsing failed: %s\n", e.getMessage());
      HELP_FORMATTER.printHelp(ImportantChemicalsWikipedia.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      System.exit(1);
    }

    if (cl.hasOption("help")) {
      HELP_FORMATTER.printHelp(ImportantChemicalsWikipedia.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      return;
    }

    // Both options are declared as required, so no default value is needed here.
    String inputPath = cl.getOptionValue(OPTION_WIKIPEDIA_DUMP_FULL_PATH);
    String outputPath = cl.getOptionValue(OPTION_OUTPUT_PATH);
    boolean outputTSV = cl.hasOption(OPTION_TSV_OUTPUT);

    ImportantChemicalsWikipedia extractor = new ImportantChemicalsWikipedia();

    // Wikipedia dumps are UTF-8 encoded: do not depend on the platform default charset.
    try (BufferedReader br = new BufferedReader(
        new InputStreamReader(new FileInputStream(inputPath), StandardCharsets.UTF_8))) {
      String line;
      while ((line = br.readLine()) != null) {
        extractor.processLine(line);
      }
    } catch (IOException e) {
      // Do not go on and write a partial (or empty) output file as if the run had succeeded.
      LOGGER.error("Failed to read the Wikipedia dump at %s: %s", inputPath, e.getMessage());
      throw e;
    }

    if (outputTSV) {
      extractor.writeToTSV(outputPath);
    } else {
      extractor.writeToJSON(outputPath);
    }
  }
}