/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package act.installer.wikipedia;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
 * This class parses Wikipedia data dumps to extract important chemicals. When called from the command line, it exports
 * an important chemicals wikipedia file to be used by the Installer database.
 *
 * <p>The dump is read line by line: {@code <title>} lines set the current article, and any following line that
 * mentions an InChI is matched, canonicalized and validated through Chemaxon before being recorded. The output is
 * written as JSON by default, or as TSV when {@code -t} is given.
 *
 * <p>Usage:
 * <pre>
 *   sbt 'runMain act.installer.wikipedia.ImportantChemicalsWikipedia
 *     -i data/enwiki-20160501-pages-articles.xml
 *     -o MNT_SHARED_DATA/Thomas/imp_chemicals_wikipedia.txt
 *     -t'
 * </pre>
 */
public class ImportantChemicalsWikipedia {

  private static final Logger LOGGER = LogManager.getFormatterLogger(ImportantChemicalsWikipedia.class);

  /** Tab-separated output format: one record per line, '#' introduces a comment line. */
  public static final CSVFormat TSV_FORMAT = CSVFormat.newFormat('\t').
      withRecordSeparator('\n').withIgnoreEmptyLines(true).withCommentMarker('#');

  public static final String OPTION_WIKIPEDIA_DUMP_FULL_PATH = "i";
  public static final String OPTION_OUTPUT_PATH = "o";
  public static final String OPTION_TSV_OUTPUT = "t";

  public static final String HELP_MESSAGE = StringUtils.join(new String[]{
      "This class parses Wikipedia data dumps to extract important chemicals."
  }, "");

  /** Builders for every command line option understood by {@link #main(String[])}. */
  public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>(Arrays.asList(
      Option.builder(OPTION_WIKIPEDIA_DUMP_FULL_PATH)
          .argName("WIKIPEDIA_DUMP_PATH")
          .desc("The full path to the Wikipedia XML dump to parse. It should be located on the NAS " +
              "(data/enwiki-20160501-pages-articles.xml) but can also be obtained from " +
              "https://dumps.wikimedia.org/enwiki/")
          .hasArg().required()
          .longOpt("wikipedia_dump_path")
          .type(String.class),
      Option.builder(OPTION_OUTPUT_PATH)
          .argName("OUTPUT_PATH")
          .desc("The full path to write the output data.")
          .hasArg().required()
          .longOpt("output_path")
          .type(String.class),
      Option.builder(OPTION_TSV_OUTPUT)
          .argName("TSV_OUTPUT")
          .desc("Whether the output should be written in TSV format.")
          .longOpt("tsv")
          .type(boolean.class),
      Option.builder("h")
          .argName("help")
          .desc("Prints this help message")
          .longOpt("help")
  ));

  public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

  static {
    HELP_FORMATTER.setWidth(100);
  }

  private static final String DATABASE_TYPE = "WIKIPEDIA";

  // Some Wikipedia pages contain InChI strings but are not about a specific chemical.
  // A good heuristic to exclude them is to list words that appear in the titles.
  // A title is considered "valid" if it does not include any of these strings.
  private static final String[] EXCLUDE_TITLES_WITH_WORDS_LIST =
      new String[] {"Identifier", "Wikipedia", "InChI", "Template", "testcase"};
  private static final Set<String> EXCLUDE_TITLES_WITH_WORDS = new HashSet<>(
      Arrays.asList(EXCLUDE_TITLES_WITH_WORDS_LIST));

  // These patterns identify Wikipedia titles and InChIs.
  private static final Pattern TITLE_PATTERN = Pattern.compile(".*<title>([^<>]+)</title>.*");
  private static final Pattern INCHI_PATTERN =
      Pattern.compile(".*(?i)(InChI[0-9]?\\p{Space}*=\\p{Space}*1S?/[\\p{Space}0-9a-z+\\-\\(\\)/.,\\?;\\*]+).*");

  // Compiled once: canonicalization runs for every candidate line of a multi-gigabyte dump.
  // NOTE: these must stay declared before EXCLUDE_INCHIS, whose initializer uses them.
  private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
  private static final Pattern INCHI_PREFIX_PATTERN = Pattern.compile("InChI[0-9]?");

  // Some InChIs cause fatal Java errors when trying to validate them through Chemaxon's library. Ignore them.
  // There does not seem to exist a more elegant way to do this.
  private static final String[] EXCLUDE_INCHIS_LIST =
      new String[] {"InChI = 1/C12H10AsCl/c14/h1-10H"};
  // Stored in canonical form so that whitespace variants of a blacklisted InChI are excluded as well.
  private static final Set<String> EXCLUDE_INCHIS = buildExcludedInchis();

  private static final ObjectMapper MAPPER = new ObjectMapper();

  // Chemicals accumulated so far. This collection is static, hence shared by every instance of this class.
  // Insertion-ordered so that the output follows the order of the dump.
  private static final Set<ImportantChemical> importantChemicalsWikipedia = new LinkedHashSet<>();

  // Title of the article currently being read, and whether it passed the title exclusion heuristic.
  private String lastTitle;
  private boolean isLastTitleValid;

  public ImportantChemicalsWikipedia() {}

  /**
   * One extracted chemical: the source database type, the article URL, the canonical InChI and some metadata.
   * Two instances are equal when all four components are equal, which lets the result set discard duplicates.
   */
  public class ImportantChemical implements Serializable {

    @JsonProperty("type")
    private final String type;

    @JsonProperty("dbid")
    private final String dbid;

    @JsonProperty("inchi")
    private final String inchi;

    @JsonProperty("metadata")
    private final WikipediaMetadata metadata;

    public ImportantChemical(String type, String dbid, String inchi, WikipediaMetadata metadata) {
      this.type = type;
      this.dbid = dbid;
      this.inchi = inchi;
      this.metadata = metadata;
    }

    public String getType() {
      return type;
    }

    public String getDbid() {
      return dbid;
    }

    public String getInchi() {
      return inchi;
    }

    public WikipediaMetadata getMetadata() {
      return metadata;
    }

    @Override
    public boolean equals(Object other) {
      if (this == other) {
        return true;
      }
      if (!(other instanceof ImportantChemical)) {
        return false;
      }
      ImportantChemical that = (ImportantChemical) other;
      return Objects.equals(type, that.type)
          && Objects.equals(dbid, that.dbid)
          && Objects.equals(inchi, that.inchi)
          && Objects.equals(metadata, that.metadata);
    }

    @Override
    public int hashCode() {
      return Objects.hash(type, dbid, inchi, metadata);
    }
  }

  /**
   * Metadata attached to a chemical found on Wikipedia: the article title and whether the InChI is a standard one.
   */
  public class WikipediaMetadata {

    @JsonProperty("article")
    private final String article;

    @JsonProperty("std_inchi")
    private final boolean stdInChI;

    public WikipediaMetadata(String article, boolean stdInChI) {
      this.article = article;
      this.stdInChI = stdInChI;
    }

    @Override
    public boolean equals(Object other) {
      if (this == other) {
        return true;
      }
      if (!(other instanceof WikipediaMetadata)) {
        return false;
      }
      WikipediaMetadata that = (WikipediaMetadata) other;
      return stdInChI == that.stdInChI && Objects.equals(article, that.article);
    }

    @Override
    public int hashCode() {
      return Objects.hash(article, stdInChI);
    }
  }

  /** Builds the set of blacklisted InChIs, each one in canonical form. */
  private static Set<String> buildExcludedInchis() {
    Set<String> excluded = new HashSet<>();
    for (String rawInchi : EXCLUDE_INCHIS_LIST) {
      excluded.add(canonicalizeInchi(rawInchi));
    }
    return excluded;
  }

  /** Strips all whitespace and rewrites any "InChI1"/"InChI2"-style prefix to plain "InChI". */
  private static String canonicalizeInchi(String inchi) {
    String withoutWhitespace = WHITESPACE_PATTERN.matcher(inchi).replaceAll("");
    return INCHI_PREFIX_PATTERN.matcher(withoutWhitespace).replaceAll("InChI");
  }

  /** Returns true when the title contains none of the excluded words. */
  private static boolean isValidTitle(String title) {
    for (String excludedWord : EXCLUDE_TITLES_WITH_WORDS) {
      if (title.contains(excludedWord)) {
        return false;
      }
    }
    return true;
  }

  /**
   * This function extracts an InChI string from a candidate line.
   * @param line a String from the raw XML data source file
   * @return a String representing the molecule's InChI as it appears in the line, or null if the line does not
   *         contain anything that looks like an InChI
   */
  public String extractInchiFromLine(String line) {
    Matcher inchiMatcher = INCHI_PATTERN.matcher(line);
    if (inchiMatcher.matches()) {
      return inchiMatcher.group(1);
    }
    return null;
  }

  /**
   * This function formats a matched InChI to make it canonical: all whitespace is removed, and a prefix such as
   * "InChI1" or "InChI2" is rewritten to "InChI" to allow Chemaxon validation.
   * @param inchi a String representing the molecule's InChI
   * @return a formatted string representing the corresponding canonical InChI
   */
  public String formatInchiString(String inchi) {
    return canonicalizeInchi(inchi);
  }

  /**
   * This function tries to import a molecule in Chemaxon and returns a boolean indicating whether or not it succeeded.
   * @param inchi a string representing the molecule's canonical InChI
   * @return true if Chemaxon imported the molecule, false if it raised a MolFormatException
   */
  public boolean isChemaxonValidInchi(String inchi) {
    try {
      MolImporter.importMol(inchi);
    } catch (MolFormatException e) {
      return false;
    }
    return true;
  }

  /**
   * This function processes a line found to contain a candidate InChI and adds potential candidate molecules to the
   * important chemicals set. Lines without a recognizable InChI, blacklisted InChIs and InChIs rejected by Chemaxon
   * are skipped. The chemical is attributed to the last title seen by {@link #processLine(String)}, so a title must
   * have been processed beforehand.
   * @param line a String from the raw XML data source file
   * @throws IOException never thrown by the current implementation; kept in the signature for existing callers
   */
  public void processInchiLine(String line) throws IOException {
    String inchi = extractInchiFromLine(line);
    if (inchi == null) {
      return;
    }

    String formattedInchi = formatInchiString(inchi);
    // Must happen before validation: blacklisted InChIs crash the Chemaxon library.
    if (EXCLUDE_INCHIS.contains(formattedInchi)) {
      return;
    }
    LOGGER.trace("%s", formattedInchi);

    if (!isChemaxonValidInchi(formattedInchi)) {
      LOGGER.info("~~~~~~~~~~~~~~~~~~~~~~~~~");
      LOGGER.info("Chemaxon validation failed");
      LOGGER.info("Last title : %s", lastTitle);
      LOGGER.info("Extracted line : %s", line);
      LOGGER.info("Matched InChI : %s", inchi);
      LOGGER.info("Formatted InChI : %s", formattedInchi);
      return;
    }

    boolean isStandardInchi = formattedInchi.startsWith("InChI=1S");
    String wikipediaURL = "https://en.wikipedia.org/wiki/" + lastTitle.replace(" ", "_");
    WikipediaMetadata metadata = new WikipediaMetadata(lastTitle, isStandardInchi);
    importantChemicalsWikipedia.add(
        new ImportantChemical(DATABASE_TYPE, wikipediaURL, formattedInchi, metadata));
  }

  /**
   * This function processes a line from the data source to find titles or InChIs. A title line updates the current
   * article; any other line is searched for an InChI only if the current article's title is valid.
   * @param line a String from the raw XML data source file
   * @throws IOException declared by {@link #processInchiLine(String)}
   */
  public void processLine(String line) throws IOException {
    Matcher titleMatcher = TITLE_PATTERN.matcher(line);
    if (titleMatcher.matches()) {
      lastTitle = titleMatcher.group(1);
      isLastTitleValid = isValidTitle(lastTitle);
      return;
    }
    if (!isLastTitleValid) {
      return;
    }

    // Locale.ROOT keeps the keyword checks independent of the JVM's default locale (e.g. Turkish dotless i).
    String lowerCaseLine = line.toLowerCase(Locale.ROOT);
    boolean mentionsInchi = lowerCaseLine.contains("inchi");
    boolean isKeyOrReference = lowerCaseLine.contains("inchikey") || lowerCaseLine.contains("inchi_ref");
    if (mentionsInchi && !isKeyOrReference) {
      processInchiLine(line);
    }
  }

  /**
   * This function writes the important chemicals set to a TSV file, encoded in UTF-8.
   * @param outputPath a String indicating where the file should be written (including its name)
   * @throws RuntimeException wrapping the IOException if the file cannot be written
   */
  public void writeToTSV(String outputPath) {
    // try-with-resources guarantees the file handle is released even if a record fails to print.
    try (BufferedWriter writer = new BufferedWriter(
             new OutputStreamWriter(new FileOutputStream(outputPath), StandardCharsets.UTF_8));
         CSVPrinter printer = new CSVPrinter(writer, TSV_FORMAT)) {
      printer.printComment("This file has been generated by the ImportantChemicalsWikipedia.java script.");
      printer.printComment("Format: WIKIPEDIA<tab><wikipedia url><tab><inchi><tab><metadata>");
      for (ImportantChemical importantChemical : importantChemicalsWikipedia) {
        printer.printRecord(
            importantChemical.getType(),
            importantChemical.getDbid(),
            importantChemical.getInchi(),
            MAPPER.writeValueAsString(importantChemical.getMetadata()));
      }
      printer.flush();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * This function writes the important chemicals set to a JSON file.
   * @param outputPath a String indicating where the file should be written (including its name)
   * @throws IOException if the file cannot be written
   */
  public void writeToJSON(String outputPath) throws IOException {
    File file = new File(outputPath);
    MAPPER.writeValue(file, importantChemicalsWikipedia);
  }

  /**
   * Command line entry point: parses the options, reads the dump line by line and writes the extracted chemicals.
   * A failure while reading the dump is logged and the chemicals parsed up to that point are still written.
   * @param args the command line arguments, see {@link #OPTION_BUILDERS}
   * @throws IOException if the JSON output cannot be written
   */
  public static void main(final String[] args) throws IOException {
    Options opts = new Options();
    for (Option.Builder b : OPTION_BUILDERS) {
      opts.addOption(b.build());
    }

    CommandLine cl = null;
    try {
      CommandLineParser parser = new DefaultParser();
      cl = parser.parse(opts, args);
    } catch (ParseException e) {
      System.err.format("Argument parsing failed: %s\n", e.getMessage());
      HELP_FORMATTER.printHelp(ImportantChemicalsWikipedia.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      System.exit(1);
    }

    if (cl.hasOption("help")) {
      HELP_FORMATTER.printHelp(ImportantChemicalsWikipedia.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      return;
    }

    // Both paths are required options, so the parser guarantees they are present at this point.
    String inputPath = cl.getOptionValue(OPTION_WIKIPEDIA_DUMP_FULL_PATH);
    String outputPath = cl.getOptionValue(OPTION_OUTPUT_PATH);
    boolean outputTSV = cl.hasOption(OPTION_TSV_OUTPUT);

    ImportantChemicalsWikipedia extractor = new ImportantChemicalsWikipedia();

    // The dump is decoded explicitly as UTF-8 instead of relying on the platform default charset.
    try (BufferedReader br = new BufferedReader(
             new InputStreamReader(new FileInputStream(inputPath), StandardCharsets.UTF_8))) {
      String line;
      while ((line = br.readLine()) != null) {
        extractor.processLine(line);
      }
    } catch (IOException e) {
      // Best effort: keep going and write whatever has been extracted so far.
      LOGGER.error("Failed to read Wikipedia dump %s; the output will only contain chemicals parsed so far",
          inputPath, e);
    }

    if (outputTSV) {
      extractor.writeToTSV(outputPath);
    } else {
      extractor.writeToJSON(outputPath);
    }
  }
}