/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.twentyn.chemicalClassifier;
import uk.ac.cam.ch.wwmm.oscar.Oscar;
import uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.ChemicalStructure;
import uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.FormatType;
import uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.ResolvedNamedEntity;
import uk.ac.cam.ch.wwmm.oscar.document.NamedEntity;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* A quick and dirty OSCAR classification driver to run over Chris's TSV of chemical names and InChIs.
*/
public class Runner {
public static void main(String[] args) throws Exception {
BufferedReader reader = new BufferedReader(new FileReader(args[0]));
BufferedWriter writer = new BufferedWriter(new FileWriter(args[1]));
try {
Oscar oscar = new Oscar();
String line = null;
/* NOTE: this is exactly the wrong way to write a TSV reader. Caveat emptor.
* See http://tburette.github.io/blog/2014/05/25/so-you-want-to-write-your-own-CSV-code/
* and then use org.apache.commons.csv.CSVParser instead.
*/
while ((line = reader.readLine()) != null) {
// TSV means split on tabs! Nothing else will do.
List<String> fields = Arrays.asList(line.split("\t"));
// Choke if our invariants aren't satisfied. We expect ever line to have a name and an InChI.
if (fields.size() != 2) {
throw new RuntimeException(
String.format("Found malformed line (all lines must have two fields: %s", line));
}
String name = fields.get(1);
List<ResolvedNamedEntity> entities = oscar.findAndResolveNamedEntities(name);
System.out.println("**********");
System.out.println("Name: " + name);
List<String> outputFields = new ArrayList<>(fields.size() + 1);
outputFields.addAll(fields);
if (entities.size() == 0) {
System.out.println("No match");
outputFields.add("noMatch");
} else if (entities.size() == 1) {
ResolvedNamedEntity entity = entities.get(0);
NamedEntity ne = entity.getNamedEntity();
if (ne.getStart() != 0 || ne.getEnd() != name.length()) {
System.out.println("Partial match");
printEntity(entity);
outputFields.add("partialMatch");
} else {
System.out.println("Exact match");
printEntity(entity);
outputFields.add("exactMatch");
List<ChemicalStructure> structures = entity.getChemicalStructures(FormatType.STD_INCHI);
for (ChemicalStructure s : structures) {
outputFields.add(s.getValue());
}
}
} else { // Multiple matches found!
System.out.println("Multiple matches");
for (ResolvedNamedEntity e : entities) {
printEntity(e);
}
outputFields.add("multipleMatches");
}
writer.write(String.join("\t", outputFields));
writer.newLine();
}
} finally {
writer.flush();
writer.close();
}
}
public static void printEntity(ResolvedNamedEntity e) {
System.out.println(" " + e.getNamedEntity() + " @ " + e.getNamedEntity().getConfidence());
System.out.println(" " + e.getChemicalStructures(FormatType.STD_INCHI));
}
}