/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package act.shared; public class GenePubmedCaseStudy { // We did two case studies on the words in Gene and Pubmed: // 1. ChangWords: Words from the papers/gene comments that we needed to infer genes for the chang paper. // See the post http://pathway-synthesis.tumblr.com/post/19209966057/looking-ahead-path-edges-to-gene-names // 2a. GeneEntries25Words: Words that we pulled out of 25 random gene entries (153 words) // 2b. PubmedEntries25Words: Words pulled out of 25 random pubmed entries (391 words) // // All these were filtered through the ~saurabhs/Downloads/pathway\ synthesis/organisms/hunspell-1.3.2/filter.sh script // That script runs a spell check using en_US.dic and output words that are misspelled (i.e., bio words) // That script is also responsible for checking whether those words are organism names. The remaining are potential chemical names public static String[] ChangWords = new String[] { "0g", "1g", "3R", "AF321779", "ATCC", "BCD", "BDH", "BUT1", "BUT2", "BYDH", "Butanol", "C2", "C4", "Clostridium", "CoA", "Coenzyme", "DC3000", "DSM", "Genebank", "H16", "HBD", "HPLC", "Intracellular", "Ki", "NADH", "NADP", "NADPH", "PHB", "PMID", "PUBMED", "PhaA", "PhaB", "PhaC", "Pseudomonas", "Ralstonia", "Rhizobia", "Streptomyces", "THL", "YP", "acetoacetyl", "acetobutylicum", "acetyacetyl", "acetyl", "acetyltransferase", "acidogenic", "ack", "acyl", "adhE2", "adhe", "adhe1", "aldehyde", "amino", "bcd", "biosynthesis", "biosynthetic", "bp", "butanol", "butanologenic", "butyraldehyde", "butyrate", "butyric", "butyryl", "ccr", "ccrA", "cfa8", "chromatography", "coenzyme", "coli", "collinus", "complementation", "coronafacic", "crotonoyl", "crotonyl", "crt", "dehydratase", "dehydrogenase", "dehydrogenases", "dodecyl", "electrophoretic", "enoyl", "etfA", "etfB", "eutropha", "hbd", "heterologous", "hydroxy", "hydroxybutanoyl", "hydroxybutyryl", "hyperproducing", "isologs", "isopalmitoyl", "kDa", "ketothiolase", "kinase", "lookup", "mM", "mannitol", "microM", "mol", "muM", "operon", "overexpress", "overexpressing", "oxidoreductase", "palmitoyl", "phaA", "phaB", "phaB1", "phaB2", "phaB3", "phosphotransbutyrylase", "polyacrylamide", "polyhydroxyalkanoate", "polyhydroxybutyrate", "pregrown", "ptb", "pv", "recombinant", "reductase", "reductases", "serendipitously", "solventogenic", "spectrophotometric", "str", "subunits", "synthase", "synthases", "syringae", "tac", "thiL", "thioesters", "thiolase", "thl", "titer", "tyrobutyricum", "phaB1", "phaB2", "phaB3", }; public static String[] GeneEntries25Words = new String[] { "AA049537", "APEC", "BW2952", "BX571856", "CoA", "EC4115", "EDL933", "EKO11", "Fpr", "H2", "H7", "Hpr", "IIA", "JH1", "KEGG", "KO11FL", "MG1655", "MRSA252", "MSSA476", "Mu3", "NAD", "NADP", "NCTC", "NTR", "O1", "O103", "O157", "O2R", "Ortholog", "PFAM", "Pta", "R6K", "SAB1560c", "SAHV", "SAOUHSC", "SAR2008", "SAS1659", "SAS1839", "SE11", "SSON", "SaurJH1", "TIGRFAM", "TrbB", "UDP", "UDPdiphospho", "UTI89", "Z3922", "accA", "acetyl", "acetylglucosamine", "acetylglucosaminyl", "acetylglucosaminyltransferase", "acetylmuramyl", "aconitate", "acyl", "acylglycerophosphoethanolamine", "acyltransferase", "adenosyl", "adenosylmethionine", "aminopropyl", "arcAB", "aureus", "auxilary", "b3947", "bifunctional", "biosynthesis", "carboxylase", "carboxytransferase", "catabolism", "chorismate", "citB", "citrate", "coli", "deaminase", "decarboxylase", "dehydratase", "dehydrogenase", "dehydropantoate", "deoxyribose", "deoxyuridine", "dihydroxy", "ethanolamine", "eutD", "fdoG", "ferredoxin", "formate", "fpr", "frameshift", "glpD", "guaB", "hemC", "homocysteine", "homodimeric", "hydratase", "hydroxymethylbilane", "hydroxymethyltransferase", "inositol", "isocitrate", "ligase", "lyase", "malate", "malic", "menC", "methionine", "methylenetetrahydrofolate", "methyltetrahydrofolate", "methyltransferase", "monophosphate", "murG", "muramoylpentapeptide", "oxobutanoate", "p1658", "panB", "pentapeptide", "peptidoglycan", "phospholipid", "phosphorylase", "phosphorylytic", "phosphotransacetylase", "phosphotransferase", "porphobilinogen", "porphyrin", "prephenate", "ptsA", "ptsP", "putrescine", "pyrophosphoryl", "pyruvate", "reductase", "ribose", "sab", "sao", "sn", "speD", "spermidine", "ssn", "str", "subsp", "substr", "subtilis", "subunit", "succinylbenzoyl", "synthase", "synthetase", "tetrahydrofolate", "transcriptional", "transferase", "transposase", "ubiquinol", "ubiquinone", "udp", "undecaprenol", "uridine" }; public static String[] PubmedEntries25Words = new String[] { "2A", "30a", "3H", "4E6", "A4", "ACAM2010", "APEC", "Acad", "Adenosylmethionine", "B1", "Bacteriol", "Biofilm", "C5", "CFA", "CFU", "CN", "Ca2", "Caenorhabditis", "Calsequestrin", "Complementation", "Conjugative", "CtsR", "Cys", "Disuccinimidyl", "EAST1", "EGF", "EGFR", "EHEC", "EHECs", "EI", "ELISPOT", "EMRSA", "ETEC", "Ec", "Enterobacter", "Enterobacteriaceae", "ExPEC", "FGF", "FGFR", "GLB", "Genomic", "Gln", "Globins", "Glutamyl", "H28", "H7", "HPr", "Hederstedt", "HemA", "Histidine", "Huynh", "IEs", "IFNGR1", "IIA", "IgA", "IgG", "Immunoglobulin", "IncF", "IncX1", "JAK", "JAKs", "K1", "K12", "Kafala", "Kd", "Lactobacillus", "Leu17", "Lymphoblastoid", "Lys", "Lys16", "MC4100", "MG1655", "MICs", "MRSA", "MRSA252", "MS2027", "MSSA476", "Mbp", "Microbiol", "Mu3", "Mu50", "MuLac", "Multistep", "N315", "NAD", "NH2", "NPr", "NaCNBH3", "Ntr", "O1", "O103", "O111", "O152", "O157", "O2", "O26", "ORF1", "ORF2", "PCR", "PKA", "PP1", "PP2A", "PRIP", "PcrA", "Petricek", "Phosphoryl", "Phosphorylation", "Plasmid", "Proc", "RNAIII", "RTKs", "Recombinantly", "Recsei", "RepD", "RepFII", "Rutberg", "SAR2632", "SCC", "SCCmec", "SDS", "SE11", "SLD1a", "SNP", "SNPs", "STATs", "Sa", "Sasarman", "Schrder", "Selenate", "Shiga", "Shigella", "SpLE3", "Subunit", "T3SS", "T4S", "Tn5", "Typhimurium", "Tyr", "UPEC", "UTI", "Uropathogenic", "VSSA", "Xa", "acceptor", "acetylglucosamine", "adenosylmethionine", "agr", "alanyl", "amination", "amino", "aminoglycosides", "aminolaevuline", "aminolevulinic", "aminotransferase", "ammonium", "amyloid", "anhydromannitol", "antithrombin", "arcA", "aroC", "aureus", "autotransporters", "autoxidation", "biofilm", "biomarkers", "biosynthesis", "biosynthetic", "bp", "calsequestrin", "carboxymethylation", "catalytically", "chromatography", "cofactor", "coli", "colonisation", "commensal", "commensality", "complementation", "conformational", "conjugative", "cooperativity", "crtM", "culturable", "cyanoborohydride", "cytoglobin", "cytokines", "daptomycin", "deaminase", "decarboxylase", "dehydratase", "di", "dichroism", "dihydroxy", "diplay", "dodecyl", "downstreatm", "effector", "electrophoresis", "elegans", "energetics", "enteric", "enterica", "enteroaggregative", "enterocyte", "enterohemorrhagic", "enterotoxigenic", "enterotoxin", "epigenetic", "epigenetics", "equimolar", "extracellular", "extraintestinal", "fibroblast", "fimbria", "fimbriae", "fluorometry", "forskolin", "fusidic", "genomic", "genomics", "genotyping", "globin", "globins", "glucosamine", "glucuronic", "glutamate", "glutamyl", "graR", "graRn", "graSR", "helicase", "hemA", "hemAXCDBL", "hemB", "hemC", "hemCDBL", "hemD", "hemL", "hemX", "histidine", "histidyl", "hydrophobicity", "hydroxymethylbilane", "iduronic", "immunogenic", "immunogenicity", "immunoprecipitate", "integron", "isoproterenol", "kDa", "kb", "kilodalton", "kinase", "kinases", "lactam", "lambdoid", "ligand", "ligands", "linoleic", "lymphoblastoid", "mM", "mV", "mec", "menC", "menD", "menE", "menF", "menFDHBCE", "menaquinone", "menaquinones", "methicillin", "microM", "microarrays", "mol", "multilocus", "murine", "mutagenesis", "mutualist", "nathphoic", "neuroglobin", "nosocomial", "nucleotide", "octasaccharide", "oligomers", "oligonucleotide", "oligopeptide", "oligosaccharide", "ompC", "ompF", "operon", "ortholog", "overexpressed", "overexpression", "oxacillin", "pAPEC", "pC221", "pJCB12", "pMAS2027", "paralogue", "paralogues", "pathogenesis", "pathogenome", "pentasaccharide", "peptide", "peptidoglycan", "periurethral", "phenotypes", "phenotypic", "phosphatase", "phosphatases", "phospho", "phosphoenolpyruvate", "phospholipase", "phosphoryl", "phosphorylate", "phosphorylated", "phosphorylates", "phosphorylation", "phosphotransferase", "phosphotyrosine", "phylogenetic", "phylogenomic", "placMu50", "plasmid", "plasmids", "polyacrylamide", "polymerase", "porphobilinogen", "preamyloid", "processivity", "prophage", "prophages", "protease", "proteolytic", "proteomics", "ptsP", "pyruvoyl", "qRT", "quaternary", "reactivity", "recombinant", "recombinational", "redox", "reductase", "regulons", "replicon", "resequencing", "reticulum", "rpoN", "sarA", "sarcoplasmic", "sasF", "selenate", "semialdehyde", "serine", "serotype", "serotypes", "serovar", "sigB", "signalling", "stabilisation", "staphyloxanthin", "subcellular", "suberate", "subtilis", "subunit", "subunits", "sulfated", "sulfonamides", "synthase", "tRNA", "tetrasaccharide", "threonine", "transcriptional", "transesterification", "translational", "translocation", "transposon", "trimethoprim", "typhimurium", "tyrosine", "uropathogenic", "uroporphyrinogen", "vancomycin", "vitro", "vivo", "vraE", "zoonotic", "C", }; }