/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package act.shared;
/**
 * Word lists collected during two case studies on the vocabulary found in
 * Gene and Pubmed entries.
 *
 * <ol>
 *   <li>{@link #ChangWords}: words from the papers/gene comments that were
 *       needed to infer genes for the chang paper. See the post
 *       http://pathway-synthesis.tumblr.com/post/19209966057/looking-ahead-path-edges-to-gene-names</li>
 *   <li>{@link #GeneEntries25Words}: words pulled out of 25 random gene
 *       entries.</li>
 *   <li>{@link #PubmedEntries25Words}: words pulled out of 25 random pubmed
 *       entries.</li>
 * </ol>
 *
 * <p>All lists were filtered through the
 * {@code ~saurabhs/Downloads/pathway\ synthesis/organisms/hunspell-1.3.2/filter.sh}
 * script. That script runs a spell check against {@code en_US.dic} and
 * outputs the words reported as misspelled (i.e., bio words). It is also
 * responsible for checking whether those words are organism names; the
 * remaining words are potential chemical names.
 *
 * <p>The arrays are plain mutable public statics; the order and content of
 * every entry is significant to any caller indexing into them.
 */
public class GenePubmedCaseStudy {

  /**
   * Case study 1: words needed to infer genes for the chang paper
   * (138 entries). The last three entries repeat "phaB1", "phaB2" and
   * "phaB3", which already appear earlier in the list.
   */
  public static String[] ChangWords = {
    "0g", "1g", "3R", "AF321779", "ATCC", "BCD", "BDH", "BUT1",
    "BUT2", "BYDH", "Butanol", "C2", "C4", "Clostridium", "CoA", "Coenzyme",
    "DC3000", "DSM", "Genebank", "H16", "HBD", "HPLC", "Intracellular", "Ki",
    "NADH", "NADP", "NADPH", "PHB", "PMID", "PUBMED", "PhaA", "PhaB",
    "PhaC", "Pseudomonas", "Ralstonia", "Rhizobia", "Streptomyces", "THL", "YP",
    "acetoacetyl", "acetobutylicum", "acetyacetyl", "acetyl", "acetyltransferase",
    "acidogenic", "ack", "acyl", "adhE2", "adhe", "adhe1", "aldehyde", "amino",
    "bcd", "biosynthesis", "biosynthetic", "bp", "butanol", "butanologenic",
    "butyraldehyde", "butyrate", "butyric", "butyryl", "ccr", "ccrA", "cfa8",
    "chromatography", "coenzyme", "coli", "collinus", "complementation",
    "coronafacic", "crotonoyl", "crotonyl", "crt", "dehydratase", "dehydrogenase",
    "dehydrogenases", "dodecyl", "electrophoretic", "enoyl", "etfA", "etfB",
    "eutropha", "hbd", "heterologous", "hydroxy", "hydroxybutanoyl",
    "hydroxybutyryl", "hyperproducing", "isologs", "isopalmitoyl", "kDa",
    "ketothiolase", "kinase", "lookup", "mM", "mannitol", "microM", "mol", "muM",
    "operon", "overexpress", "overexpressing", "oxidoreductase", "palmitoyl",
    "phaA", "phaB", "phaB1", "phaB2", "phaB3", "phosphotransbutyrylase",
    "polyacrylamide", "polyhydroxyalkanoate", "polyhydroxybutyrate", "pregrown",
    "ptb", "pv", "recombinant", "reductase", "reductases", "serendipitously",
    "solventogenic", "spectrophotometric", "str", "subunits", "synthase",
    "synthases", "syringae", "tac", "thiL", "thioesters", "thiolase", "thl",
    "titer", "tyrobutyricum",
    // Trailing repeats of entries already listed above.
    "phaB1", "phaB2", "phaB3",
  };

  /**
   * Case study 2a: words pulled out of 25 random gene entries (153 words).
   */
  public static String[] GeneEntries25Words = {
    "AA049537", "APEC", "BW2952", "BX571856", "CoA", "EC4115", "EDL933", "EKO11",
    "Fpr", "H2", "H7", "Hpr", "IIA", "JH1", "KEGG", "KO11FL",
    "MG1655", "MRSA252", "MSSA476", "Mu3", "NAD", "NADP", "NCTC", "NTR",
    "O1", "O103", "O157", "O2R", "Ortholog", "PFAM", "Pta", "R6K",
    "SAB1560c", "SAHV", "SAOUHSC", "SAR2008", "SAS1659", "SAS1839", "SE11", "SSON",
    "SaurJH1", "TIGRFAM", "TrbB", "UDP", "UDPdiphospho", "UTI89", "Z3922",
    "accA", "acetyl", "acetylglucosamine", "acetylglucosaminyl",
    "acetylglucosaminyltransferase", "acetylmuramyl", "aconitate", "acyl",
    "acylglycerophosphoethanolamine", "acyltransferase", "adenosyl",
    "adenosylmethionine", "aminopropyl", "arcAB", "aureus", "auxilary", "b3947",
    "bifunctional", "biosynthesis", "carboxylase", "carboxytransferase",
    "catabolism", "chorismate", "citB", "citrate", "coli", "deaminase",
    "decarboxylase", "dehydratase", "dehydrogenase", "dehydropantoate",
    "deoxyribose", "deoxyuridine", "dihydroxy", "ethanolamine", "eutD", "fdoG",
    "ferredoxin", "formate", "fpr", "frameshift", "glpD", "guaB", "hemC",
    "homocysteine", "homodimeric", "hydratase", "hydroxymethylbilane",
    "hydroxymethyltransferase", "inositol", "isocitrate", "ligase", "lyase",
    "malate", "malic", "menC", "methionine", "methylenetetrahydrofolate",
    "methyltetrahydrofolate", "methyltransferase", "monophosphate", "murG",
    "muramoylpentapeptide", "oxobutanoate", "p1658", "panB", "pentapeptide",
    "peptidoglycan", "phospholipid", "phosphorylase", "phosphorylytic",
    "phosphotransacetylase", "phosphotransferase", "porphobilinogen", "porphyrin",
    "prephenate", "ptsA", "ptsP", "putrescine", "pyrophosphoryl", "pyruvate",
    "reductase", "ribose", "sab", "sao", "sn", "speD", "spermidine", "ssn",
    "str", "subsp", "substr", "subtilis", "subunit", "succinylbenzoyl",
    "synthase", "synthetase", "tetrahydrofolate", "transcriptional",
    "transferase", "transposase", "ubiquinol", "ubiquinone", "udp",
    "undecaprenol", "uridine"
  };

  /**
   * Case study 2b: words pulled out of 25 random pubmed entries. The
   * original study notes cite 391 words; this array currently holds 389
   * entries, the last of which ("C") sits outside the otherwise sorted run.
   */
  public static String[] PubmedEntries25Words = {
    "2A", "30a", "3H", "4E6", "A4", "ACAM2010", "APEC", "Acad",
    "Adenosylmethionine", "B1", "Bacteriol", "Biofilm", "C5", "CFA", "CFU", "CN",
    "Ca2", "Caenorhabditis", "Calsequestrin", "Complementation", "Conjugative",
    "CtsR", "Cys", "Disuccinimidyl", "EAST1", "EGF", "EGFR", "EHEC", "EHECs",
    "EI", "ELISPOT", "EMRSA", "ETEC", "Ec", "Enterobacter", "Enterobacteriaceae",
    "ExPEC", "FGF", "FGFR", "GLB", "Genomic", "Gln", "Globins", "Glutamyl",
    "H28", "H7", "HPr", "Hederstedt", "HemA", "Histidine", "Huynh", "IEs",
    "IFNGR1", "IIA", "IgA", "IgG", "Immunoglobulin", "IncF", "IncX1", "JAK",
    "JAKs", "K1", "K12", "Kafala", "Kd", "Lactobacillus", "Leu17",
    "Lymphoblastoid", "Lys", "Lys16", "MC4100", "MG1655", "MICs", "MRSA",
    "MRSA252", "MS2027", "MSSA476", "Mbp", "Microbiol", "Mu3", "Mu50", "MuLac",
    "Multistep", "N315", "NAD", "NH2", "NPr", "NaCNBH3", "Ntr", "O1",
    "O103", "O111", "O152", "O157", "O2", "O26", "ORF1", "ORF2",
    "PCR", "PKA", "PP1", "PP2A", "PRIP", "PcrA", "Petricek", "Phosphoryl",
    "Phosphorylation", "Plasmid", "Proc", "RNAIII", "RTKs", "Recombinantly",
    "Recsei", "RepD", "RepFII", "Rutberg", "SAR2632", "SCC", "SCCmec", "SDS",
    "SE11", "SLD1a", "SNP", "SNPs", "STATs", "Sa", "Sasarman", "Schrder",
    "Selenate", "Shiga", "Shigella", "SpLE3", "Subunit", "T3SS", "T4S", "Tn5",
    "Typhimurium", "Tyr", "UPEC", "UTI", "Uropathogenic", "VSSA", "Xa",
    "acceptor", "acetylglucosamine", "adenosylmethionine", "agr", "alanyl",
    "amination", "amino", "aminoglycosides", "aminolaevuline", "aminolevulinic",
    "aminotransferase", "ammonium", "amyloid", "anhydromannitol", "antithrombin",
    "arcA", "aroC", "aureus", "autotransporters", "autoxidation", "biofilm",
    "biomarkers", "biosynthesis", "biosynthetic", "bp", "calsequestrin",
    "carboxymethylation", "catalytically", "chromatography", "cofactor", "coli",
    "colonisation", "commensal", "commensality", "complementation",
    "conformational", "conjugative", "cooperativity", "crtM", "culturable",
    "cyanoborohydride", "cytoglobin", "cytokines", "daptomycin", "deaminase",
    "decarboxylase", "dehydratase", "di", "dichroism", "dihydroxy", "diplay",
    "dodecyl", "downstreatm", "effector", "electrophoresis", "elegans",
    "energetics", "enteric", "enterica", "enteroaggregative", "enterocyte",
    "enterohemorrhagic", "enterotoxigenic", "enterotoxin", "epigenetic",
    "epigenetics", "equimolar", "extracellular", "extraintestinal", "fibroblast",
    "fimbria", "fimbriae", "fluorometry", "forskolin", "fusidic", "genomic",
    "genomics", "genotyping", "globin", "globins", "glucosamine", "glucuronic",
    "glutamate", "glutamyl", "graR", "graRn", "graSR", "helicase", "hemA",
    "hemAXCDBL", "hemB", "hemC", "hemCDBL", "hemD", "hemL", "hemX",
    "histidine", "histidyl", "hydrophobicity", "hydroxymethylbilane", "iduronic",
    "immunogenic", "immunogenicity", "immunoprecipitate", "integron",
    "isoproterenol", "kDa", "kb", "kilodalton", "kinase", "kinases", "lactam",
    "lambdoid", "ligand", "ligands", "linoleic", "lymphoblastoid", "mM", "mV",
    "mec", "menC", "menD", "menE", "menF", "menFDHBCE", "menaquinone",
    "menaquinones", "methicillin", "microM", "microarrays", "mol", "multilocus",
    "murine", "mutagenesis", "mutualist", "nathphoic", "neuroglobin",
    "nosocomial", "nucleotide", "octasaccharide", "oligomers", "oligonucleotide",
    "oligopeptide", "oligosaccharide", "ompC", "ompF", "operon", "ortholog",
    "overexpressed", "overexpression", "oxacillin", "pAPEC", "pC221", "pJCB12",
    "pMAS2027", "paralogue", "paralogues", "pathogenesis", "pathogenome",
    "pentasaccharide", "peptide", "peptidoglycan", "periurethral", "phenotypes",
    "phenotypic", "phosphatase", "phosphatases", "phospho", "phosphoenolpyruvate",
    "phospholipase", "phosphoryl", "phosphorylate", "phosphorylated",
    "phosphorylates", "phosphorylation", "phosphotransferase", "phosphotyrosine",
    "phylogenetic", "phylogenomic", "placMu50", "plasmid", "plasmids",
    "polyacrylamide", "polymerase", "porphobilinogen", "preamyloid",
    "processivity", "prophage", "prophages", "protease", "proteolytic",
    "proteomics", "ptsP", "pyruvoyl", "qRT", "quaternary", "reactivity",
    "recombinant", "recombinational", "redox", "reductase", "regulons",
    "replicon", "resequencing", "reticulum", "rpoN", "sarA", "sarcoplasmic",
    "sasF", "selenate", "semialdehyde", "serine", "serotype", "serotypes",
    "serovar", "sigB", "signalling", "stabilisation", "staphyloxanthin",
    "subcellular", "suberate", "subtilis", "subunit", "subunits", "sulfated",
    "sulfonamides", "synthase", "tRNA", "tetrasaccharide", "threonine",
    "transcriptional", "transesterification", "translational", "translocation",
    "transposon", "trimethoprim", "typhimurium", "tyrosine", "uropathogenic",
    "uroporphyrinogen", "vancomycin", "vitro", "vivo", "vraE", "zoonotic",
    // Final entry is not in sorted position; kept where it was.
    "C",
  };
}