/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.twentyn.patentExtractor; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import uk.ac.cam.ch.wwmm.chemicaltagger.ChemistryPOSTagger; import uk.ac.cam.ch.wwmm.chemicaltagger.Rule; import java.io.File; import java.io.FileFilter; import java.io.FileOutputStream; import java.io.OutputStream; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; import java.util.zip.GZIPOutputStream; /** * This class applies the PatentDocument parser and PatentDocumentFeatures feature extractor to a patent document or * documents. * TODO: convert this to read raw ZIPs and output some compressed corpus of results. */ public class Runner { public static final Logger LOGGER = LogManager.getLogger(Runner.class); public static void main(String args[]) throws Exception { System.out.println("Runner starting up."); System.out.flush(); if (args.length == 0) { LOGGER.error("Must specify a directory to search."); System.exit(1); } List<File> toProcess = null; File splitFileOrDir = new File(args[0]); if (!(splitFileOrDir.exists())) { LOGGER.error("Unable to find directory at " + args[0]); System.exit(1); } if (splitFileOrDir.isDirectory()) { final Pattern filenamePattern = Pattern.compile("^[a-zA-Z0-9][a-zA-Z0-9\\.]+(\\.gz)?$"); final Pattern resultPattern = Pattern.compile("\\.out$"); FileFilter filter = new FileFilter() { public boolean accept(File pathname) { return pathname.isFile() && filenamePattern.matcher(pathname.getName()).matches() && !resultPattern.matcher(pathname.getName()).find(); } }; toProcess = Arrays.asList(splitFileOrDir.listFiles(filter)); } else { toProcess = Collections.singletonList(splitFileOrDir); } Collections.sort(toProcess, new Comparator<File>() { @Override public int compare(File o1, File o2) { return o1.getName().compareTo(o2.getName()); } }); LOGGER.info("Setting up ChemistryPOSTagger"); /* Important note: ChemistryPOSTagger is *not* thread safe. As a singleton that configures itself on startup, * it doesn't seem feasible to use it safely in a multi-threaded environment. This class is necessarily * serial--any parallelism will need to be implemented at the process level. */ final ChemistryPOSTagger posTagger = ChemistryPOSTagger.getDefaultInstance(); List<Rule> rules = posTagger.getRegexTagger().getRules(); /* Add chemtagger rules for E. coli and S. cerevisiae, as we want to look for biosynthesis-related documents that * reference them specifically. TODO: add more organisms and other potentially interesting patterns. */ rules.add(new Rule("NN-ORGANISM", "e\\. +coli", true)); rules.add(new Rule("NN-ORGANISM", "s\\. +cerevisiae", true)); posTagger.getRegexTagger().setRules(rules); ObjectMapper mapper = new ObjectMapper(); mapper.enable(SerializationFeature.INDENT_OUTPUT); LOGGER.info("Processing " + toProcess.size() + " files."); final AtomicInteger processed = new AtomicInteger(0); final int toProcessSize = toProcess.size(); for (final File splitFile : toProcess) { int localProcessed; synchronized (processed) { localProcessed = processed.incrementAndGet(); } File outputFile = new File(splitFile.getParent(), splitFile.getName().concat(".out")); LOGGER.info("Processing file: " + splitFile.getAbsolutePath() + " -> " + outputFile.getAbsolutePath() + " (" + localProcessed + "/" + toProcessSize + ")"); if (outputFile.exists()) { LOGGER.info("!! Output exists for " + outputFile.getAbsolutePath() + ", skipping."); continue; } try { PatentDocument pdoc = PatentDocument.patentDocumentFromXMLFile(splitFile); if (pdoc == null) { LOGGER.warn("No patent doc produced from " + splitFile + ", skipping."); continue; } PatentDocumentFeatures pdf = PatentDocumentFeatures.extractPatentDocumentFeatures(posTagger, pdoc); OutputStream os = new GZIPOutputStream(new FileOutputStream(outputFile)); mapper.writeValue(os, pdf); os.flush(); os.close(); } catch (Exception e) { // NB: this is a Very Bad Thing, but we can't throw from here. LOGGER.error("Caught exception when processing " + splitFile, e); } } LOGGER.info("Done."); } }