Runner.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.twentyn.patentExtractor;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import uk.ac.cam.ch.wwmm.chemicaltagger.ChemistryPOSTagger;
import uk.ac.cam.ch.wwmm.chemicaltagger.Rule;

import java.io.File;
import java.io.FileFilter;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;

/**
 * This class applies the PatentDocument parser and PatentDocumentFeatures feature extractor to a patent document or
 * documents.
 * TODO: convert this to read raw ZIPs and output some compressed corpus of results.
 */
public class Runner {
  public static final Logger LOGGER = LogManager.getLogger(Runner.class);

  public static void main(String args[]) throws Exception {
    System.out.println("Runner starting up.");
    System.out.flush();

    if (args.length == 0) {
      LOGGER.error("Must specify a directory to search.");
      System.exit(1);
    }

    List<File> toProcess = null;

    File splitFileOrDir = new File(args[0]);
    if (!(splitFileOrDir.exists())) {
      LOGGER.error("Unable to find directory at " + args[0]);
      System.exit(1);
    }

    if (splitFileOrDir.isDirectory()) {
      final Pattern filenamePattern = Pattern.compile("^[a-zA-Z0-9][a-zA-Z0-9\\.]+(\\.gz)?$");
      final Pattern resultPattern = Pattern.compile("\\.out$");
      FileFilter filter = new FileFilter() {
        public boolean accept(File pathname) {
          return pathname.isFile() &&
              filenamePattern.matcher(pathname.getName()).matches() &&
              !resultPattern.matcher(pathname.getName()).find();
        }
      };
      toProcess = Arrays.asList(splitFileOrDir.listFiles(filter));
    } else {
      toProcess = Collections.singletonList(splitFileOrDir);
    }

    Collections.sort(toProcess, new Comparator<File>() {
      @Override
      public int compare(File o1, File o2) {
        return o1.getName().compareTo(o2.getName());
      }
    });

    LOGGER.info("Setting up ChemistryPOSTagger");

    /* Important note: ChemistryPOSTagger is *not* thread safe.  As a singleton that configures itself on startup,
     * it doesn't seem feasible to use it safely in a multi-threaded environment.  This class is necessarily
     * serial--any parallelism will need to be implemented at the process level. */
    final ChemistryPOSTagger posTagger = ChemistryPOSTagger.getDefaultInstance();
    List<Rule> rules = posTagger.getRegexTagger().getRules();
    /* Add chemtagger rules for E. coli and S. cerevisiae, as we want to look for biosynthesis-related documents that
     * reference them specifically.  TODO: add more organisms and other potentially interesting patterns. */
    rules.add(new Rule("NN-ORGANISM", "e\\. +coli", true));
    rules.add(new Rule("NN-ORGANISM", "s\\. +cerevisiae", true));
    posTagger.getRegexTagger().setRules(rules);

    ObjectMapper mapper = new ObjectMapper();
    mapper.enable(SerializationFeature.INDENT_OUTPUT);

    LOGGER.info("Processing " + toProcess.size() + " files.");

    final AtomicInteger processed = new AtomicInteger(0);
    final int toProcessSize = toProcess.size();
    for (final File splitFile : toProcess) {
      int localProcessed;
      synchronized (processed) {
        localProcessed = processed.incrementAndGet();
      }

      File outputFile = new File(splitFile.getParent(), splitFile.getName().concat(".out"));
      LOGGER.info("Processing file: " + splitFile.getAbsolutePath() +
          " -> " + outputFile.getAbsolutePath() + " (" + localProcessed + "/" + toProcessSize + ")");

      if (outputFile.exists()) {
        LOGGER.info("!! Output exists for " + outputFile.getAbsolutePath() + ", skipping.");
        continue;
      }

      try {
        PatentDocument pdoc = PatentDocument.patentDocumentFromXMLFile(splitFile);
        if (pdoc == null) {
          LOGGER.warn("No patent doc produced from " + splitFile + ", skipping.");
          continue;
        }
        PatentDocumentFeatures pdf = PatentDocumentFeatures.extractPatentDocumentFeatures(posTagger, pdoc);

        OutputStream os = new GZIPOutputStream(new FileOutputStream(outputFile));
        mapper.writeValue(os, pdf);
        os.flush();
        os.close();
      } catch (Exception e) { // NB: this is a Very Bad Thing, but we can't throw from here.
        LOGGER.error("Caught exception when processing " + splitFile, e);
      }
    }

    LOGGER.info("Done.");
  }
}