PatentCorpusReader.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.twentyn.patentExtractor;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPathExpressionException;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.List;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

/**
 * Reads a patent corpus from a single file or from every matching {@code .zip} file in a directory, splits each
 * input into individual documents on {@link #DOCUMENT_DELIMITER}, and hands each document to a
 * {@link PatentProcessor}.
 */
public class PatentCorpusReader {
  public static final Logger LOGGER = LogManager.getLogger(PatentCorpusReader.class);
  public static final String DOCUMENT_DELIMITER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
  public static final String LINE_SEPARATOR = System.lineSeparator();

  // Note: this regex is supposed to handle multiple levels of .'s, as might be produced by the `split` command.
  private static final Pattern FILENAME_PATTERN = Pattern.compile("^[a-zA-Z0-9][a-zA-Z0-9\\.]+$");
  private static final Pattern ZIP_FILE_PATTERN = Pattern.compile("\\.zip$");

  private final PatentProcessor processor;
  private final File inputFileOrDir;

  /**
   * @param processor The processor that receives each document split out of the corpus.
   * @param inputFileOrDir A single corpus file, or a directory whose matching {@code .zip} files are all read.
   */
  public PatentCorpusReader(PatentProcessor processor, File inputFileOrDir) {
    this.processor = processor;
    this.inputFileOrDir = inputFileOrDir;
  }

  /**
   * Reads the configured file or directory and passes every document found to the processor.
   *
   * If the input is a directory, only regular files whose names match both the filename pattern and the
   * {@code .zip} suffix are read, in name order.  If the input is a single file it is read as-is.  Files whose names
   * end in {@code .zip} are treated as archives and every entry is read; all other files are read as plain text.
   * All text is decoded as UTF-8, matching the encoding declared in {@link #DOCUMENT_DELIMITER}.
   *
   * @return The number of files selected for processing, or 0 if the input path does not exist.
   * @throws IOException If a file cannot be read or the input directory cannot be listed.
   * @throws ParserConfigurationException Declared for document processing failures (presumably from the processor).
   * @throws SAXException Declared for document processing failures (presumably from the processor).
   * @throws TransformerConfigurationException Declared for document processing failures (presumably from the
   *                                           processor).
   * @throws TransformerException Declared for document processing failures (presumably from the processor).
   * @throws XPathExpressionException Declared for document processing failures (presumably from the processor).
   */
  public int readPatentCorpus()
      throws IOException, ParserConfigurationException,
      SAXException, TransformerConfigurationException,
      TransformerException, XPathExpressionException {
    if (!(inputFileOrDir.exists())) {
      LOGGER.error("Unable to find directory at " + inputFileOrDir);
      return 0;
    }

    List<File> toProcess;
    if (inputFileOrDir.isDirectory()) {
      FileFilter filter = new FileFilter() {
        @Override
        public boolean accept(File pathname) {
          return pathname.isFile() &&
              FILENAME_PATTERN.matcher(pathname.getName()).matches() &&
              ZIP_FILE_PATTERN.matcher(pathname.getName()).find();
        }
      };
      // listFiles returns null (not an empty array) when the directory cannot be read; fail with a clear
      // IOException rather than letting Arrays.asList throw a NullPointerException.
      File[] matchingFiles = inputFileOrDir.listFiles(filter);
      if (matchingFiles == null) {
        throw new IOException("Unable to list contents of directory " + inputFileOrDir);
      }
      toProcess = Arrays.asList(matchingFiles);
      Collections.sort(toProcess, new Comparator<File>() {
        @Override
        public int compare(File o1, File o2) {
          return o1.getName().compareTo(o2.getName());
        }
      });
    } else {
      toProcess = Collections.singletonList(inputFileOrDir);
    }
    LOGGER.info("Processing " + toProcess.size() + " files");

    for (File currentFile : toProcess) {
      LOGGER.info("Processing file " + currentFile.getAbsolutePath());
      if (currentFile.getName().endsWith(".zip")) {
        LOGGER.debug("Zip compression detected.");
        // With help from
        // http://stackoverflow.com/questions/15667125/read-content-from-files-which-are-inside-zip-file
        // try-with-resources guarantees the archive handle is released even if processing an entry fails.
        try (ZipFile zipFile = new ZipFile(currentFile)) {
          Enumeration<? extends ZipEntry> entries = zipFile.entries();

          while (entries.hasMoreElements()) {
            ZipEntry entry = entries.nextElement();
            InputStream is = zipFile.getInputStream(entry);
            LOGGER.debug("Zip input stream is available: " + is.available());
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
            splitDocsAndClose(currentFile, reader);
          }
        }
      } else {
        LOGGER.info("Processing file: " + currentFile);
        // Decode explicitly as UTF-8 instead of relying on the platform default charset used by FileReader.
        BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(currentFile), StandardCharsets.UTF_8));
        splitDocsAndClose(currentFile, reader);
      }
    }
    return toProcess.size();
  }

  /**
   * Given a file path (mostly for debugging) and a reader, read in a concatenated patent corpus, split the docs based
   * on a known delimiter, and call this.processor.processPatentText on each document.  The reader is always closed
   * before this method returns, whether it completes normally or throws.
   *
   * Lines are re-joined with {@link #LINE_SEPARATOR}, so the text handed to the processor uses the platform line
   * separator regardless of the line endings in the input.
   *
   * @param path The patent corpus file being read (mostly for debugging)
   * @param reader A reader for that file (which might be slurping in a compressed stream).
   * @throws IOException
   * @throws ParserConfigurationException
   * @throws SAXException
   * @throws TransformerConfigurationException
   * @throws TransformerException
   * @throws XPathExpressionException
   */
  private void splitDocsAndClose(File path, BufferedReader reader)
      throws IOException, ParserConfigurationException,
      SAXException, TransformerConfigurationException,
      TransformerException, XPathExpressionException {
    // Re-declaring the reader as a resource makes good on the "AndClose" in this method's name.
    try (BufferedReader in = reader) {
      LOGGER.debug("Input file reader is ready: " + in.ready());

      StringBuilder stringBuilder = new StringBuilder();
      String line;
      int processed = 0;
      // TODO: Is there still no better way to accomplish this w/ v7?
      while ((line = in.readLine()) != null) {
        // A delimiter line starts a new document; flush whatever has accumulated so far (if anything).
        if (line.equals(DOCUMENT_DELIMITER) && stringBuilder.length() > 0) {
          String content = stringBuilder.toString();
          this.processor.processPatentText(path, new StringReader(content), content.length());
          // The delimiter line itself is the first line of the next document.
          stringBuilder = new StringBuilder(line).append(LINE_SEPARATOR);
          processed++;
          if ((processed % 100) == 0) {
            LOGGER.info("Processed " + processed + " documents");
          }
        } else {
          stringBuilder.append(line).append(LINE_SEPARATOR);
        }
      }
      // The final document has no trailing delimiter, so flush it explicitly.
      if (stringBuilder.length() > 0) {
        String content = stringBuilder.toString();
        processor.processPatentText(path, new StringReader(content), content.length());
        processed++;
      }
      LOGGER.info("Found " + processed + " documents in " + path.getName());
    }
  }
}