/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.twentyn.patentExtractor; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.xml.sax.SAXException; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.xpath.XPathExpressionException; import java.io.BufferedReader; import java.io.File; import java.io.FileFilter; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.StringReader; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.Enumeration; import java.util.List; import java.util.regex.Pattern; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; public class PatentCorpusReader { public static final Logger LOGGER = LogManager.getLogger(PatentCorpusReader.class); public static final String DOCUMENT_DELIMITER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"; public static final String LINE_SEPARATOR = System.lineSeparator(); private PatentProcessor processor; private File inputFileOrDir; public PatentCorpusReader(PatentProcessor processor, File inputFileOrDir) { this.processor = processor; this.inputFileOrDir = inputFileOrDir; } public int readPatentCorpus() throws IOException, ParserConfigurationException, SAXException, TransformerConfigurationException, TransformerException, XPathExpressionException { if (!(inputFileOrDir.exists())) { LOGGER.error("Unable to find directory at " + inputFileOrDir); return 0; } List<File> toProcess = null; if (inputFileOrDir.isDirectory()) { // Note: this regex is supposed to handle multiple levels of .'s, as might be produced by the `split` command. final Pattern filenamePattern = Pattern.compile("^[a-zA-Z0-9][a-zA-Z0-9\\.]+$"); final Pattern zipFilePattern = Pattern.compile("\\.zip$"); FileFilter filter = new FileFilter() { public boolean accept(File pathname) { return pathname.isFile() && filenamePattern.matcher(pathname.getName()).matches() && zipFilePattern.matcher(pathname.getName()).find(); } }; toProcess = Arrays.asList(inputFileOrDir.listFiles(filter)); Collections.sort(toProcess, new Comparator<File>() { @Override public int compare(File o1, File o2) { return o1.getName().compareTo(o2.getName()); } }); } else { toProcess = Collections.singletonList(inputFileOrDir); } LOGGER.info("Processing " + toProcess.size() + " files"); ObjectMapper objectMapper = new ObjectMapper(); objectMapper.enable(SerializationFeature.INDENT_OUTPUT); for (File currentFile : toProcess) { LOGGER.info("Processing file " + currentFile.getAbsolutePath()); if (currentFile.getName().endsWith(".zip")) { LOGGER.debug("Zip compression detected."); // With help from // http://stackoverflow.com/questions/15667125/read-content-from-files-which-are-inside-zip-file ZipFile zipFile = new ZipFile(currentFile); Enumeration<? extends ZipEntry> entries = zipFile.entries(); while (entries.hasMoreElements()) { ZipEntry entry = entries.nextElement(); InputStream is = zipFile.getInputStream(entry); LOGGER.debug("Zip input stream is available: " + is.available()); BufferedReader reader = new BufferedReader(new InputStreamReader(is)); splitDocsAndClose(currentFile, reader); } } else { LOGGER.info("Processing file: " + currentFile); BufferedReader reader = new BufferedReader(new FileReader(currentFile)); splitDocsAndClose(currentFile, reader); } } return toProcess.size(); } /** * Given a file path (mostly for debugging) and a reader, read in a concatenated patent corpus, split the docs based * on a known delimiter, and call this.processor.processPatentText on each document. * * @param path The patent corpus file being read (mostly for debugging) * @param reader A reader for that file (which might be slurping in a compressed stream). * @throws IOException * @throws ParserConfigurationException * @throws SAXException * @throws TransformerConfigurationException * @throws TransformerException * @throws XPathExpressionException */ private void splitDocsAndClose(File path, BufferedReader reader) throws IOException, ParserConfigurationException, SAXException, TransformerConfigurationException, TransformerException, XPathExpressionException { LOGGER.debug("Input file reader is ready: " + reader.ready()); StringBuilder stringBuilder = new StringBuilder(); String line = null; int processed = 0; // TODO: Is there still no better way to do accomplish this w/ v7? while ((line = reader.readLine()) != null) { if (line.equals(DOCUMENT_DELIMITER) && stringBuilder.length() > 0) { String content = stringBuilder.toString(); this.processor.processPatentText(path, new StringReader(content), content.length()); stringBuilder = new StringBuilder(line).append(LINE_SEPARATOR); processed++; if ((processed % 100) == 0) { LOGGER.info("Processed " + processed + " documents"); } } else { stringBuilder.append(line).append(LINE_SEPARATOR); } } if (stringBuilder.length() > 0) { String content = stringBuilder.toString(); processor.processPatentText(path, new StringReader(content), content.length()); processed++; } LOGGER.info("Found " + processed + " documents in " + path.getName()); } }