DocumentIndexer.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.twentyn.patentSearch;

import com.twentyn.patentExtractor.PatentCorpusReader;
import com.twentyn.patentExtractor.PatentDocument;
import com.twentyn.patentExtractor.PatentProcessor;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.input.ReaderInputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.core.LoggerContext;
import org.apache.logging.log4j.core.config.Configuration;
import org.apache.logging.log4j.core.config.LoggerConfig;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPathExpressionException;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.Charset;

/**
 * This class reads concatenated USPTO XML patent documents (as distributed in Google's patent full-text corpus) and
 * indexes their contents with minimal normalization.  Lucene is used for indexing; the output of this class's main
 * method is a Lucene index of the contents of any specified patent files.  A single file or directory of files can be
 * supplied as input.  If a directory is specified, this class will read and index any .zip files in that directory,
 * assuming these to be compressed USPTO dumps.
 *
 * <p>Corpus traversal is delegated to {@link PatentCorpusReader}, which calls back into
 * {@link #processPatentText(File, Reader, int)} with the text of each document it finds.
 */
public class DocumentIndexer implements PatentProcessor {
  public static final Logger LOGGER = LogManager.getLogger(DocumentIndexer.class);

  /** Program name shown in the usage/help output. */
  private static final String PROGRAM_NAME = "DocumentIndexer";

  /**
   * Command-line entry point: parses options, opens (or creates) the Lucene index, and indexes every patent
   * document found under the given input path.
   *
   * @param args command-line arguments; {@code -i/--input} and {@code -x/--index} are required,
   *             {@code -v/--verbose} and {@code -h/--help} are optional
   * @throws Exception if the index cannot be opened or a document cannot be read or indexed
   */
  public static void main(String[] args) throws Exception {
    System.out.println("Starting up...");
    System.out.flush();
    Options opts = new Options();
    opts.addOption(Option.builder("i").
        longOpt("input").hasArg().required().desc("Input file or directory to index").build());
    opts.addOption(Option.builder("x").
        longOpt("index").hasArg().required().desc("Path to index file to generate").build());
    opts.addOption(Option.builder("h").longOpt("help").desc("Print this help message and exit").build());
    opts.addOption(Option.builder("v").longOpt("verbose").desc("Print verbose log output").build());

    HelpFormatter helpFormatter = new HelpFormatter();

    // The input/index options are required, so asking for help on its own would otherwise fail parsing with a
    // "missing required option" error.  Look for the help flag before handing the arguments to the parser.
    if (helpRequested(args)) {
      helpFormatter.printHelp(PROGRAM_NAME, opts);
      System.exit(0);
      return;
    }

    CommandLineParser cmdLineParser = new DefaultParser();
    CommandLine cmdLine;
    try {
      cmdLine = cmdLineParser.parse(opts, args);
    } catch (ParseException e) {
      System.out.println("Caught exception when parsing command line: " + e.getMessage());
      helpFormatter.printHelp(PROGRAM_NAME, opts);
      System.exit(1);
      return; // Unreachable at runtime; lets the compiler see that cmdLine is always assigned below.
    }

    if (cmdLine.hasOption("help")) {
      helpFormatter.printHelp(PROGRAM_NAME, opts);
      System.exit(0);
    }

    if (cmdLine.hasOption("verbose")) {
      // With help from http://stackoverflow.com/questions/23434252/programmatically-change-log-level-in-log4j2
      LoggerContext ctx = (LoggerContext) LogManager.getContext(false);
      Configuration ctxConfig = ctx.getConfiguration();
      LoggerConfig logConfig = ctxConfig.getLoggerConfig(LogManager.ROOT_LOGGER_NAME);
      logConfig.setLevel(Level.DEBUG);

      ctx.updateLoggers();
      LOGGER.debug("Verbose logging enabled");
    }

    // Validate the input before touching the index so that a bad path does not create or lock an index.
    String inputFileOrDir = cmdLine.getOptionValue("input");
    File splitFileOrDir = new File(inputFileOrDir);
    if (!splitFileOrDir.exists()) {
      LOGGER.error("Unable to find input file or directory at {}", inputFileOrDir);
      System.exit(1);
    }

    /* The standard analyzer is too aggressive with chemical entities (it strips structural annotations, for one
     * thing), and the whitespace analyzer doesn't do any case normalization or stop word elimination.  This custom
     * analyzer appears to treat chemical entities better than the standard analyzer without admitting too much
     * cruft to the index. */
    Analyzer analyzer = CustomAnalyzer.builder().
        withTokenizer("whitespace").
        addTokenFilter("lowercase").
        addTokenFilter("stop").
        build();

    IndexWriterConfig writerConfig = new IndexWriterConfig(analyzer);
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    writerConfig.setRAMBufferSizeMB(1 << 10); // 1024 MB

    String indexPath = cmdLine.getOptionValue("index");
    LOGGER.info("Opening index at {}", indexPath);

    // try-with-resources guarantees the writer and directory are released even if reading the corpus fails.
    try (Directory indexDir = FSDirectory.open(new File(indexPath).toPath());
         IndexWriter indexWriter = new IndexWriter(indexDir, writerConfig)) {
      DocumentIndexer indexer = new DocumentIndexer(indexWriter);
      PatentCorpusReader corpusReader = new PatentCorpusReader(indexer, splitFileOrDir);
      corpusReader.readPatentCorpus();
      indexer.commitAndClose();
    }
  }

  /**
   * Reports whether the raw arguments contain the help flag ({@code -h} or {@code --help}).
   *
   * @param args raw command-line arguments
   * @return true if a help flag is present
   */
  private static boolean helpRequested(String[] args) {
    for (String arg : args) {
      if ("-h".equals(arg) || "--help".equals(arg)) {
        return true;
      }
    }
    return false;
  }

  /** Writer for the index that documents are added to; supplied by the caller of the constructor. */
  IndexWriter indexWriter;

  /**
   * Creates an indexer that adds documents to the given writer.
   *
   * @param indexWriter an open Lucene index writer
   */
  public DocumentIndexer(IndexWriter indexWriter) {
    this.indexWriter = indexWriter;
  }

  /**
   * Parses one patent document from the supplied reader and adds it to the index.  Input for which
   * {@code PatentDocument.patentDocumentFromXMLStream} returns null is logged and skipped.
   *
   * @param patentFile        the file the text came from; only its name is recorded in the index
   * @param patentTextReader  reader over the XML text of a single document, decoded to bytes as UTF-8 for parsing
   * @param patentTextLength  length of the document text (unused here)
   */
  @Override
  public void processPatentText(File patentFile, Reader patentTextReader, int patentTextLength)
      throws IOException, ParserConfigurationException,
      SAXException, TransformerConfigurationException,
      TransformerException, XPathExpressionException {
    PatentDocument patentDocument = PatentDocument.patentDocumentFromXMLStream(
        new ReaderInputStream(patentTextReader, Charset.forName("utf-8")));
    if (patentDocument == null) {
      LOGGER.info("Found non-patent type document, skipping.");
      return;
    }
    Document doc = patentDocToLuceneDoc(patentFile, patentDocument);
    LOGGER.debug("Adding document: {}", doc.get("id"));
    this.indexWriter.addDocument(doc);
    // Committing after every document is costly; presumably kept so an interrupted run preserves the documents
    // indexed so far -- TODO confirm before batching commits.
    this.indexWriter.commit();
  }

  /**
   * Converts a parsed patent into a Lucene document.  Identifier-like values are added as untokenized
   * {@link StringField}s, free text as analyzed {@link TextField}s (claims and description are indexed but not
   * stored), and the further/searched classifications as {@link SortedSetDocValuesField}s.
   *
   * @param path      source file of the patent; its name is stored in the {@code file_name} field
   * @param patentDoc the parsed patent
   * @return a Lucene document ready to be added to an index
   */
  public Document patentDocToLuceneDoc(File path, PatentDocument patentDoc) {
    // With help from https://lucene.apache.org/core/5_2_0/demo/src-html/org/apache/lucene/demo/IndexFiles.html
    Document doc = new Document();
    doc.add(new StringField("file_name", path.getName(), Field.Store.YES));
    doc.add(new StringField("id", patentDoc.getFileId(), Field.Store.YES));
    doc.add(new StringField("grant_date", patentDoc.getGrantDate(), Field.Store.YES));
    doc.add(new StringField("main_classification", patentDoc.getMainClassification(), Field.Store.YES));
    doc.add(new TextField("title", patentDoc.getTitle(), Field.Store.YES));
    // StringUtils.join takes the collection first and the separator second.  With the arguments reversed the call
    // binds to the varargs join(T...) overload, which would index "\n" + list.toString() instead of the joined text.
    doc.add(new TextField("claims", StringUtils.join(patentDoc.getClaimsText(), "\n"), Field.Store.NO));
    doc.add(new TextField("description", StringUtils.join(patentDoc.getTextContent(), "\n"), Field.Store.NO));

    // TODO: verify that these are searchable as expected.  Doc-values fields are not part of the inverted index,
    // so ordinary term queries are not expected to match them; an additional indexed field may be needed.
    for (String cls : patentDoc.getFurtherClassifications()) {
      doc.add(new SortedSetDocValuesField("further_classification", new BytesRef(cls)));
    }
    for (String cls : patentDoc.getSearchedClassifications()) {
      doc.add(new SortedSetDocValuesField("searched_classification", new BytesRef(cls)));
    }

    return doc;
  }

  /**
   * Commits any pending changes and closes the index writer.  Does nothing if the writer is already closed.
   *
   * @throws IOException if the commit or close fails
   */
  public void commitAndClose() throws IOException {
    if (!indexWriter.isOpen()) {
      return;
    }
    indexWriter.commit();
    indexWriter.close();
  }
}