// StanfordCoreNLPClient.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.io.FileSequentialCollection;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.util.logging.StanfordRedwoodConfiguration;

import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import static edu.stanford.nlp.util.logging.Redwood.Util.*;

/**
 * An annotation pipeline in spirit identical to {@link StanfordCoreNLP}, but
 * with the backend supported by a web server.
 *
 * @author Gabor Angeli
 */
@SuppressWarnings("FieldCanBeLocal")
public class StanfordCoreNLPClient extends AnnotationPipeline  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(StanfordCoreNLPClient.class);

  /**
   * A simple URL spec, for parsing backend URLs of the form
   * {@code [http(s)://]host[:[port]]}.
   * Group 1 is the optional protocol, group 2 is the host, and group 3 is the
   * optional port. The whole {@code :port} suffix is optional, so a bare host
   * such as {@code localhost} or {@code https://example.org} matches (with
   * group 3 left null) instead of being rejected.
   */
  private static final Pattern URL_PATTERN = Pattern.compile("(?:(https?)://)?([^:]+)(?::([0-9]+)?)?");

  /**
   * Describes one connection slot on a remote server.
   * <ul>
   *   <li>Each instance carries the protocol, hostname and port to connect to.</li>
   *   <li>Each instance stands for ONE thread on the remote server, and the client
   *       should treat it that way.</li>
   *   <li>Instances that are .equals() address the same endpoint; several of them
   *       may coexist when more than one thread should run against that endpoint.</li>
   * </ul>
   * Hashing is deliberately unsupported: {@link #hashCode()} always throws.
   */
  private static class Backend {
    /** The protocol to connect to the server with. */
    public final String protocol;
    /** The hostname of the server running the CoreNLP annotators */
    public final String host;
    /** The port of the server running the CoreNLP annotators */
    public final int port;

    /**
     * @param protocol The protocol to connect with.
     * @param host The hostname of the server.
     * @param port The port of the server.
     */
    public Backend(String protocol, String host, int port) {
      this.protocol = protocol;
      this.host = host;
      this.port = port;
    }

    /** Two backends are equal when port, protocol and host all agree. */
    @Override
    public boolean equals(Object o) {
      if (o == this) {
        return true;
      }
      if (o instanceof Backend) {
        Backend that = (Backend) o;
        return this.port == that.port
            && this.protocol.equals(that.protocol)
            && this.host.equals(that.host);
      }
      return false;
    }

    /**
     * Always fails: equal backends are distinct slots, so they must never be
     * collapsed by a hash-based collection.
     *
     * @throws IllegalStateException always
     */
    @Override
    public int hashCode() {
      throw new IllegalStateException("Hashing backends is dangerous!");
    }

    /** Renders the backend as {@code protocol://host:port}. */
    @Override
    public String toString() {
      StringBuilder rendered = new StringBuilder();
      rendered.append(protocol).append("://").append(host).append(":").append(port);
      return rendered.toString();
    }
  }

  /**
   * A special type of {@link Thread}, which is responsible for scheduling jobs
   * on the backend.
   *
   * <p>The thread repeatedly pairs the oldest queued request with a free backend
   * and hands both to the request. All mutable state ({@link #queue},
   * {@link #freeAnnotators} and {@link #doRun}) is guarded by {@link #stateLock}.</p>
   */
  private static class BackendScheduler extends Thread {
    /**
     * The list of backends that we can schedule on.
     * This should not generally be called directly from anywhere
     */
    public final List<Backend> backends;

    /**
     * The queue on requests for the scheduler to handle.
     * Each element of this queue is a function: calling the function signals
     * that this backend is available to perform a task on the passed backend.
     * It is then obligated to call the passed Consumer to signal that it has
     * released control of the backend, and it can be used for other things.
     * Remember to lock access to this object with {@link BackendScheduler#stateLock}.
     */
    private final Queue<BiConsumer<Backend, Consumer<Backend>>> queue;
    /**
     * The lock on access to {@link BackendScheduler#queue},
     * {@link BackendScheduler#freeAnnotators} and {@link BackendScheduler#doRun}.
     */
    private final Lock stateLock = new ReentrantLock();
    /**
     * Represents the event that an item has been added to the work queue.
     * Linked to {@link BackendScheduler#stateLock}.
     */
    private final Condition enqueued = stateLock.newCondition();
    /**
     * Represents the event that the queue has become empty, and this schedule is no
     * longer needed.
     */
    public final Condition shouldShutdown = stateLock.newCondition();

    /**
     * The queue of annotators (backends) that are free to be run on.
     * Remember to lock access to this object with {@link BackendScheduler#stateLock}.
     */
    private final Queue<Backend> freeAnnotators;
    /**
     * Represents the event that an annotator has freed up and is available for
     * work on the {@link BackendScheduler#freeAnnotators} queue.
     * Linked to {@link BackendScheduler#stateLock}.
     */
    private final Condition newlyFree = stateLock.newCondition();

    /**
     * While this is true, continue running the scheduler.
     * Read and written only while holding {@link BackendScheduler#stateLock}.
     */
    private boolean doRun = true;

    /**
     * Create a new scheduler from a list of backends.
     * These can contain duplicates -- in that case, that many concurrent
     * calls can be made to that backend.
     *
     * @param backends The backends to schedule on; every entry starts out free.
     */
    public BackendScheduler(List<Backend> backends) {
      super();
      setDaemon(true);
      this.backends = backends;
      this.freeAnnotators = new LinkedList<>(backends);
      this.queue = new LinkedList<>();
    }

    /**
     * The scheduling loop: wait for a request, wait for a free backend, then
     * dispatch the request on that backend. Returns once {@link #doRun} has been
     * cleared.
     */
    @Override
    public void run() {
      try {
        while (true) {
          BiConsumer<Backend, Consumer<Backend>> request;
          Backend annotator;
          stateLock.lock();
          try {
            // Wait for a request. The stop flag is tested BEFORE every await (and
            // under the lock), so a shutdown signal that was sent while this thread
            // was busy dispatching cannot be missed.
            while (doRun && queue.isEmpty()) {
              enqueued.await();
            }
            if (!doRun) {
              return;
            }

            // Find a free annotator. The request stays on the queue while we wait,
            // so the "queue is empty and every backend is free" test used to signal
            // shutdown still sees this request as pending work.
            while (freeAnnotators.isEmpty()) {
              newlyFree.await();
            }

            // Only this thread removes from the queue, so both polls succeed.
            request = queue.poll();
            annotator = freeAnnotators.poll();
          } finally {
            stateLock.unlock();
          }

          // Run the annotation outside the lock; the request hands the backend
          // back through release() when it is done (in parallel, most likely).
          request.accept(annotator, this::release);
        }
      } catch (InterruptedException e) {
        // Preserve the interrupt status for anything observing this thread.
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
      }
    }

    /**
     * Called asynchronously by a request when it has finished with a backend:
     * puts the backend back on the free queue and wakes up anyone waiting for it.
     *
     * @param freedAnnotator The backend that is available again.
     */
    private void release(Backend freedAnnotator) {
      stateLock.lock();
      try {
        freeAnnotators.add(freedAnnotator);

        // If the queue is empty, and all the annotators have returned, we're done
        if (queue.isEmpty() && freeAnnotators.size() == backends.size()) {
          log.debug("All annotations completed. Signaling for shutdown");
          shouldShutdown.signalAll();
        }

        newlyFree.signal();
      } finally {
        stateLock.unlock();
      }
    }

    /**
     * Schedule a new job on the backend
     * @param annotate A callback, which will be called when a backend is free
     *                 to do some processing. The implementation of this callback
     *                 MUST CALL the second argument when it is done processing,
     *                 to register the backend as free for further work.
     */
    public void schedule(BiConsumer<Backend, Consumer<Backend>> annotate) {
      stateLock.lock();
      try {
        queue.add(annotate);
        enqueued.signal();
      } finally {
        stateLock.unlock();
      }
    }
  } // end static class BackEndScheduler

  /**
   * The path on the server to connect to. It is the empty string, and is placed
   * directly before the {@code ?properties=...} query string when the request URL
   * is built.
   */
  private final String path = "";
  /**
   * The properties this client was created with. Used for the local fallback
   * pipeline and for the command-line options read in {@code run()} and the shell.
   */
  private final Properties properties;

  /**
   * The properties to send to the server, serialized as a JSON object. This is a
   * copy of {@link #properties} with the input/output format and serializer
   * entries overridden by the constructor.
   */
  private final String propsAsJSON;

  /** The API key to authenticate with (sent as the Basic-auth username), or null */
  private final String apiKey;
  /** The API secret to authenticate with (sent as the Basic-auth password), or null */
  private final String apiSecret;

  /** The scheduler to use when running on multiple backends at a time */
  private final BackendScheduler scheduler;

  /**
   * The annotation serializer responsible for translating between the wire format
   * (protocol buffers) and the {@link Annotation} classes.
   * NOTE(review): the meaning of the {@code true} constructor argument is not
   * visible in this file -- confirm against ProtobufAnnotationSerializer.
   */
  private final ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer(true);

  /**
   * The main constructor. Builds a client from pipeline properties and a list of
   * backends, and starts the (daemon) scheduler thread before returning.
   *
   * <p>The passed backend list is shuffled in place. The properties sent to the
   * server are a copy of {@code properties} in which the input/output formats are
   * forced to {@code serialized} and both serializers are forced to
   * {@link ProtobufAnnotationSerializer}; the caller's object is left untouched.</p>
   *
   * @param properties The properties, as would be passed to {@link StanfordCoreNLP}.
   * @param backends The backends to run on.
   * @param apiKey The key to authenticate with as a username
   * @param apiSecret The key to authenticate with as a password
   */
  private StanfordCoreNLPClient(Properties properties, List<Backend> backends,
                                String apiKey, String apiSecret) {
    this.properties = properties;

    // Copy the caller's string properties so the overrides below stay private to the wire format
    final Properties forServer = new Properties();
    properties.stringPropertyNames().forEach(name -> forServer.setProperty(name, properties.getProperty(name)));

    // Randomize the order in which backends are handed out, then build the scheduler on them
    Collections.shuffle(backends, new Random(System.currentTimeMillis()));
    this.scheduler = new BackendScheduler(backends);
    this.apiKey = apiKey;
    this.apiSecret = apiSecret;

    // The client always talks protocol buffers in both directions
    final String serializerClass = ProtobufAnnotationSerializer.class.getName();
    forServer.setProperty("inputFormat", "serialized");
    forServer.setProperty("outputFormat", "serialized");
    forServer.setProperty("inputSerializer", serializerClass);
    forServer.setProperty("outputSerializer", serializerClass);

    // Render each property as a JSON map element ("key": "value"), then wrap them in an object
    final List<String> jsonEntries = new ArrayList<>();
    for (String name : forServer.stringPropertyNames()) {
      String cleanName = JSONOutputter.cleanJSON(name);
      String cleanValue = JSONOutputter.cleanJSON(forServer.getProperty(name));
      jsonEntries.add('"' + cleanName + "\": \"" + cleanValue + '"');
    }
    this.propsAsJSON = "{ " + StringUtils.join(jsonEntries, ", ") + " }";

    // Start 'er up
    this.scheduler.start();
  }


  /**
   * The main constructor without credentials: delegates with a null API key and
   * a null API secret, so no Authorization header is sent.
   *
   * @param properties The properties, as would be passed to {@link StanfordCoreNLP}.
   * @param backends The backends to run on.
   * @see StanfordCoreNLPClient#StanfordCoreNLPClient(Properties, List, String, String)
   */
  private StanfordCoreNLPClient(Properties properties, List<Backend> backends) {
    this(properties, backends, (String) null, (String) null);
  }


  /**
   * Create a client whose connection details come from the environment.
   * The following environment variables are read:
   *
   * <ul>
   *   <li>CORENLP_HOST -- required; the port is 80 when the value starts with
   *       {@code http://} and 443 otherwise</li>
   *   <li>CORENLP_KEY -- optional; passed on as the API key (null when unset)</li>
   *   <li>CORENLP_SECRET -- optional; passed on as the API secret (null when unset)</li>
   * </ul>
   *
   * A single thread is used on the backend.
   *
   * @param properties The properties, as would be passed to {@link StanfordCoreNLP}.
   * @throws IllegalStateException Thrown if CORENLP_HOST is not set.
   */
  @SuppressWarnings("unused")
  public StanfordCoreNLPClient(Properties properties) throws IllegalStateException {
    this(properties,
        Optional.ofNullable(System.getenv("CORENLP_HOST"))
            .orElseThrow(() -> new IllegalStateException("Environment variable CORENLP_HOST not specified")),
        Optional.ofNullable(System.getenv("CORENLP_HOST"))
            .filter(envHost -> envHost.startsWith("http://"))
            .map(envHost -> 80)
            .orElse(443),
        1,
        System.getenv("CORENLP_KEY"),
        System.getenv("CORENLP_SECRET"));
  }


  /**
   * Create a client for a single backend, using one thread on that backend and
   * no credentials.
   *
   * @param properties The properties, as would be passed to {@link StanfordCoreNLP}.
   * @param host The server host, optionally prefixed with {@code http://} or {@code https://}.
   * @param port The server port.
   * @see #StanfordCoreNLPClient(Properties, String, int, int)
   */
  @SuppressWarnings("unused")
  public StanfordCoreNLPClient(Properties properties, String host, int port) {
    this(properties, host, port, 1);
  }

  /**
   * Create an authenticated client for a single backend, using one thread on
   * that backend.
   *
   * @param properties The properties, as would be passed to {@link StanfordCoreNLP}.
   * @param host The server host, optionally prefixed with {@code http://} or {@code https://}.
   * @param port The server port.
   * @param apiKey The key to authenticate with as a username
   * @param apiSecret The key to authenticate with as a password
   * @see #StanfordCoreNLPClient(Properties, String, int, int, String, String)
   */
  @SuppressWarnings("unused")
  public StanfordCoreNLPClient(Properties properties, String host, int port,
                               String apiKey, String apiSecret) {
    this(properties, host, port, 1, apiKey, apiSecret);
  }


  /**
   * Create an authenticated client for a single backend, inferring the port from
   * the host string: 80 when the host starts with {@code http://}, 443 otherwise.
   * One thread is used on the backend.
   *
   * @param properties The properties, as would be passed to {@link StanfordCoreNLP}.
   * @param host The server host, optionally prefixed with {@code http://} or {@code https://}.
   * @param apiKey The key to authenticate with as a username
   * @param apiSecret The key to authenticate with as a password
   * @see #StanfordCoreNLPClient(Properties, String, int, int, String, String)
   */
  @SuppressWarnings("unused")
  public StanfordCoreNLPClient(Properties properties, String host,
                               String apiKey, String apiSecret) {
    this(properties, host, (host.startsWith("http://") ? 80 : 443), 1, apiKey, apiSecret);
  }

  /**
   * Create a client for a single backend without credentials, allowing up to
   * {@code threads} concurrent requests against it.
   *
   * @param properties The properties, as would be passed to {@link StanfordCoreNLP}.
   * @param host The server host, optionally prefixed with {@code http://} or {@code https://}.
   * @param port The server port.
   * @param threads The number of concurrent request slots to create for the backend.
   * @see #StanfordCoreNLPClient(Properties, String, int, int, String, String)
   */
  @SuppressWarnings("unused")
  public StanfordCoreNLPClient(Properties properties, String host, int port, int threads) {
    this(properties, host, port, threads, (String) null, (String) null);
  }


  /**
   * Run on a single backend, but with k threads on each backend, and with authentication
   *
   * @see StanfordCoreNLPClient (Properties, List)
   */
  public StanfordCoreNLPClient(Properties properties, String host, int port, int threads,
                               String apiKey, String apiSecret) {
    this(properties, new ArrayList<Backend>() {{
      for (int i = 0; i < threads; ++i) {
        add(new Backend(host.startsWith("http://") ? "http" : "https",
            host.startsWith("http://") ? host.substring("http://".length()) : (host.startsWith("https://") ? host.substring("https://".length()) : host),
            port));
      }
    }},
    apiKey, apiSecret);
  }

  /**
   * {@inheritDoc}
   *
   * This method creates an async call to the server, and blocks until the server
   * has finished annotating the object.
   *
   * <p>Completion is tracked with a flag guarded by a lock, so the call returns
   * correctly even when the callback fires before this thread starts waiting,
   * and spurious wakeups are ignored. If the waiting thread is interrupted, the
   * method logs, restores the interrupt status and returns without waiting
   * further.</p>
   */
  @Override
  public void annotate(Annotation annotation) {
    final Lock lock = new ReentrantLock();
    final Condition annotationDone = lock.newCondition();
    // Guarded by 'lock'. A one-element array so that the lambda below can set it.
    final boolean[] finished = {false};

    annotate(Collections.singleton(annotation), 1, (Annotation annInput) -> {
      lock.lock();
      try {
        finished[0] = true;
        annotationDone.signalAll();
      } finally {
        lock.unlock();
      }
    });

    lock.lock();
    try {
      // Only wait for one callback to complete; only annotating one document
      while (!finished[0]) {
        annotationDone.await();
      }
    } catch (InterruptedException e) {
      log.info("Interrupt while waiting for annotation to return");
      Thread.currentThread().interrupt();
    } finally {
      lock.unlock();
    }
  }

  /**
   * Schedules one asynchronous server request per input annotation and returns
   * without waiting for them. The callback is invoked once for each annotation
   * as its request finishes.
   *
   * @param annotations The input annotations to process
   * @param numThreads The number of threads to run on. IGNORED in this class.
   * @param callback A function to be called when an annotation finishes.
   */
  @Override
  public void annotate(final Iterable<Annotation> annotations, int numThreads, final Consumer<Annotation> callback){
    annotations.forEach(document -> annotate(document, callback));
  }


  /**
   * The canonical entry point of the client annotator.
   * Schedules a job which, once a backend is free, starts a new thread that
   * serializes the annotation, sends it to the server, and copies the response
   * back into the annotation. If anything goes wrong on the server path, the
   * annotation is processed with a local {@link StanfordCoreNLP} pipeline instead.
   *
   * <p>The backend is always handed back to the scheduler when the job ends,
   * even if the local fallback or the callback itself throws; otherwise the
   * backend slot would be lost for all later requests.</p>
   *
   * @param annotation The annotation to annotate.
   * @param callback Called when the job has finished with the document.
   *                 The input to this callback is the same as the passed Annotation object.
   */
  public void annotate(final Annotation annotation, final Consumer<Annotation> callback) {
    scheduler.schedule((Backend backend, Consumer<Backend> isFinishedCallback) -> new Thread(() -> {
      try {
        // 1. Create the input
        // 1.1 Create a protocol buffer
        final byte[] message;
        try (ByteArrayOutputStream os = new ByteArrayOutputStream()) {
          serializer.write(annotation, os);
          message = os.toByteArray();
        }
        // 1.2 Create the query params
        String queryParams = String.format(
            "properties=%s",
            URLEncoder.encode(StanfordCoreNLPClient.this.propsAsJSON, "utf-8"));

        // 2. Create a connection
        URL serverURL = new URL(backend.protocol, backend.host,
            backend.port,
            StanfordCoreNLPClient.this.path + '?' + queryParams);

        // 3. Do the annotation (retries internally, and throws once the retries are used up)
        doAnnotation(annotation, backend, serverURL, message, 0);
      } catch (Throwable t) {
        log.warn("Could not annotate via server! Trying to annotate locally...", t);
        StanfordCoreNLP corenlp = new StanfordCoreNLP(properties);
        corenlp.annotate(annotation);
      } finally {
        try {
          callback.accept(annotation);
        } finally {
          // Must run even when the callback throws, or this backend is never free again
          isFinishedCallback.accept(backend);
        }
      }
    }).start());
  }


  /**
   * Actually try to perform the annotation on the server side.
   * This is factored out so that a failed attempt can be retried: any failure
   * while {@code tries} is below 3 is logged and followed by another attempt, so
   * a call starting at {@code tries == 0} makes at most four attempts in total.
   *
   * <p>The request and response streams are closed after each attempt, and the
   * Basic-auth credentials are encoded as UTF-8 regardless of the platform
   * default charset.</p>
   *
   * @param annotation The annotation we need to fill.
   * @param backend The backend we are querying against.
   * @param serverURL The URL of the server we are hitting.
   * @param message The message we are sending the server (don't need to recompute each retry).
   * @param tries The number of times we've tried already.
   * @throws RuntimeException wrapping the last failure, once the retries are used up.
   */
  @SuppressWarnings("unchecked")
  private void doAnnotation(Annotation annotation, Backend backend, URL serverURL, byte[] message, int tries) {

    try {
      // 1. Set up the connection
      URLConnection connection = serverURL.openConnection();
      // 1.1 Set authentication
      if (apiKey != null && apiSecret != null) {
        String userpass = apiKey + ":" + apiSecret;
        String basicAuth = "Basic " + Base64.getEncoder().encodeToString(userpass.getBytes(StandardCharsets.UTF_8));
        connection.setRequestProperty("Authorization", basicAuth);
      }
      // 1.2 Set some protocol-independent properties
      connection.setDoOutput(true);
      connection.setRequestProperty("Content-Type", "application/x-protobuf");
      connection.setRequestProperty("Content-Length", Integer.toString(message.length));
      connection.setRequestProperty("Accept-Charset", "utf-8");
      connection.setRequestProperty("User-Agent", StanfordCoreNLPClient.class.getName());
      // 1.3 Set some protocol-dependent properties
      switch (backend.protocol) {
        case "https":
        case "http":
          ((HttpURLConnection) connection).setRequestMethod("POST");
          break;
        default:
          throw new IllegalStateException("Haven't implemented protocol: " + backend.protocol);
      }

      // 2. Annotate
      // 2.1. Fire off the request
      connection.connect();
      try (OutputStream requestBody = connection.getOutputStream()) {
        requestBody.write(message);
        requestBody.flush();
      }
      // 2.2 Await a response
      // -- It might be possible to send more than one message, but we are not going to do that.
      Annotation response;
      try (InputStream responseBody = connection.getInputStream()) {
        response = serializer.read(responseBody).first;
      }
      // 2.3. Copy response over to original annotation
      for (Class key : response.keySet()) {
        annotation.set(key, response.get(key));
      }

    } catch (Throwable t) {
      // 3. We encountered an error -- retry
      if (tries < 3) {
        log.warn(t);
        doAnnotation(annotation, backend, serverURL, message, tries + 1);
      } else {
        throw new RuntimeException(t);
      }
    }
  }


  /**
   * Wraps the given text in a new {@link Annotation}, runs the blocking
   * {@link #annotate(Annotation)} on it, and hands the same object back.
   *
   * @param text The text to process
   * @return The Annotation created for the text, after annotation has run
   */
  public Annotation process(String text) {
    final Annotation document = new Annotation(text);
    this.annotate(document);
    return document;
  }


  /**
   * Runs an interactive shell where input text is processed with the given pipeline.
   * Each non-empty line is annotated and printed to stdout in the format named by
   * the {@code outputFormat} property (default {@code text}). The property value
   * is upper-cased with {@link Locale#ROOT} before the enum lookup, so the lookup
   * does not depend on the default locale.
   *
   * @param pipeline The pipeline to be used
   * @throws IOException If IO problem with stdin
   * @throws IllegalArgumentException If the {@code outputFormat} property does not
   *                                  name a known output format
   */
  private static void shell(StanfordCoreNLPClient pipeline) throws IOException {
    log.info("Entering interactive shell. Type q RETURN or EOF to quit.");
    final String formatName = pipeline.properties.getProperty("outputFormat", "text").toUpperCase(Locale.ROOT);
    final StanfordCoreNLP.OutputFormat outputFormat = StanfordCoreNLP.OutputFormat.valueOf(formatName);
    IOUtils.console("NLP> ", line -> {
      if ( ! line.isEmpty()) {
        Annotation anno = pipeline.process(line);
        try {
          switch (outputFormat) {
            case XML:
              new XMLOutputter().print(anno, System.out);
              break;
            case JSON:
              new JSONOutputter().print(anno, System.out);
              System.out.println();
              break;
            case CONLL:
              new CoNLLOutputter().print(anno, System.out);
              System.out.println();
              break;
            case TEXT:
              new TextOutputter().print(anno, System.out);
              break;
            case SERIALIZED:
              warn("You probably cannot read the serialized output, so printing in text instead");
              new TextOutputter().print(anno, System.out);
              break;
            default:
              throw new IllegalArgumentException("Cannot output in format " + outputFormat + " from the interactive shell");
          }
        } catch (IOException e) {
          throw new RuntimeIOException(e);
        }
      }
    });
  }

  /**
   * The implementation of what to run on a command-line call of the client.
   * Depending on the properties this either:
   * <ul>
   *   <li>processes one file or a directory of files ({@code file} or {@code textFile}),</li>
   *   <li>processes the files named in a list file ({@code filelist}), expanding any
   *       listed directory into the files it contains, or</li>
   *   <li>starts the interactive shell.</li>
   * </ul>
   * The {@code outputFormat} property (default {@code text}) is upper-cased with
   * {@link Locale#ROOT} before the enum lookup, so the lookup does not depend on
   * the default locale.
   *
   * @throws IOException If any IO problem
   */
  public void run() throws IOException {
    StanfordRedwoodConfiguration.minimalSetup();
    StanfordCoreNLP.OutputFormat outputFormat = StanfordCoreNLP.OutputFormat.valueOf(
        properties.getProperty("outputFormat", "text").toUpperCase(Locale.ROOT));

    //
    // Process one file or a directory of files
    //
    if (properties.containsKey("file") || properties.containsKey("textFile")) {
      String fileName = properties.getProperty("file");
      if (fileName == null) {
        fileName = properties.getProperty("textFile");
      }
      Collection<File> files = new FileSequentialCollection(new File(fileName), properties.getProperty("extension"), true);
      StanfordCoreNLP.processFiles(null, files, 1, properties, this::annotate,
          StanfordCoreNLP.createOutputter(properties, new AnnotationOutputter.Options()), outputFormat);
    }

    //
    // Process a list of files
    //
    else if (properties.containsKey("filelist")){
      String fileName = properties.getProperty("filelist");
      Collection<File> inputFiles = StanfordCoreNLP.readFileList(fileName);
      Collection<File> files = new ArrayList<>(inputFiles.size());
      for (File file : inputFiles) {
        if (file.isDirectory()) {
          // Expand the listed directory itself, not the path of the list file
          files.addAll(new FileSequentialCollection(file, properties.getProperty("extension"), true));
        } else {
          files.add(file);
        }
      }
      StanfordCoreNLP.processFiles(null, files, 1, properties, this::annotate,
          StanfordCoreNLP.createOutputter(properties, new AnnotationOutputter.Options()), outputFormat);
    }

    //
    // Run the interactive shell
    //
    else {
      shell(this);
    }
  }

  /**
   * <p>
   *   Good practice to call after you are done with this object.
   *   Blocks until the scheduler has no queued requests and every backend has been
   *   returned, then tells the scheduler thread to stop.
   * </p>
   *
   * <p>
   *   While waiting, the idle condition is re-checked whenever the scheduler
   *   signals that it may shut down, and at least every five seconds.
   * </p>
   *
   * @throws InterruptedException If the calling thread is interrupted while waiting.
   */
  public void shutdown() throws InterruptedException {
    final BackendScheduler sched = scheduler;
    sched.stateLock.lock();
    try {
      for (;;) {
        boolean idle = sched.queue.isEmpty() && sched.freeAnnotators.size() == sched.backends.size();
        if (idle) {
          break;
        }
        sched.shouldShutdown.await(5, TimeUnit.SECONDS);
      }
      sched.doRun = false;
      sched.enqueued.signalAll();  // In case the thread's waiting on this condition
    } finally {
      sched.stateLock.unlock();
    }
  }


  /**
   * This can be used just for testing or for command-line text processing.
   * It builds a client from the command-line properties, runs it (see
   * {@link #run()}), and then shuts it down.
   * <p>
   * Example usage:<br>
   * java -mx6g edu.stanford.nlp.pipeline.StanfordCoreNLPClient -props properties -backends site1:port1,site2:port2 <br>
   *    or just -host name -port number
   * <p>
   * Backends are taken from {@code -backends} (a comma-separated list), or else
   * from {@code -host} and optionally {@code -port}, or else default to
   * {@code http://localhost:9000}. A backend without a protocol uses http. A
   * backend without a port uses 443 when its protocol is https and 80 otherwise.
   * Specifications that cannot be parsed are reported and skipped.
   *
   * @param args List of required properties
   * @throws java.io.IOException If IO problem
   * @throws ClassNotFoundException If class loading problem
   * @throws IllegalArgumentException If no usable backend could be parsed, since a
   *                                  client without backends would wait forever.
   */
  public static void main(String[] args) throws IOException, ClassNotFoundException {
    //
    // process the arguments
    //
    // extract all the properties from the command line
    Properties props = StringUtils.argsToProperties(args);
    boolean hasH = props.containsKey("h");
    boolean hasHelp = props.containsKey("help");
    if (hasH || hasHelp) {
      String helpValue = hasH ? props.getProperty("h") : props.getProperty("help");
      StanfordCoreNLP.printHelp(System.err, helpValue);
      return;
    }

    // Work out which backend specifications to use
    String backendSpecs = props.getProperty("backends");
    if (backendSpecs == null) {
      String host = props.getProperty("host");
      String port = props.getProperty("port");
      if (host == null) {
        backendSpecs = "http://localhost:9000";
      } else if (port == null) {
        backendSpecs = host;
      } else {
        backendSpecs = host + ':' + port;
      }
    }

    // Create the backends
    List<Backend> backends = new ArrayList<>();
    for (String spec : backendSpecs.split(",")) {
      Matcher matcher = URL_PATTERN.matcher(spec.trim());
      if (!matcher.matches()) {
        log.warn("Ignoring backend specification that could not be parsed: '" + spec + "'");
        continue;
      }
      String protocol = matcher.group(1);
      if (protocol == null) {
        protocol = "http";
      }
      String host = matcher.group(2);
      String portStr = matcher.group(3);
      int port;
      if (portStr != null) {
        port = Integer.parseInt(portStr);
      } else {
        // No explicit port: use the conventional port of the protocol
        port = "https".equals(protocol) ? 443 : 80;
      }
      backends.add(new Backend(protocol, host, port));
    }
    if (backends.isEmpty()) {
      throw new IllegalArgumentException("No usable backends found in '" + backendSpecs
          + "'; expected a comma-separated list like host1:port1,host2:port2");
    }
    log.info("Using backends: " + backends);

    // Run the pipeline
    StanfordCoreNLPClient client = new StanfordCoreNLPClient(props, backends);
    client.run();
    try {
      client.shutdown();  // In case anything is pending on the server
    } catch (InterruptedException e) {
      // We are exiting anyway; just keep the interrupt status visible to the caller
      Thread.currentThread().interrupt();
    }
  } // end main()

}