package edu.stanford.nlp.pipeline; import edu.stanford.nlp.io.FileSequentialCollection; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.io.RuntimeIOException; import edu.stanford.nlp.util.StringUtils; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.util.logging.StanfordRedwoodConfiguration; import java.io.*; import java.net.*; import java.util.*; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.util.function.BiConsumer; import java.util.function.Consumer; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import static edu.stanford.nlp.util.logging.Redwood.Util.*; /** * An annotation pipeline in spirit identical to {@link StanfordCoreNLP}, but * with the backend supported by a web server. * * @author Gabor Angeli */ @SuppressWarnings("FieldCanBeLocal") public class StanfordCoreNLPClient extends AnnotationPipeline { /** A logger for this class */ private static final Redwood.RedwoodChannels log = Redwood.channels(StanfordCoreNLPClient.class); /** A simple URL spec, for parsing backend URLs */ private static final Pattern URL_PATTERN = Pattern.compile("(?:(https?)://)?([^:]+):([0-9]+)?"); /** * Information on how to connect to a backend. * The semantics of one of these objects is as follows: * <ul> * <li>It should define a hostname and port to connect to.</li> * <li>This represents ONE thread on the remote server. The client should * treat it as such.</li> * <li>Two backends that are .equals() point to the same endpoint, but there can be * multiple of them if we want to run multiple threads on that endpoint.</li> * </ul> */ private static class Backend { /** The protocol to connect to the server with. */ public final String protocol; /** The hostname of the server running the CoreNLP annotators */ public final String host; /** The port of the server running the CoreNLP annotators */ public final int port; public Backend(String protocol, String host, int port) { this.protocol = protocol; this.host = host; this.port = port; } @Override public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof Backend)) return false; Backend backend = (Backend) o; return port == backend.port && protocol.equals(backend.protocol) && host.equals(backend.host); } @Override public int hashCode() { throw new IllegalStateException("Hashing backends is dangerous!"); } @Override public String toString() { return protocol + "://" + host + ":" + port; } } /** * A special type of {@link Thread}, which is responsible for scheduling jobs * on the backend. */ private static class BackendScheduler extends Thread { /** * The list of backends that we can schedule on. * This should not generally be called directly from anywhere */ public final List<Backend> backends; /** * The queue on requests for the scheduler to handle. * Each element of this queue is a function: calling the function signals * that this backend is available to perform a task on the passed backend. * It is then obligated to call the passed Consumer to signal that it has * released control of the backend, and it can be used for other things. * Remember to lock access to this object with {@link BackendScheduler#stateLock}. */ private final Queue<BiConsumer<Backend, Consumer<Backend>>> queue; /** * The lock on access to {@link BackendScheduler#queue}. */ private final Lock stateLock = new ReentrantLock(); /** * Represents the event that an item has been added to the work queue. * Linked to {@link BackendScheduler#stateLock}. */ private final Condition enqueued = stateLock.newCondition(); /** * Represents the event that the queue has become empty, and this schedule is no * longer needed. */ public final Condition shouldShutdown = stateLock.newCondition(); /** * The queue of annotators (backends) that are free to be run on. * Remember to lock access to this object with {@link BackendScheduler#stateLock}. */ private final Queue<Backend> freeAnnotators; /** * Represents the event that an annotator has freed up and is available for * work on the {@link BackendScheduler#freeAnnotators} queue. * Linked to {@link BackendScheduler#stateLock}. */ private final Condition newlyFree = stateLock.newCondition(); /** * While this is true, continue running the scheduler. */ private boolean doRun = true; /** * Create a new scheduler from a list of backends. * These can contain duplicates -- in that case, that many concurrent * calls can be made to that backend. */ public BackendScheduler(List<Backend> backends) { super(); setDaemon(true); this.backends = backends; this.freeAnnotators = new LinkedList<>(backends); this.queue = new LinkedList<>(); } /** {@inheritDoc} */ @Override public void run() { try { while (doRun) { // Wait for a request BiConsumer<Backend, Consumer<Backend>> request; Backend annotator; stateLock.lock(); try { while (queue.isEmpty()) { enqueued.await(); if (!doRun) { return; } } // Get the actual request request = queue.poll(); // We have a request // Find a free annotator while (freeAnnotators.isEmpty()) { newlyFree.await(); } annotator = freeAnnotators.poll(); } finally { stateLock.unlock(); } // We have an annotator // Run the annotation request.accept(annotator, freedAnnotator -> { // ASYNC: we've freed this annotator // add it back to the queue and register it as available stateLock.lock(); try { freeAnnotators.add(freedAnnotator); // If the queue is empty, and all the annotators have returned, we're done if (queue.isEmpty() && freeAnnotators.size() == backends.size()) { log.debug("All annotations completed. Signaling for shutdown"); shouldShutdown.signalAll(); } newlyFree.signal(); } finally { stateLock.unlock(); } }); // Annotator is running (in parallel, most likely) } } catch (InterruptedException e) { throw new RuntimeException(e); } } /** * Schedule a new job on the backend * @param annotate A callback, which will be called when a backend is free * to do some processing. The implementation of this callback * MUST CALL the second argument when it is done processing, * to register the backend as free for further work. */ public void schedule(BiConsumer<Backend, Consumer<Backend>> annotate) { stateLock.lock(); try { queue.add(annotate); enqueued.signal(); } finally { stateLock.unlock(); } } } // end static class BackEndScheduler /** The path on the server to connect to. */ private final String path = ""; /** The Properties file to annotate with. */ private final Properties properties; /** The Properties file to send to the server, serialized as JSON. */ private final String propsAsJSON; /** The API key to authenticate with, or null */ private final String apiKey; /** The API secret to authenticate with, or null */ private final String apiSecret; /** The scheduler to use when running on multiple backends at a time */ private final BackendScheduler scheduler; /** * The annotation serializer responsible for translating between the wire format * (protocol buffers) and the {@link Annotation} classes. */ private final ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer(true); /** * The main constructor. Create a client from a properties file and a list of backends. * Note that this creates at least one Daemon thread. * * @param properties The properties file, as would be passed to {@link StanfordCoreNLP}. * @param backends The backends to run on. * @param apiKey The key to authenticate with as a username * @param apiSecret The key to authenticate with as a password */ private StanfordCoreNLPClient(Properties properties, List<Backend> backends, String apiKey, String apiSecret) { // Save the constructor variables this.properties = properties; Properties serverProperties = new Properties(); for (String key : properties.stringPropertyNames()) { serverProperties.setProperty(key, properties.getProperty(key)); } Collections.shuffle(backends, new Random(System.currentTimeMillis())); this.scheduler = new BackendScheduler(backends); this.apiKey = apiKey; this.apiSecret = apiSecret; // Set required serverProperties serverProperties.setProperty("inputFormat", "serialized"); serverProperties.setProperty("outputFormat", "serialized"); serverProperties.setProperty("inputSerializer", ProtobufAnnotationSerializer.class.getName()); serverProperties.setProperty("outputSerializer", ProtobufAnnotationSerializer.class.getName()); // Create a list of all the properties, as JSON map elements List<String> jsonProperties = serverProperties.stringPropertyNames().stream().map(key -> '"' + JSONOutputter.cleanJSON(key) + "\": \"" + JSONOutputter .cleanJSON(serverProperties.getProperty(key)) + '"') .collect(Collectors.toList()); // Create the JSON object this.propsAsJSON = "{ " + StringUtils.join(jsonProperties, ", ") + " }"; // Start 'er up this.scheduler.start(); } /** * The main constructor without credentials. * * @see StanfordCoreNLPClient#StanfordCoreNLPClient(Properties, List, String, String) */ private StanfordCoreNLPClient(Properties properties, List<Backend> backends) { this(properties, backends, null, null); } /** * Run the client, pulling credentials from the environment. * Throws an IllegalStateException if the required environment variables aren't set. * These are: * * <ul> * <li>CORENLP_HOST</li> * <li>CORENLP_KEY</li> * <li>CORENLP_SECRET</li> * </ul> * * @throws IllegalStateException Thrown if we could not read the required environment variables. */ @SuppressWarnings("unused") public StanfordCoreNLPClient(Properties properties) throws IllegalStateException { this(properties, Optional.ofNullable(System.getenv("CORENLP_HOST")).orElseThrow(() -> new IllegalStateException("Environment variable CORENLP_HOST not specified")), Optional.ofNullable(System.getenv("CORENLP_HOST")).map(x -> x.startsWith("http://") ? 80 : 443).orElse(443), 1, Optional.ofNullable(System.getenv("CORENLP_KEY")).orElse(null), Optional.ofNullable(System.getenv("CORENLP_SECRET")).orElse(null) ); } /** * Run on a single backend. * * @see StanfordCoreNLPClient (Properties, List) */ @SuppressWarnings("unused") public StanfordCoreNLPClient(Properties properties, String host, int port) { this(properties, host, port, 1); } /** * Run on a single backend, with authentication * * @see StanfordCoreNLPClient (Properties, List) */ @SuppressWarnings("unused") public StanfordCoreNLPClient(Properties properties, String host, int port, String apiKey, String apiSecret) { this(properties, host, port, 1, apiKey, apiSecret); } /** * Run on a single backend, with authentication * * @see StanfordCoreNLPClient (Properties, List) */ @SuppressWarnings("unused") public StanfordCoreNLPClient(Properties properties, String host, String apiKey, String apiSecret) { this(properties, host, host.startsWith("http://") ? 80 : 443, 1, apiKey, apiSecret); } /** * Run on a single backend, but with k threads on each backend. * * @see StanfordCoreNLPClient (Properties, List) */ @SuppressWarnings("unused") public StanfordCoreNLPClient(Properties properties, String host, int port, int threads) { this(properties, host, port, threads, null, null); } /** * Run on a single backend, but with k threads on each backend, and with authentication * * @see StanfordCoreNLPClient (Properties, List) */ public StanfordCoreNLPClient(Properties properties, String host, int port, int threads, String apiKey, String apiSecret) { this(properties, new ArrayList<Backend>() {{ for (int i = 0; i < threads; ++i) { add(new Backend(host.startsWith("http://") ? "http" : "https", host.startsWith("http://") ? host.substring("http://".length()) : (host.startsWith("https://") ? host.substring("https://".length()) : host), port)); } }}, apiKey, apiSecret); } /** * {@inheritDoc} * * This method creates an async call to the server, and blocks until the server * has finished annotating the object. */ @Override public void annotate(Annotation annotation) { final Lock lock = new ReentrantLock(); final Condition annotationDone = lock.newCondition(); annotate(Collections.singleton(annotation), 1, (Annotation annInput) -> { try { lock.lock(); annotationDone.signal(); } finally { lock.unlock(); } }); try { lock.lock(); annotationDone.await(); // Only wait for one callback to complete; only annotating one document } catch (InterruptedException e) { log.info("Interrupt while waiting for annotation to return"); } finally { lock.unlock(); } } /** * This method fires off a request to the server. Upon returning, it calls the provided * callback method. * * @param annotations The input annotations to process * @param numThreads The number of threads to run on. IGNORED in this class. * @param callback A function to be called when an annotation finishes. */ @Override public void annotate(final Iterable<Annotation> annotations, int numThreads, final Consumer<Annotation> callback){ for (Annotation annotation : annotations) { annotate(annotation, callback); } } /** * The canonical entry point of the client annotator. * Create an HTTP request, send this annotation to the server, and await a response. * * @param annotation The annotation to annotate. * @param callback Called when the server has returned an annotated document. * The input to this callback is the same as the passed Annotation object. */ public void annotate(final Annotation annotation, final Consumer<Annotation> callback) { scheduler.schedule((Backend backend, Consumer<Backend> isFinishedCallback) -> new Thread(() -> { try { // 1. Create the input // 1.1 Create a protocol buffer ByteArrayOutputStream os = new ByteArrayOutputStream(); serializer.write(annotation, os); os.close(); byte[] message = os.toByteArray(); // 1.2 Create the query params String queryParams = String.format( "properties=%s", URLEncoder.encode(StanfordCoreNLPClient.this.propsAsJSON, "utf-8")); // 2. Create a connection URL serverURL = new URL(backend.protocol, backend.host, backend.port, StanfordCoreNLPClient.this.path + '?' + queryParams); // 3. Do the annotation // This method has two contracts: // 1. It should call the two relevant callbacks // 2. It must not throw an exception doAnnotation(annotation, backend, serverURL, message, 0); } catch (Throwable t) { log.warn("Could not annotate via server! Trying to annotate locally...", t); StanfordCoreNLP corenlp = new StanfordCoreNLP(properties); corenlp.annotate(annotation); } finally { callback.accept(annotation); isFinishedCallback.accept(backend); } }).start()); } /** * Actually try to perform the annotation on the server side. * This is factored out so that we can retry up to 3 times. * * @param annotation The annotation we need to fill. * @param backend The backend we are querying against. * @param serverURL The URL of the server we are hitting. * @param message The message we are sending the server (don't need to recompute each retry). * @param tries The number of times we've tried already. */ @SuppressWarnings("unchecked") private void doAnnotation(Annotation annotation, Backend backend, URL serverURL, byte[] message, int tries) { try { // 1. Set up the connection URLConnection connection = serverURL.openConnection(); // 1.1 Set authentication if (apiKey != null && apiSecret != null) { String userpass = apiKey + ":" + apiSecret; String basicAuth = "Basic " + new String(Base64.getEncoder().encode(userpass.getBytes())); connection.setRequestProperty("Authorization", basicAuth); } // 1.2 Set some protocol-independent properties connection.setDoOutput(true); connection.setRequestProperty("Content-Type", "application/x-protobuf"); connection.setRequestProperty("Content-Length", Integer.toString(message.length)); connection.setRequestProperty("Accept-Charset", "utf-8"); connection.setRequestProperty("User-Agent", StanfordCoreNLPClient.class.getName()); // 1.3 Set some protocol-dependent properties switch (backend.protocol) { case "https": case "http": ((HttpURLConnection) connection).setRequestMethod("POST"); break; default: throw new IllegalStateException("Haven't implemented protocol: " + backend.protocol); } // 2. Annotate // 2.1. Fire off the request connection.connect(); connection.getOutputStream().write(message); connection.getOutputStream().flush(); // 2.2 Await a response // -- It might be possible to send more than one message, but we are not going to do that. Annotation response = serializer.read(connection.getInputStream()).first; // 2.3. Copy response over to original annotation for (Class key : response.keySet()) { annotation.set(key, response.get(key)); } } catch (Throwable t) { // 3. We encountered an error -- retry if (tries < 3) { log.warn(t); doAnnotation(annotation, backend, serverURL, message, tries + 1); } else { throw new RuntimeException(t); } } } /** * Runs the entire pipeline on the content of the given text passed in. * @param text The text to process * @return An Annotation object containing the output of all annotators */ public Annotation process(String text) { Annotation annotation = new Annotation(text); annotate(annotation); return annotation; } /** * Runs an interactive shell where input text is processed with the given pipeline. * * @param pipeline The pipeline to be used * @throws IOException If IO problem with stdin */ private static void shell(StanfordCoreNLPClient pipeline) throws IOException { log.info("Entering interactive shell. Type q RETURN or EOF to quit."); final StanfordCoreNLP.OutputFormat outputFormat = StanfordCoreNLP.OutputFormat.valueOf(pipeline.properties.getProperty("outputFormat", "text").toUpperCase()); IOUtils.console("NLP> ", line -> { if ( ! line.isEmpty()) { Annotation anno = pipeline.process(line); try { switch (outputFormat) { case XML: new XMLOutputter().print(anno, System.out); break; case JSON: new JSONOutputter().print(anno, System.out); System.out.println(); break; case CONLL: new CoNLLOutputter().print(anno, System.out); System.out.println(); break; case TEXT: new TextOutputter().print(anno, System.out); break; case SERIALIZED: warn("You probably cannot read the serialized output, so printing in text instead"); new TextOutputter().print(anno, System.out); break; default: throw new IllegalArgumentException("Cannot output in format " + outputFormat + " from the interactive shell"); } } catch (IOException e) { throw new RuntimeIOException(e); } } }); } /** * The implementation of what to run on a command-line call of CoreNLPWebClient * * @throws IOException If any IO problem */ public void run() throws IOException { StanfordRedwoodConfiguration.minimalSetup(); StanfordCoreNLP.OutputFormat outputFormat = StanfordCoreNLP.OutputFormat.valueOf(properties.getProperty("outputFormat", "text").toUpperCase()); // // Process one file or a directory of files // if (properties.containsKey("file") || properties.containsKey("textFile")) { String fileName = properties.getProperty("file"); if (fileName == null) { fileName = properties.getProperty("textFile"); } Collection<File> files = new FileSequentialCollection(new File(fileName), properties.getProperty("extension"), true); StanfordCoreNLP.processFiles(null, files, 1, properties, this::annotate, StanfordCoreNLP.createOutputter(properties, new AnnotationOutputter.Options()), outputFormat); } // // Process a list of files // else if (properties.containsKey("filelist")){ String fileName = properties.getProperty("filelist"); Collection<File> inputFiles = StanfordCoreNLP.readFileList(fileName); Collection<File> files = new ArrayList<>(inputFiles.size()); for (File file : inputFiles) { if (file.isDirectory()) { files.addAll(new FileSequentialCollection(new File(fileName), properties.getProperty("extension"), true)); } else { files.add(file); } } StanfordCoreNLP.processFiles(null, files, 1, properties, this::annotate, StanfordCoreNLP.createOutputter(properties, new AnnotationOutputter.Options()), outputFormat); } // // Run the interactive shell // else { shell(this); } } /** * <p> * Good practice to call after you are done with this object. * Shuts down the queue of annotations to run and the associated threads. * </p> * * <p> * If this is not called, any job which has been scheduled but not run will be * cancelled. * </p> */ public void shutdown() throws InterruptedException { scheduler.stateLock.lock(); try { while (!scheduler.queue.isEmpty() || scheduler.freeAnnotators.size() != scheduler.backends.size()) { scheduler.shouldShutdown.await(5, TimeUnit.SECONDS); } scheduler.doRun = false; scheduler.enqueued.signalAll(); // In case the thread's waiting on this condition } finally { scheduler.stateLock.unlock(); } } /** * This can be used just for testing or for command-line text processing. * This runs the pipeline you specify on the * text in the file that you specify and sends some results to stdout. * The current code in this main method assumes that each line of the file * is to be processed separately as a single sentence. * <p> * Example usage:<br> * java -mx6g edu.stanford.nlp.pipeline.StanfordCoreNLP -props properties -backends site1:port1,site2,port2 <br> * or just -host name -port number * * @param args List of required properties * @throws java.io.IOException If IO problem * @throws ClassNotFoundException If class loading problem */ public static void main(String[] args) throws IOException, ClassNotFoundException { // // process the arguments // // extract all the properties from the command line // if cmd line is empty, set the properties to null. The processor will search for the properties file in the classpath // if (args.length < 2) { // log.info("Usage: " + StanfordCoreNLPClient.class.getSimpleName() + " -host <hostname> -port <port> ..."); // System.exit(1); // } Properties props = StringUtils.argsToProperties(args); boolean hasH = props.containsKey("h"); boolean hasHelp = props.containsKey("help"); if (hasH || hasHelp) { String helpValue = hasH ? props.getProperty("h") : props.getProperty("help"); StanfordCoreNLP.printHelp(System.err, helpValue); return; } // Create the backends List<Backend> backends = new ArrayList<>(); String defaultBack = "http://localhost:9000"; String backStr = props.getProperty("backends"); if (backStr == null) { String host = props.getProperty("host"); String port = props.getProperty("port"); if (host != null) { if (port != null) { defaultBack = host + ':' + port; } else { defaultBack = host; } } } for (String spec : props.getProperty("backends", defaultBack).split(",")) { Matcher matcher = URL_PATTERN.matcher(spec.trim()); if (matcher.matches()) { String protocol = matcher.group(1); if (protocol == null) { protocol = "http"; } String host = matcher.group(2); int port = 80; String portStr = matcher.group(3); if (portStr != null) { port = Integer.parseInt(portStr); } backends.add(new Backend(protocol, host, port)); } } log.info("Using backends: " + backends); // Run the pipeline StanfordCoreNLPClient client = new StanfordCoreNLPClient(props, backends); client.run(); try { client.shutdown(); // In case anything is pending on the server } catch (InterruptedException ignored) { } } // end main() }