/* $Id: HttpPoster.java 988245 2010-08-23 18:39:35Z kwright $ */ /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.manifoldcf.agents.output.gts; import org.apache.manifoldcf.core.interfaces.*; import org.apache.manifoldcf.core.common.Base64; import org.apache.manifoldcf.agents.interfaces.*; import org.apache.manifoldcf.agents.system.*; import java.io.*; import java.net.*; import java.nio.charset.StandardCharsets; import java.util.*; import javax.net.*; import javax.net.ssl.*; import org.apache.log4j.*; /** * Posts an input stream to the GTS * */ public class HttpPoster { public static final String _rcsid = "@(#)$Id: HttpPoster.java 988245 2010-08-23 18:39:35Z kwright $"; /** Ingestion buffer size property. 
*/
  public static String ingestBufferSizeProperty = "org.apache.manifoldcf.ingest.buffersize";
  /** Property name for the credentials realm.
  * NOTE(review): not read in the part of this class visible to this review - confirm where it is consumed. */
  public static String ingestCredentialsRealm = "org.apache.manifoldcf.ingest.credentialrealm";
  /** Property name for the response retry count; read by the constructor (default 9000). */
  public static String ingestResponseRetryCount = "org.apache.manifoldcf.ingest.responseretrycount";
  /** Property name for the response retry interval; read by the constructor (default 20). */
  public static String ingestResponseRetryInterval = "org.apache.manifoldcf.ingest.retryinterval";
  /** Property name for the reschedule interval; read by the constructor (default 60000). */
  public static String ingestRescheduleInterval = "org.apache.manifoldcf.ingest.rescheduleinterval";
  /** Property name for the ingestion URI.
  * NOTE(review): not read in the visible part of this class - the post URI arrives as a constructor argument. */
  public static String ingestURIProperty = "org.apache.manifoldcf.ingest.uri";
  /** Property name for the ingestion user. NOTE(review): not read in the visible part of this class. */
  public static String ingestUserProperty = "org.apache.manifoldcf.ingest.user";
  /** Property name for the ingestion password. NOTE(review): not read in the visible part of this class. */
  public static String ingestPasswordProperty = "org.apache.manifoldcf.ingest.password";
  /** Property name for the maximum connection count. NOTE(review): not read in the visible part of this class. */
  public static String ingestMaxConnectionsProperty = "org.apache.manifoldcf.ingest.maxconnections";

  // Chunk size, in characters, for base64-encoded headers; the ingest thread splits
  // long Document-ACL and Document-Template values into pieces of this size.
  protected final static int HEADER_CHUNK = 4096;

  /** Base64 of "user:password", or null when no credentials were supplied to the constructor. */
  private String encodedCredentials = null;
  /** Realm sent along with the credentials; only set when credentials are set. */
  private String realm = null;
  /** The URI string this poster was constructed with. */
  private String postURI = null;
  /** URL built from postURI; its file part is the POST target for ingestion. */
  private URL url = null;
  /** URL built from postURI + "?DELETE"; its file part is the POST target for deletions. */
  private URL deleteURL = null;
  /** URL built from postURI + "?STATUS". */
  private URL infoURL = null;
  /** Host taken from url. */
  private String host = null;
  /** Port taken from url; the constructor substitutes 443 (https) or 80 when the URL has none. */
  private int port = 80;
  /** Protocol taken from url. */
  private String protocol = null;

  /** Size in bytes of the buffer used to copy the document stream to the socket. */
  private final int buffersize;

  /** Size coefficient: extra response retries granted per byte of document length.
  * With the default 20 ms retry interval this works out to 20 ms of additional socket
  * timeout per 2000 bytes; the value is an empirical guess. */
  private static double sizeCoefficient = 0.0005;
  /** The number of times we should poll for the response; multiplied by responseRetryWait to form the socket timeout. */
  private final int responseRetries;
  /** How long, in milliseconds, one response poll interval lasts. */
  private final long responseRetryWait;
  /** How long, in milliseconds, to wait before retrying a failed ingestion. */
  private final long interruptionRetryTime;

  /** This is the secure socket factory we will use.  I'm presuming it's thread-safe, but
  * if not, synchronization blocks are in order when it's used.
*/ protected static javax.net.ssl.SSLSocketFactory secureSocketFactory = null; static { try { secureSocketFactory = getSecureSocketFactory(); } catch (ManifoldCFException e) { // If we can't create, print and fail e.printStackTrace(); System.exit(100); } } /** * Initialized the http poster. * @param userID is the unencoded user name, or null. * @param password is the unencoded password, or null. * @param postURI the uri to post the request to */ public HttpPoster(IThreadContext threadContext, String realm, String userID, String password, String postURI) throws ManifoldCFException { if (userID != null && userID.length() > 0 && password != null) { this.encodedCredentials = new org.apache.manifoldcf.core.common.Base64().encodeByteArray((userID+":"+password).getBytes(StandardCharsets.UTF_8)); this.realm = realm; } this.postURI = postURI; // Create a URL to GTS try { url = new URL(postURI); deleteURL = new URL(postURI+"?DELETE"); infoURL = new URL(postURI+"?STATUS"); } catch (MalformedURLException murl) { throw new ManifoldCFException("Bad url",murl); } // set the port port = url.getPort(); host = url.getHost(); protocol = url.getProtocol(); if (port == -1) { if (protocol.equalsIgnoreCase("https")) port = 443; else port = 80; } buffersize = LockManagerFactory.getIntProperty(threadContext,ingestBufferSizeProperty,32768); responseRetries = LockManagerFactory.getIntProperty(threadContext,ingestResponseRetryCount,9000); responseRetryWait = LockManagerFactory.getIntProperty(threadContext,ingestResponseRetryInterval,20); interruptionRetryTime = LockManagerFactory.getIntProperty(threadContext,ingestRescheduleInterval,60000); } /** * Post the input stream to ingest * @param documentURI is the document's uri. * @param document is the document structure to ingest. * @return true if the ingestion was successful, or false if the ingestion is illegal. 
* @throws ManifoldCFException, ServiceInterruption */ public boolean indexPost(String documentURI, List<String> collections, String documentTemplate, String authorityNameString, RepositoryDocument document, IOutputAddActivity activities) throws ManifoldCFException, ServiceInterruption { StringBuilder aclXml = new StringBuilder(); Iterator<String> securityTypeIterator = document.securityTypesIterator(); String[] shareAcls = null; String[] shareDenyAcls = null; String[] documentAcls = null; String[] documentDenyAcls = null; String[] parentAcls = null; String[] parentDenyAcls = null; while (securityTypeIterator.hasNext()) { String securityType = securityTypeIterator.next(); if (securityType.equals(RepositoryDocument.SECURITY_TYPE_SHARE)) { shareAcls = document.getSecurityACL(securityType); shareDenyAcls = document.getSecurityDenyACL(securityType); } else if (securityType.equals(RepositoryDocument.SECURITY_TYPE_DOCUMENT)) { documentAcls = document.getSecurityACL(securityType); documentDenyAcls = document.getSecurityDenyACL(securityType); } else if (securityType.equals(RepositoryDocument.SECURITY_TYPE_PARENT)) { parentAcls = document.getSecurityACL(securityType); parentDenyAcls = document.getSecurityDenyACL(securityType); } else // Can't accept the document, because we don't know how to secure it activities.recordActivity(null,GTSConnector.INGEST_ACTIVITY,null,documentURI,activities.UNKNOWN_SECURITY,"Rejected document that has security info which GTS does not recognize: '"+ securityType + "'"); return false; } writeACLs(aclXml,"share",shareAcls,shareDenyAcls,authorityNameString,activities); writeACLs(aclXml,"directory",parentAcls,parentDenyAcls,authorityNameString,activities); writeACLs(aclXml,"file",documentAcls,documentDenyAcls,authorityNameString,activities); if (aclXml.length() > 0) aclXml.append("</document-acl>"); String aclXmlString = aclXml.toString(); if (Logging.ingest.isDebugEnabled()) Logging.ingest.debug("indexPost(): '" + documentURI + "'"); // This flag 
keeps track of whether we read anything from the input stream yet. // If not, we can retry here. If so, we have to reschedule. boolean readFromDocumentStreamYet = false; int ioErrorRetry = 3; while (true) { try { IngestThread t = new IngestThread(documentURI,aclXmlString,collections,documentTemplate,document); try { t.start(); t.join(); // Log the activity, if any, regardless of any exception if (t.getActivityCode() != null) activities.recordActivity(t.getActivityStart(),GTSConnector.INGEST_ACTIVITY,t.getActivityBytes(),documentURI,t.getException().getClass().getSimpleName().toUpperCase(Locale.ROOT),t.getActivityDetails()); readFromDocumentStreamYet = (readFromDocumentStreamYet || t.getReadFromDocumentStreamYet()); Throwable thr = t.getException(); if (thr != null) { if (thr instanceof ServiceInterruption) throw (ServiceInterruption)thr; if (thr instanceof ManifoldCFException) throw (ManifoldCFException)thr; if (thr instanceof IOException) throw (IOException)thr; if (thr instanceof RuntimeException) throw (RuntimeException)thr; else throw (Error)thr; } return t.getRval(); } catch (InterruptedException e) { t.interrupt(); throw new ManifoldCFException("Interrupted: "+e.getMessage(),ManifoldCFException.INTERRUPTED); } } catch (java.net.SocketTimeoutException ioe) { if (readFromDocumentStreamYet || ioErrorRetry == 0) { // If this continues, we should indeed abort the job. Retries should not go on indefinitely either; 2 hours is plenty long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("IO error connecting to ingestion API: "+ioe.getMessage()+"; ingestion will be retried again later", ioe, currentTime + interruptionRetryTime, currentTime + 2L * 60L * 60000L, -1, true); } } catch (IOException ioe) { if (readFromDocumentStreamYet || ioErrorRetry == 0) { // If this continues, we should indeed abort the job. 
Retries should not go on indefinitely either; 2 hours is plenty long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("IO error ingesting document: "+ioe.getMessage()+"; ingestion will be retried again later", ioe, currentTime + interruptionRetryTime, currentTime + 2L * 60L * 60000L, -1, true); } } // Sleep for a time, and retry try { ManifoldCF.sleep(10000L); } catch (InterruptedException e) { throw new ManifoldCFException("Interrupted: "+e.getMessage(),ManifoldCFException.INTERRUPTED); } ioErrorRetry--; // Go back around again! } } /** Write acls into a StringBuilder */ protected static void writeACLs(StringBuilder aclXml, String type, String[] acl, String[] denyAcl, String authorityNameString, IOutputAddActivity activities) throws ManifoldCFException { if (acl != null && acl.length > 0 || denyAcl != null && denyAcl.length > 0) { if (aclXml.length() == 0) aclXml.append("<document-acl>"); aclXml.append("<acl scope=\"").append(type).append("\">"); if (acl != null) { for (int i=0; i < acl.length; i++) { if (Logging.ingest.isDebugEnabled()) Logging.ingest.debug("Adding "+type+" ACL: " + acl[i]); aclXml.append("<allow>"); aclXml.append(activities.qualifyAccessToken(authorityNameString,acl[i])); aclXml.append("</allow>"); } } if (denyAcl != null) { for (int i=0; i < denyAcl.length; i++) { if (Logging.ingest.isDebugEnabled()) Logging.ingest.debug("Adding "+type+" deny ACL: " + denyAcl[i]); aclXml.append("<deny>"); aclXml.append(activities.qualifyAccessToken(authorityNameString,denyAcl[i])); aclXml.append("</deny>"); } } aclXml.append("</acl>"); } } /** Post a check request. 
*/ public void checkPost() throws ManifoldCFException, ServiceInterruption { if (Logging.ingest.isDebugEnabled()) Logging.ingest.debug("checkPost()"); int ioErrorRetry = 5; while (true) { // Open a socket to ingest, and to the response stream to get the post result try { StatusThread t = new StatusThread(); try { t.start(); t.join(); Throwable thr = t.getException(); if (thr != null) { if (thr instanceof ServiceInterruption) throw (ServiceInterruption)thr; if (thr instanceof ManifoldCFException) throw (ManifoldCFException)thr; if (thr instanceof IOException) throw (IOException)thr; if (thr instanceof RuntimeException) throw (RuntimeException)thr; else throw (Error)thr; } return; } catch (InterruptedException e) { t.interrupt(); throw new ManifoldCFException("Interrupted: "+e.getMessage(),ManifoldCFException.INTERRUPTED); } } catch (IOException ioe) { if (ioErrorRetry == 0) { long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("IO exception checking: "+ioe.getMessage(), ioe, currentTime + interruptionRetryTime, currentTime + 2L * 60L * 60000L, -1, true); } } // Go back around again! // Sleep for a time, and retry try { ManifoldCF.sleep(10000L); } catch (InterruptedException e) { throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED); } ioErrorRetry--; } } /** Post a delete request. *@param documentURI is the document's URI. 
*/ public void deletePost(String documentURI, IOutputRemoveActivity activities) throws ManifoldCFException, ServiceInterruption { if (Logging.ingest.isDebugEnabled()) Logging.ingest.debug("deletePost(): '" + documentURI + "'"); int ioErrorRetry = 5; while (true) { try { DeleteThread t = new DeleteThread(documentURI); try { t.start(); t.join(); // Log the activity, if any, regardless of any exception if (t.getActivityCode() != null) activities.recordActivity(t.getActivityStart(),GTSConnector.REMOVE_ACTIVITY,null,documentURI,t.getException().getClass().getSimpleName().toUpperCase(Locale.ROOT),t.getActivityDetails()); Throwable thr = t.getException(); if (thr != null) { if (thr instanceof ServiceInterruption) throw (ServiceInterruption)thr; if (thr instanceof ManifoldCFException) throw (ManifoldCFException)thr; if (thr instanceof IOException) throw (IOException)thr; if (thr instanceof RuntimeException) throw (RuntimeException)thr; else throw (Error)thr; } return; } catch (InterruptedException e) { t.interrupt(); throw new ManifoldCFException("Interrupted: "+e.getMessage(),ManifoldCFException.INTERRUPTED); } } catch (IOException ioe) { if (ioErrorRetry == 0) { long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("IO exception deleting: "+ioe.getMessage()+"; deletion will be retried again later", ioe, currentTime + interruptionRetryTime, currentTime + 2L * 60L * 60000L, -1, true); } // Fall through and recycle } // Go back around again! 
// Sleep for a time, and retry try { ManifoldCF.sleep(10000L); } catch (InterruptedException e) { throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED); } ioErrorRetry--; } } /** * Get the response code of the post * @param stream the stream the response is going to come from * @return the response string * @throws ManifoldCFException */ protected String getResponse(BufferedReader stream) throws ManifoldCFException, ServiceInterruption { Logging.ingest.debug("Waiting for response stream"); StringBuilder res = new StringBuilder(); try { // Stream.ready() always returns false for secure sockets :-(. So // we have to rely on socket timeouts to interrupt us if the server goes down. while (true) { int i = stream.read(); if (i == -1) break; res.append((char) i); } Logging.ingest.debug("Read of response stream complete"); } catch (java.net.SocketTimeoutException e) { // If this continues, we should indeed abort the job. Retries should not go on indefinitely either; 2 hours is plenty long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Ingestion API socket timeout exception waiting for response code: "+e.getMessage()+"; ingestion will be retried again later", e, currentTime + interruptionRetryTime, currentTime + 2L * 60L * 60000L, -1, true); } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED); } catch (java.net.ConnectException e) { // If this continues, we should indeed abort the job. Retries should not go on indefinitely either; 2 hours is plenty long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Timed out connecting to ingestion API: "+e.getMessage()+"; ingestion will be retried again later", e, currentTime + interruptionRetryTime, currentTime + 2L * 60L * 60000L, -1, true); } catch (java.net.SocketException e) { // Return 400 error; likely a connection reset which lost us the response data, so // just treat it as something OK. 
return "HTTP/1.0 400 Connection Reset"; } catch (IOException ioe) { Logging.ingest.warn("IO exception trying to get response from ingestion API: "+ioe.getMessage(),ioe); // If this continues, we should indeed abort the job. Retries should not go on indefinitely either; 2 hours is plenty long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("IO exception waiting for response code: "+ioe.getMessage()+"; ingestion will be retried again later", ioe, currentTime + interruptionRetryTime, currentTime + 2L * 60L * 60000L, -1, true); } return res.toString(); } /** Write credentials to output */ protected void writeCredentials(OutputStream out) throws IOException { // Apply credentials if present if (encodedCredentials != null) { Logging.ingest.debug("Applying credentials"); byte[] tmp = ("Authorization: Basic " + encodedCredentials + "\r\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); tmp = ("WWW-Authenticate: Basic realm=\"" + ((realm != null) ? realm : "") + "\"\r\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); } } /** Encode for metadata. *@param inputString is the input string. *@return output, encoded. */ protected static String metadataEncode(String inputString) { StringBuilder rval = new StringBuilder(); int i = 0; while (i < inputString.length()) { char x = inputString.charAt(i++); // Certain characters must simply be skipped, because they are illegal in header fields. if (x >= ' ' && x <= (char)127) { if (x == '\\' || x == ',') rval.append('\\'); rval.append(x); } } return rval.toString(); } /** Build a secure socket factory based on no keystore and a lax trust manager. * This allows use of SSL for privacy but not identification. 
*/ protected static javax.net.ssl.SSLSocketFactory getSecureSocketFactory() throws ManifoldCFException { try { java.security.SecureRandom secureRandom = java.security.SecureRandom.getInstance("SHA1PRNG"); // Create an SSL context javax.net.ssl.SSLContext sslContext = javax.net.ssl.SSLContext.getInstance("SSL"); sslContext.init(null,new LaxTrustManager[]{new LaxTrustManager()},secureRandom); return sslContext.getSocketFactory(); } catch (java.security.NoSuchAlgorithmException e) { throw new ManifoldCFException("No such algorithm",e); } catch (java.security.KeyManagementException e) { throw new ManifoldCFException("Key management exception",e); } } /** Create a socket in a manner consistent with all of our specified parameters. */ protected Socket createSocket(long responseRetryCount) throws IOException, ManifoldCFException { Socket socket; if (protocol.equals("https")) { try { SocketFactory factory = SSLSocketFactory.getDefault(); socket = factory.createSocket(host,port); } catch (InterruptedIOException e) { throw e; } catch (IOException e) { throw new ManifoldCFException("Couldn't set up SSL connection to ingestion API: "+e.getMessage(),e); } } else socket = new Socket(host, port); // Calculate the timeout we want long timeoutMilliseconds = responseRetryWait * responseRetryCount; socket.setSoTimeout((int)timeoutMilliseconds); return socket; } /** Our own trust manager, which ignores certificate issues */ protected static class LaxTrustManager implements X509TrustManager { /** Does nothing */ public LaxTrustManager() { } /** Return a list of accepted issuers. There are none. 
*/ public java.security.cert.X509Certificate[] getAcceptedIssuers() { return new java.security.cert.X509Certificate[0]; } /** We have no problem with any clients */ public void checkClientTrusted(java.security.cert.X509Certificate[] certs, String authType) throws java.security.cert.CertificateException { } /** We have no problem with any servers */ public void checkServerTrusted(java.security.cert.X509Certificate[] certs, String authType) throws java.security.cert.CertificateException { } } /** Killable thread that does ingestions. * Java 1.5 stopped permitting thread interruptions to abort socket waits. As a result, it is impossible to get threads to shutdown cleanly that are doing * such waits. So, the places where this happens are segregated in their own threads so that they can be just abandoned. * * This thread does a single document ingestion. */ protected class IngestThread extends java.lang.Thread { protected String documentURI; protected String aclXmlString; protected List<String> collections; protected String documentTemplate; protected RepositoryDocument document; protected Long activityStart = null; protected Long activityBytes = null; protected String activityCode = null; protected String activityDetails = null; protected Throwable exception = null; protected boolean readFromDocumentStreamYet = false; protected boolean rval = false; public IngestThread(String documentURI, String aclXmlString, List<String> collections, String documentTemplate, RepositoryDocument document) { super(); setDaemon(true); this.documentURI = documentURI; this.aclXmlString = aclXmlString; this.collections = collections; this.documentTemplate = documentTemplate; this.document = document; } public void run() { long length = document.getBinaryLength(); InputStream is = document.getBinaryStream(); try { // Do the operation! 
long fullStartTime = System.currentTimeMillis(); // Open a socket to ingest, and to the response stream to get the post result try { // Set up the socket, and the (optional) secure socket. long responseRetryCount = responseRetries + (long)((float)length * sizeCoefficient); Socket socket = createSocket(responseRetryCount); try { InputStreamReader isr = new InputStreamReader(socket.getInputStream(),"ASCII"); try { BufferedReader in = new BufferedReader(isr); try { OutputStream out = socket.getOutputStream(); try { // Create the output stream to GTS String uri = url.getFile(); if (uri.length() == 0) uri = "/"; byte[] tmp = ("POST " + uri + " HTTP/1.0\r\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); // Set all the headers tmp = ("Document-URI: " + documentURI + "\r\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); writeCredentials(out); // Apply ACL if present if (aclXmlString.length() > 0) { String encodedACL = new Base64().encodeByteArray(aclXmlString.getBytes(StandardCharsets.UTF_8)); // Break into chunks - 4K each - 'cause otherwise we blow up the ingester. 
int index = 0; while (true) { if (index + HEADER_CHUNK >= encodedACL.length()) { tmp = ("Document-ACL: " + encodedACL.substring(index) + "\r\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); break; } tmp = ("Document-ACL: " + encodedACL.substring(index,index + HEADER_CHUNK) + "\r\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); index += HEADER_CHUNK; } } // Do the collections if (collections != null) { for (String collectionName : collections) { String encodedValue = metadataEncode(collectionName); //System.out.println("collection metadata: collection_name = '"+encodedValue+"'"); tmp = ("Document-Metadata: collection_name="+encodedValue+"\r\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); } } // Do the document template if (documentTemplate != null && documentTemplate.length() > 0) { String encodedTemplate = new Base64().encodeByteArray(documentTemplate.getBytes(StandardCharsets.UTF_8)); // Break into chunks - 4K each - 'cause otherwise we blow up the ingester. int index = 0; while (true) { if (index + HEADER_CHUNK >= encodedTemplate.length()) { tmp = ("Document-Template: " + encodedTemplate.substring(index) + "\r\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); break; } tmp = ("Document-Template: " + encodedTemplate.substring(index,index + HEADER_CHUNK) + "\r\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); index += HEADER_CHUNK; } } // Write all the metadata, if any Iterator<String> iter = document.getFields(); while (iter.hasNext()) { String fieldName = iter.next(); String[] values = document.getFieldAsStrings(fieldName); // We only handle strings right now!!! 
int k = 0; while (k < values.length) { String value = (String)values[k++]; String encodedValue = metadataEncode(value); //System.out.println("Metadata: Name = '"+fieldName+"', value = '"+encodedValue+"'"); tmp = ("Document-Metadata: "+ fieldName+"="+encodedValue+"\r\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); } } tmp = ("Content-length: " + new Long(length).toString() + "\r\n\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); long total = 0; long now, later; now = System.currentTimeMillis(); byte[] bytes = new byte[buffersize]; // Write out the contents of the inputstream to the socket while (true) { int count; // Specially catch all errors that come from reading the input stream itself. // This will help us segregate errors that come from the stream vs. those that come from the ingestion system. try { count = is.read(bytes); } catch (java.net.SocketTimeoutException ioe) { // We have to catch socket timeout exceptions specially, because they are derived from InterruptedIOException // They are otherwise just like IOExceptions // Log the error Logging.ingest.warn("Error reading data for transmission to Ingestion API: "+ioe.getMessage(),ioe); activityStart = new Long(fullStartTime); activityCode = "-1"; activityDetails = "Couldn't read document: "+ioe.getMessage(); // If this continues, we should indeed abort the job. Retries should not go on indefinitely either; 2 hours is plenty long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("IO error reading document for ingestion: "+ioe.getMessage()+"; read will be retried again later", ioe, currentTime + interruptionRetryTime, currentTime + 2L * 60L * 60000L, -1, true); } catch (InterruptedIOException ioe) { // If the transfer was interrupted, it may be because we are shutting down the thread. // Third-party library exceptions derived from InterruptedIOException are possible; if the stream comes from httpclient especially. 
// If we see one of these, we treat it as "not an interruption". if (!ioe.getClass().getName().equals("java.io.InterruptedIOException")) { // Log the error Logging.ingest.warn("Error reading data for transmission to Ingestion API: "+ioe.getMessage(),ioe); activityStart = new Long(fullStartTime); activityCode = "-1"; activityDetails = "Couldn't read document: "+ioe.getMessage(); // If this continues, we should indeed abort the job. Retries should not go on indefinitely either; 2 hours is plenty long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("IO error reading document for ingestion: "+ioe.getMessage()+"; read will be retried again later", ioe, currentTime + interruptionRetryTime, currentTime + 2L * 60L * 60000L, -1, true); } else throw ioe; } catch (IOException ioe) { // We need to decide whether to throw a service interruption or metacarta exception, based on what went wrong. // We never retry here; the cause is the repository, so there's not any point. // Log the error Logging.ingest.warn("Error reading data for transmission to Ingestion API: "+ioe.getMessage(),ioe); activityStart = new Long(fullStartTime); activityCode = "-1"; activityDetails = "Couldn't read document: "+ioe.getMessage(); // If this continues, we should indeed abort the job. 
Retries should not go on indefinitely either; 2 hours is plenty long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("IO error reading document for ingestion: "+ioe.getMessage()+"; read will be retried again later", ioe, currentTime + interruptionRetryTime, currentTime + 2L * 60L * 60000L, -1, true); } if (count == -1) break; readFromDocumentStreamYet = true; out.write(bytes,0,count); total += (long)count; } later = System.currentTimeMillis(); if (Logging.ingest.isDebugEnabled()) Logging.ingest.debug("Total bytes posted: " + new Long(total).toString() + ", total time: " + (later - now)); out.flush(); // Now, process response String res; try { res = getResponse(in); } catch (ServiceInterruption si) { activityStart = new Long(now); activityCode = "-2"; activityDetails = si.getMessage(); throw si; } if (Logging.ingest.isDebugEnabled()) Logging.ingest.debug("Response code from ingest: '" + res + "'"); CodeDetails cd = new CodeDetails(res); activityStart = new Long(now); activityBytes = new Long(length); activityCode = cd.getCode(); activityDetails = cd.getDetails(); int codeValue = cd.getCodeValue(); // A negative number means http error of some kind. if (codeValue < 0) throw new ManifoldCFException("Http protocol error"); // 200 means everything went OK if (codeValue == 200) { rval = true; return; } // Anything else means the document didn't ingest. // There are three possibilities here: // 1) The document will NEVER ingest (it's illegal), in which case a 400 or 403 will be returned, and // 2) There is a transient error, in which case we will want to try again, after a wait. // If the situation is (2), then we CAN'T retry if we already read any of the stream; therefore // we are forced to throw a "service interrupted" exception, and let the caller reschedule // the ingestion. // 3) Something is wrong with the setup, e.g. bad credentials. In this case we chuck a ManifoldCFException, // since this will abort the current activity entirely. 
if (codeValue == 401) throw new ManifoldCFException("Bad credentials for ingestion",ManifoldCFException.SETUP_ERROR); if (codeValue >= 400 && codeValue < 500) { rval = false; return; } // If this continues, we should indeed abort the job. Retries should not go on indefinitely either; 2 hours is plenty long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Error "+Integer.toString(codeValue)+" from ingestion request; ingestion will be retried again later", new ManifoldCFException("Ingestion HTTP error code "+Integer.toString(codeValue)), currentTime + interruptionRetryTime, currentTime + 2L * 60L * 60000L, -1, true); } finally { out.close(); } } finally { in.close(); } } finally { isr.close(); } } finally { try { socket.close(); } catch (InterruptedIOException e) { throw e; } catch (IOException e) { Logging.ingest.debug("Error closing socket: "+e.getMessage(),e); // Do NOT rethrow } } } catch (java.net.SocketTimeoutException ioe) { // These are just like IO errors, but since they are derived from InterruptedIOException, they have to be caught first. // Log the error Logging.ingest.warn("Error connecting to ingestion API: "+ioe.getMessage(),ioe); activityStart = new Long(fullStartTime); activityCode = "-1"; activityDetails = ioe.getMessage(); throw ioe; } catch (InterruptedIOException e) { return; } catch (IOException ioe) { activityStart = new Long(fullStartTime); // Intercept "broken pipe" exception, since that seems to be what we get if the ingestion API kills the socket right after a 400 goes out. // Basically, we have no choice but to interpret that in the same manner as a 400, since no matter how we do it, it's a race and the 'broken pipe' // result is always possible. So we might as well expect it and treat it properly. 
// if (ioe.getClass().getName().equals("java.net.SocketException") && ioe.getMessage().toLowerCase(Locale.ROOT).indexOf("broken pipe") != -1) { // We've seen what looks like the ingestion interface forcibly closing the socket. // We *choose* to interpret this just like a 400 response. However, we log in the history using a different code, // since we really don't know what happened for sure. // Record the attempt activityCode = "-103"; activityDetails = "Presuming an ingestion rejection: "+ioe.getMessage(); rval = false; return; } // Record the attempt activityCode = "-1"; activityDetails = ioe.getMessage(); // Log the error Logging.ingest.warn("Error communicating with Ingestion API: "+ioe.getMessage(),ioe); throw ioe; } } catch (Throwable e) { this.exception = e; } } public Throwable getException() { return exception; } public Long getActivityStart() { return activityStart; } public Long getActivityBytes() { return activityBytes; } public String getActivityCode() { return activityCode; } public String getActivityDetails() { return activityDetails; } public boolean getReadFromDocumentStreamYet() { return readFromDocumentStreamYet; } public boolean getRval() { return rval; } } /** Killable thread that does deletions. * Java 1.5 stopped permitting thread interruptions to abort socket waits. As a result, it is impossible to get threads to shutdown cleanly that are doing * such waits. So, the places where this happens are segregated in their own threads so that they can be just abandoned. * * This thread does a single document deletion. */ protected class DeleteThread extends java.lang.Thread { protected String documentURI; protected Long activityStart = null; protected String activityCode = null; protected String activityDetails = null; protected Throwable exception = null; public DeleteThread(String documentURI) { super(); setDaemon(true); this.documentURI = documentURI; } public void run() { try { // Do the operation! 
long fullStartTime = System.currentTimeMillis(); // Open a socket to ingest, and to the response stream to get the post result try { // Set up the socket, and the (optional) secure socket. Socket socket = createSocket(responseRetries); try { InputStreamReader isr = new InputStreamReader(socket.getInputStream(),"ASCII"); try { BufferedReader in = new BufferedReader(isr); try { OutputStream out = socket.getOutputStream(); try { long startTime = System.currentTimeMillis(); // Create the output stream to GTS byte[] tmp = ("POST " + deleteURL.getFile() + " HTTP/1.0\r\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); // Set all the headers tmp = ("Document-URI: " + documentURI + "\r\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); writeCredentials(out); tmp = ("Content-length: 0\r\n\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); if (Logging.ingest.isDebugEnabled()) Logging.ingest.debug("Delete posted"); out.flush(); String res; try { res = getResponse(in); } catch (ServiceInterruption si) { activityStart = new Long(startTime); activityCode = "-2"; activityDetails = si.getMessage(); throw si; } if (Logging.ingest.isDebugEnabled()) Logging.ingest.debug("Response code from delete: '" + res + "'"); CodeDetails cd = new CodeDetails(res); activityStart = new Long(startTime); activityCode = cd.getCode(); activityDetails = cd.getDetails(); int codeValue = cd.getCodeValue(); if (codeValue < 0) throw new ManifoldCFException("Http protocol error"); // 200 means everything went OK if (codeValue == 200) return; // We ignore everything in the range from 400-500 now if (codeValue == 401) throw new ManifoldCFException("Bad credentials for ingestion",ManifoldCFException.SETUP_ERROR); if (codeValue >= 400 && codeValue < 500) return; // Anything else means the document didn't delete. Throw the error. 
throw new ManifoldCFException("Error deleting document: '"+res+"'"); } finally { out.close(); } } finally { in.close(); } } finally { isr.close(); } } finally { try { socket.close(); } catch (InterruptedIOException e) { throw e; } catch (IOException e) { Logging.ingest.debug("Error closing socket: "+e.getMessage(),e); // Do NOT rethrow } } } catch (InterruptedIOException ioe) { return; } catch (IOException ioe) { // Log the error Logging.ingest.warn("Error communicating with Ingestion API: "+ioe.getMessage(),ioe); activityStart = new Long(fullStartTime); activityCode = "-1"; activityDetails = ioe.getMessage(); throw ioe; } } catch (Throwable e) { this.exception = e; } } public Throwable getException() { return exception; } public Long getActivityStart() { return activityStart; } public String getActivityCode() { return activityCode; } public String getActivityDetails() { return activityDetails; } } /** Killable thread that does a status check. * Java 1.5 stopped permitting thread interruptions to abort socket waits. As a result, it is impossible to get threads to shutdown cleanly that are doing * such waits. So, the places where this happens are segregated in their own threads so that they can be just abandoned. * * This thread does a status check. */ protected class StatusThread extends java.lang.Thread { protected Throwable exception = null; public StatusThread() { super(); setDaemon(true); } public void run() { try { // Do the operation! // Open a socket to ingest, and to the response stream to get the post result try { // Set up the socket, and the (optional) secure socket. 
Socket socket = createSocket(responseRetries); try { InputStreamReader isr = new InputStreamReader(socket.getInputStream(),"ASCII"); try { BufferedReader in = new BufferedReader(isr); try { OutputStream out = socket.getOutputStream(); try { // Create the output stream to GTS byte[] tmp = ("GET " + infoURL.getFile() + " HTTP/1.0\r\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); writeCredentials(out); tmp = ("Content-length: 0\r\n\n").getBytes(StandardCharsets.UTF_8); out.write(tmp, 0, tmp.length); if (Logging.ingest.isDebugEnabled()) Logging.ingest.debug("Status request posted"); out.flush(); String res = getResponse(in); if (Logging.ingest.isDebugEnabled()) Logging.ingest.debug("Response code from delete: '" + res + "'"); CodeDetails cd = new CodeDetails(res); int codeValue = cd.getCodeValue(); if (codeValue < 0) throw new ManifoldCFException("Http protocol error"); // 200 means everything went OK if (codeValue == 200) return; // We ignore everything in the range from 400-500 now if (codeValue == 401) throw new ManifoldCFException("Bad credentials for ingestion",ManifoldCFException.SETUP_ERROR); // Anything else means the info request failed. throw new ManifoldCFException("Error connecting to MetaCarta ingestion API: '"+res+"'"); } finally { out.close(); } } finally { in.close(); } } finally { isr.close(); } } finally { try { socket.close(); } catch (InterruptedIOException e) { throw e; } catch (IOException e) { Logging.ingest.debug("Error closing socket: "+e.getMessage(),e); // Do NOT rethrow } } } catch (InterruptedIOException ioe) { // Exit the thread. 
return; } catch (IOException ioe) { // Log the error Logging.ingest.warn("Error communicating with Ingestion API: "+ioe.getMessage(),ioe); throw ioe; } } catch (Throwable e) { this.exception = e; } } public Throwable getException() { return exception; } } /** Code+details paper object */ protected static class CodeDetails { protected String code; protected int codeValue; protected String details; public CodeDetails(String res) { codeValue = -100; code = "-100"; details = "Http response was improperly formed"; int firstSpace = res.indexOf(" "); if (firstSpace != -1) { int secondSpace = res.indexOf(" ", firstSpace + 1); if (secondSpace != -1) { code = res.substring(firstSpace + 1, secondSpace); details = res.substring(secondSpace+1).trim(); try { codeValue = (int)(new Double(code).doubleValue()); if (codeValue == 200) details = null; } catch (NumberFormatException e) { // Fall through and leave codeValue unaltered } } } } public String getCode() { return code; } public int getCodeValue() { return codeValue; } public String getDetails() { return details; } } }