/*
 * (C) Copyright IBM Corp. 2011
 *
 * LICENSE: Eclipse Public License v1.0
 * http://www.eclipse.org/legal/epl-v10.html
 */

package com.ibm.db2j;

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.security.NoSuchAlgorithmException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.SQLSyntaxErrorException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Hashtable;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;

import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.derby.iapi.error.StandardException;
import org.apache.derby.iapi.store.access.Qualifier;
import org.apache.derby.iapi.types.DataValueDescriptor;
import org.apache.derby.vti.IFastPath;
import org.apache.derby.vti.VTIEnvironment;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import sun.misc.BASE64Encoder;

import com.ibm.gaiandb.GaianChildRSWrapper;
import com.ibm.gaiandb.GaianDBConfig;
import com.ibm.gaiandb.Logger;
import com.ibm.gaiandb.RowsFilter;
import com.ibm.gaiandb.SecurityManager;
import com.ibm.gaiandb.Util;
import com.ibm.gaiandb.diags.GDBMessages;
import com.ibm.gaiandb.policyframework.SQLResultFilter;
import com.ibm.gaiandb.policyframework.SQLResultFilterX;

/**
 * TODO
 *
 * ICA REST VTI - uses a URL defined in a .icarest file, combined with
 * some properties in the gaiandb_config.properties file, to query an ICA REST
 * endpoint and return the results in a tabular format.
 *
 * Sample config lines in gaiandb_config.properties:
 * =================================================
 *
 * com.ibm.db2j.ICAREST.search.schema=DTITLE VARCHAR(100), DURL VARCHAR(1000), DNUM INT, RELEVANCE DOUBLE, DCONTEXT CLOB(32K)
 * com.ibm.db2j.ICAREST.search.url=http://localhost:8394/api/v10/search?output=application/xml&scope=All&results=1250
 *
 * com.ibm.db2j.ICAREST.doctext.schema=DOCUMENT CLOB(10M)
 * com.ibm.db2j.ICAREST.doctext.url=http://localhost:8394/api/v10/search/preview?query=search&collection=
 *
 * com.ibm.db2j.ICAREST.docbytes.schema=DOCUMENT BLOB(10M)
 * com.ibm.db2j.ICAREST.docbytes.url=http://localhost:8393/search/ESFetchServlet?cid=
 *
 * Sample variable URI config:
 * ===========================
 *
 * In file search.icarest:    &query=$1
 * In file doctext.icarest:   $1
 * In file docstream.icarest: $1
 *
 * Sample queries:
 * ===============
 *
 * Search for documents containing the word 'Arms':
 * select * from new com.ibm.db2j.ICAREST('search,Arms') IR
 *
 * Extract the doctext for the DOC:
 * select * from new com.ibm.db2j.ICAREST('doctext,HUMINT&uri=file:///C:/%2524user/VMData/EDAData/HUMINT/HUMINT%2BDoha%2BTalks%2Bon%2BDarfur.doc') IR
 *
 * @author Stephen Nicholas / Ed Jellard / David Vyvyan
 */
public class ICAREST extends AbstractDurableCacheVTI {

	// Use PROPRIETARY notice if class contains a main() method, otherwise use COPYRIGHT notice.
	public static final String COPYRIGHT_NOTICE = "(c) Copyright IBM Corp. 2011";
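	/*
	 * A further illustrative query (assumed syntax, following the 'search' sample in the class
	 * comment above - not a tested invocation): counting the documents that match a search term.
	 *
	 * select * from new com.ibm.db2j.ICAREST('count,Arms') IR
	 */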
	private static final Logger logger = new Logger( "ICAREST", 20 );

	private static final String FETCH_BUFFER_SIZE = "fetchbuffersize";
	private static final int FETCH_BUFFER_SIZE_DEFAULT = 20;
	private static final int FETCH_BATCH_SIZE_DEFAULT = 10;

	//Used to notify the bufferPopulator thread when runQuery has finished fetching results.
	private static final String END_OF_RESULTS_IDENTIFIER = "END_OF_RESULTS_IDENTIFIER";

	private static final String FUNCTION_COUNT = "count";
	private static final String FUNCTION_SEARCH = "search";
	private static final String FUNCTION_DOCTEXT = "doctext";
	private static final String FUNCTION_DOCBYTES = "docbytes";

	private static final String RESULTS_QUERY_PARAM = "&results=";
	private static final String START_QUERY_PARAM = "&start=";

	private static final Hashtable<String, String> schemas = new Hashtable<String, String>() {
		private static final long serialVersionUID = 1L;
		{
			put( FUNCTION_COUNT, "DCOUNT BIGINT, CACHEID INT" );
			put( FUNCTION_SEARCH, "DTITLE VARCHAR(256), DURL VARCHAR(1000), DNUM INT, RELEVANCE DOUBLE, DCONTEXT CLOB(32K), CACHEID INT" );
			put( FUNCTION_DOCTEXT, "DSIZE INT, DTEXT CLOB(10M), CACHEID INT" );
			put( FUNCTION_DOCBYTES, "DNAME VARCHAR(256), DTYPE VARCHAR(50), DSIZE INT, DBYTES BLOB(10M), CACHEID INT" );
		}};

	// Primary keys are mainly used here to avoid writing duplicates
	private static final Hashtable<String, String> primaryKeys = new Hashtable<String, String>() {
		private static final long serialVersionUID = 1L;
		{
			put( FUNCTION_COUNT, "CACHEID" );
			put( FUNCTION_SEARCH, "CACHEID, DNUM" );
			put( FUNCTION_DOCTEXT, "CACHEID" );
			put( FUNCTION_DOCBYTES, "CACHEID" );
		}};
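	// Note: runQuery() populates CACHEID with the hash code of the fully-resolved request URL
	// (and, for search rows, DNUM with the hash code of the document URI), so the primary keys
	// above de-duplicate rows cached for repeated executions of the same request.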
	private static final String cacheExpirySeconds = "60";

	private static final String PROP_URL = "url";

	private static final String ICA_FETCHER_NAME = "ICARESTFetcher";
	private static final String BUFFER_POPULATOR_NAME = "BufferPopulator";

//	private static Map<String, AtomicLong> completedQueryCaches = new ConcurrentHashMap<String,AtomicLong>();
//	private static Map<String, Lock> cacheLockMap = new ConcurrentHashMap<String, Lock>();

	private int currentRow = 0;
	private NodeList results = null;
	private int totalResults = 0, startIndex = 0, itemsPerPage = 0;
	private String urlString = null;
	private String docName = null, docType = null, docText = null;
	private byte[] docBytes = null;

	private final String vtiArgs;

	private int fetchBatchSize = FETCH_BATCH_SIZE_DEFAULT;
	private int fetchBufferSize = FETCH_BUFFER_SIZE_DEFAULT;
	private final BlockingDeque<DataValueDescriptor[][]> fetchBuffer;
	private final DataValueDescriptor[] resultRowTemplate;

	private DataValueDescriptor[][] currentResultBatch;
	private int currentResultBatchIndex = 0;

	private boolean policyFilterDefined = false;
	private SQLResultFilter sqlResultFilter;
	private SQLResultFilterX sqlResultFilterX;

	private final BlockingDeque<String> bufferPopulatorWorkQueue;

	private int maxSourceRows = -1;
	private int newStartIndex;
	private boolean queryRunning = false;

	private Qualifier[][] qualifiers;

	private Map<String, Integer> cacheErrors = new Hashtable<String, Integer>();
	private Map<String, Object []> fieldModificationCountMap = new Hashtable<String, Object []>();

	public Hashtable<String, String> getDefaultVTIProperties() {

		if ( null == defaultVTIProperties ) {

			Hashtable<String, String> props = super.getDefaultVTIProperties();
			String prefix = getPrefix();

			if(schemas.containsKey(prefix)) {
				props.put(PROP_SCHEMA, schemas.get(prefix));
			}

			// Do not define default properties for the URLs because we want the ICAREST to be disabled
			// when URLs are not defined in the config file.
//			props.put(PROP_URL, urls.get(prefix));

			props.put(PROP_CACHE_EXPIRES, cacheExpirySeconds);

			if(primaryKeys.containsKey(prefix)) {
				props.put(PROP_CACHE_PKEY, primaryKeys.get(prefix));
			}

			defaultVTIProperties = props;
		}

		return defaultVTIProperties;
	}

	private int instanceId;

	private boolean isSearch = false;
	private boolean isCount = false;
	private boolean isDocBytes = false;
	private boolean isDocText = false;

	private boolean cachingExplicitlyDisabled = false;

	public ICAREST(String vtiArgs) throws Exception {
		super(vtiArgs);
		this.vtiArgs = vtiArgs;
		this.instanceId = this.hashCode();

		/*
		 * What kind of thing are we doing?
		 */
		if(FUNCTION_SEARCH.equals(getPrefix())) {
			isSearch = true;
		} else if(FUNCTION_COUNT.equals(getPrefix())) {
			isCount = true;
		} else if(FUNCTION_DOCBYTES.equals(getPrefix())) {
			isDocBytes = true;
		} else if(FUNCTION_DOCTEXT.equals(getPrefix())) {
			isDocText = true;
		} else {
			logger.logWarning(GDBMessages.DSWRAPPER_ICAREST_INVALID_FUNCTION_ERROR, "The function type: '" + getPrefix() + "' is not recognised by ICAREST.");
			throw new Exception("The function type: '" + getPrefix() + "' is not recognised by ICAREST.");
		}

		logger.logImportant("Entered ICAREST(vtiArgs), function: '" + getPrefix() + "', args: '" + replacements + "', instance id: '" + instanceId + "'.");

		/*
		 * Check whether the URL for the function type is specified.
		 */
		String url = "";
		try {
			url = getVTIPropertyWithReplacements(PROP_URL);
		} catch(Exception e) {
			logger.logException(GDBMessages.DSWRAPPER_ICAREST_URL_PARAMETER_NOT_SPECIFIED,
					"An error occurred while attempting to read in the URL to query for this function type. "
					+ "This should be specified by the ICAREST." + getPrefix() + "." + PROP_URL + " parameter in the config file. "
					+ "There is no default value that can be used.", e);
		}

		/*
		 * Is caching explicitly disabled?
		 */
		cachingExplicitlyDisabled = 0 >= getExpiryDuration();

		/*
		 * Get the batch size to fetch from the db and filter rows in.
		 * Use the value of &results= from the query param, if set - else will use default.
		 */
		try {
			String resultsRegex = "\\Q" + RESULTS_QUERY_PARAM + "\\E(\\d+)";
			Matcher resultsMatcher = Pattern.compile(resultsRegex).matcher(url);

			if(resultsMatcher.find()) {
				int resultsSpecifiedInUrl = Integer.parseInt(resultsMatcher.group(1));
				if(resultsSpecifiedInUrl > 0) {
					fetchBatchSize = resultsSpecifiedInUrl;
				}
			}
		} catch(Exception e) {
			//Just log - default will be used
			logger.logException(GDBMessages.DSWRAPPER_ICAREST_INVALID_FETCH_BATCH_SIZE_PARAMETER,
					"An error occurred while attempting to read in the value for the fetch batch size. "
					+ "This should be specified by the &results query parameter on the url. "
					+ "The default value (" + FETCH_BATCH_SIZE_DEFAULT + ") will be used.", e);
		}
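		/*
		 * Illustrative example (assumed values): with the sample search URL from the class comment,
		 * which ends in '&results=1250', fetchBatchSize becomes 1250; if no '&results=' parameter is
		 * present, the default FETCH_BATCH_SIZE_DEFAULT (10) applies.
		 */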
		/*
		 * Build and set a custom value for 'extension'.
		 * The extension is 'ICAREST_' + MD5(vtiArgs + result schema + fetchBatchSize). This ensures it is unique for this particular query and
		 * will be the same when it is repeated. It's not just for this instance of ICAREST.
		 * This ensures each different query is segregated into its own cache, meaning that the results can be shared between
		 * invocations, but avoiding the scenario of an ever growing cache.
		 */
		try {
			byte [] digest = SecurityManager.getChecksumMD5((vtiArgs + getVTIProperty(PROP_SCHEMA) + fetchBatchSize).getBytes());
			StringBuilder sb = new StringBuilder();
			for (byte b : digest) {
				sb.append(Integer.toHexString(0x100 + (b & 0xff)).substring(1));
			}
			this.setExtension(this.getClass().getSimpleName() + "_" + sb.toString());
		} catch(NoSuchAlgorithmException e) {
			//MD5 not available
		}

		/*
		 * Get the size of the fetch buffer (how many fetch batches to have in memory; subsequent ICAREST requests
		 * will wait until the buffer has more space).
		 * Use the value of ICAREST.fetchbuffersize from the config, if set - else will use default.
		 */
		try {
			String vtiProperty = getVTIProperty(FETCH_BUFFER_SIZE);
			int fetchBufferSizeFromConfig = Integer.parseInt(vtiProperty);

			if(fetchBufferSizeFromConfig > 0) {
				fetchBufferSize = fetchBufferSizeFromConfig;
			}
		} catch (NumberFormatException e) {
			//Just log exception - default will be used
			logger.logException(GDBMessages.DSWRAPPER_ICAREST_INVALID_FETCH_BUFFER_SIZE_PARAMETER,
					"An error occurred while attempting to read in the value for the fetch buffer size. "
					+ "This should be specified by the ICAREST." + FETCH_BUFFER_SIZE + " parameter in the config file. "
					+ "The default value (" + FETCH_BUFFER_SIZE_DEFAULT + ") will be used.", e);
		} catch(Exception e) {
			//Just log (at low level) - default will be used
			logger.logDetail("No value has been specified for the ICAREST." + FETCH_BUFFER_SIZE + " parameter in the config file. "
					+ "The default value (" + FETCH_BUFFER_SIZE_DEFAULT + ") will be used.");
		}
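		/*
		 * Minimal config sketch for the buffer size read above (property name taken from the log
		 * messages; the exact key expected by getVTIProperty() is an assumption):
		 *
		 *   ICAREST.fetchbuffersize=40
		 */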
		// Create bucket to fill with results for derby to fetch from (a fetch buffer)
		fetchBuffer = new LinkedBlockingDeque<DataValueDescriptor[][]>( fetchBufferSize);

		//Instantiate the work queue for the buffer populator
		bufferPopulatorWorkQueue = new LinkedBlockingDeque<String>();

		//Get the DVDR[] template for result rows
		resultRowTemplate = getMetaData().getRowTemplate();

		// encode incoming uris (from vtiArgs) for doctext and docbytes functions
		if ( isDocBytes || isDocText ) {
			// constructor args contains url arg labels - these need splitting up
			if ( 1 > replacements.size() )
				throw new Exception("Unable to execute ICAREST function '" + getPrefix() + "': Missing URI argument in VTI constructor");

			// Don't encode the bits before the actual URI. i.e. "&collection=<collection>&uri=" must be unchanged
			String arg = replacements.get(0);
			String uriTag = "&uri=";
			int uriTagIndex = arg.indexOf( uriTag );
			String collection = -1 == uriTagIndex ? "" : arg.substring( 0, uriTagIndex );
			String uri = -1 == uriTagIndex ? arg : arg.substring( uriTagIndex + uriTag.length() );
			replacements.set(0, collection + uriTag + URLEncoder.encode( uri, Charset.defaultCharset().name() ));

		} else {
			// No url arg labels (e.g. '&uri') in the replacements - encode args fully
			for( int i=0; i<replacements.size(); i++ )
				replacements.set(i, URLEncoder.encode( replacements.get(i), Charset.defaultCharset().name() ));
		}

		//Load the SQL result filter - if there is one
		sqlResultFilter = GaianDBConfig.getSQLResultFilter();

		//If there's a filter
		if(sqlResultFilter != null) {
			policyFilterDefined = true;

			//If it's a ...FilterX - assign vars appropriately
			if(sqlResultFilter instanceof SQLResultFilterX) {
				sqlResultFilterX = (SQLResultFilterX)sqlResultFilter;
				sqlResultFilter = null;

				//Also, ask the policy for the max source rows to return
				maxSourceRows = sqlResultFilterX.setDataSourceWrapper(vtiClassName);
			}
		}
	}

	public boolean executeAsFastPath() throws StandardException, SQLException {
		logger.logInfo("Entered executeAsFastPath()");

		if(queryRunning) {
			logger.logImportant("The query is already running - no need to re-execute.");
		} else {
			//Kick off the query worker thread
			new Thread(new Runnable() {
				@Override
				public void run() {
					logger.logInfo("Query worker thread started");
					runQuery();
					logger.logInfo("Query worker thread ended");
				}
			}, ICA_FETCHER_NAME + " for ICAREST instance " + instanceId).start();

			/*
			 * If caching has not been disabled:
			 *     Kick off the buffer populator thread.
			 * Else:
			 *     We don't need the buffer populator, as there's nowhere to cache ICAREST results.
			 *     FetchBuffer population will halt when it's full.
			 */
			if(!cachingExplicitlyDisabled) {
				new Thread(new Runnable() {
					@Override
					public void run() {
						logger.logInfo("Buffer populator thread started");
						populateBuffer();
						logger.logInfo("Buffer populator thread ended");
					}
				}, BUFFER_POPULATOR_NAME + " for ICAREST instance " + instanceId).start();
			}
		}

		return true; // never return false - derby calls executeQuery() if you do
	}
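	/*
	 * Threading overview (descriptive note): executeAsFastPath() starts the ICARESTFetcher thread,
	 * which runs runQuery() to page results out of ICA and (unless caching is disabled) write them
	 * to the durable cache; the BufferPopulator thread reads cached rows back and offers batches
	 * onto fetchBuffer; Derby consumes them via nextRow(). An empty batch taken while queryRunning
	 * is false signals end of results.
	 */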
	public int nextRow(DataValueDescriptor[] dvdr) throws StandardException, SQLException {

		/*
		 * Loop until we get a valid row (that matches any qualifiers).
		 * Or until end of results (when we return directly).
		 */
		boolean gotRow = false;
		while(!gotRow) {

			/*
			 * While we don't have a batch from the buffer
			 */
			while(currentResultBatch == null || currentResultBatchIndex >= currentResultBatch.length) {
				try {
					currentResultBatch = fetchBuffer.takeFirst();
					currentResultBatchIndex = 0;

					/*
					 * If:
					 * - the query is flagged as no longer running
					 * - AND we get an empty batch
					 * - AND the fetch buffer is now empty
					 * Then we've reached the end of results
					 */
//					if ( 0 == currentResultBatch.length ) logger.logInfo("====>>>> takeFirst() returned empty batch. queryRunning: " + queryRunning + ", fetchBuffer.size(): " + fetchBuffer.size());
					if(!queryRunning && currentResultBatch.length == 0 && fetchBuffer.isEmpty()) {
						return IFastPath.SCAN_COMPLETED;
					}
				} catch (InterruptedException e) {
					logger.logException( GDBMessages.ENGINE_NEXT_ROW_ERROR, "Caught Exception in nextRow() (returning SCAN_COMPLETED): ", e );
					return IFastPath.SCAN_COMPLETED;
				}
			}

			/*
			 * At this point we should have the next non-empty batch.
			 * If qualifiers are set, test whether this row matches:
			 * If so: Copy into dvdr & flag to return
			 * Else: Loop around again looking for another row that is good.
			 */
			DataValueDescriptor[] currentResult = currentResultBatch[currentResultBatchIndex];
			if(qualifiers == null || RowsFilter.testQualifiers( currentResult, qualifiers )) {
				System.arraycopy(currentResult, 0, dvdr, 0, currentResult.length);
				gotRow = true;
			}

			currentResultBatchIndex++;
		}

		return IFastPath.GOT_ROW;
	}

	public int getRowCount() throws Exception {
//		return doc.getElementsByTagName("es:result").getLength();
		return results.getLength();
	}

//	public InputStream getDocStreamResult() {
//		return docStream;
//	}

	public void setQualifiers(VTIEnvironment vtie, Qualifier[][] qual) throws SQLException {
		//Set the qualifiers for use in nextRow later
		this.qualifiers = qual;
	}

	public boolean isBeforeFirst() {
		return 0 == currentRow;
	}

	//SDN - Not sure if required - 04/12
//	public class MyResetableStream extends InputStream implements Resetable {
//
//		private InputStream is = null;
//
//		private int[] cachedBytes = new int[1000];
//		private int numBytesCached = 0, pos = 0;
//
//		public MyResetableStream(InputStream is) {
//			super();
//			this.is = is;
//			logger.logInfo("Entered MyResetableStream(); Mark isSupported ? " + is.markSupported());
////			if ( is.markSupported() ) is.mark(1000);
//		}
//
//		public void closeStream() {
//			logger.logInfo("Entered closeStream()");
//			try {
//				is.close();
//			} catch (IOException e) {
//				logger.logException(GDBMessages.DSWRAPPER_STREAM_CLOSE_ERROR_IO, "Unable to close Stream", e);
//			}
//		}
//
//		public void initStream() throws StandardException {
//			logger.logInfo("Entered initStream()");
//			numBytesCached = 0; pos = 0;
//		}
//
//		public void resetStream() throws IOException, StandardException {
//			logger.logInfo("Entered resetStream(), numBytesCached = " + numBytesCached);
////			if ( is.markSupported() ) is.reset();
//			pos = 0;
//		}
//
//		@Override
//		public int read() throws IOException {
//			logger.logInfo("Entered read()");
//			if ( pos < numBytesCached ) return cachedBytes[pos++];
//			int b = is.read();
//			if ( numBytesCached < cachedBytes.length ) cachedBytes[numBytesCached++] = b;
//			return b;
//		}
//	}

	public double getEstimatedCostPerInstantiation(VTIEnvironment arg0) throws SQLException {
		return 0;
	}

	public double getEstimatedRowCount(VTIEnvironment arg0) throws SQLException {
		return 1; // encourage Derby to treat this VTI as the inner table in joins
	}

	public boolean supportsMultipleInstantiations(VTIEnvironment arg0) throws SQLException {
		return false;
	}

	@Override
	public void close() throws SQLException {
		super.close();
		markCacheInUse(false);
		cleanUpCaches();
	}
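	/*
	 * runQuery() below resolves the target URL via getVTIPropertyWithReplacements(), then loops:
	 * for each batch it either reads straight from the durable cache (keyed by CACHEID, the hash of
	 * the resolved URL) or issues the HTTP request, maps the response into result rows and caches
	 * them. Completed batches are handed to the BufferPopulator via bufferPopulatorWorkQueue, or
	 * offered directly to fetchBuffer when caching is disabled.
	 */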
	private void runQuery() {

		queryRunning = true;

		try {
			// Get the given VTI property, and substitute any arguments $0, $1, $2,.. with values passed in to the VTI call.
			//urlString = escapePercentSymbolsAndSpaces( getVTIPropertyWithReplacements( PROP_URL ) );
			urlString = getVTIPropertyWithReplacements(PROP_URL); // For doctext and docbytes, URI is now encoded in the constructor
		} catch (Exception e) {
			logger.logImportant("Ignoring ICAREST '"+getPrefix()+"' query: " + e.getMessage());

			//Notify the buffer populator that we've finished querying ICAREST
			//by putting the END_OF_RESULTS_IDENTIFIER to the buffer populator work queue
			synchronized (bufferPopulatorWorkQueue) {
				/*
				 * Loop trying to put to the queue - this is to avoid us stopping due to InterruptedException.
				 * WARNING: This 'could' potentially loop forever.
				 */
				boolean offerSuccess = bufferPopulatorWorkQueue.offerLast(END_OF_RESULTS_IDENTIFIER);
				while(!offerSuccess) {
					try {
						offerSuccess = bufferPopulatorWorkQueue.offerLast(END_OF_RESULTS_IDENTIFIER, 100, TimeUnit.MILLISECONDS);
					} catch (InterruptedException e1) {
						//Don't care
					}
				}
			}

			queryRunning = false;

			//Return - as no point trying to do any work
			return;
		}

		//Mark cache as logically in use
		markCacheInUse(true);

		cacheErrors.clear();
		fieldModificationCountMap.clear();

		Lock cacheModifyLock = null;
		if(!cachingExplicitlyDisabled) {
			cacheModifyLock = getCacheModifyLock();

			/*
			 * Attempt to get the cache modify lock for this query.
			 *
			 * Note: it doesn't matter who gets here first. The first
			 * guy will do the heavy lifting and subsequent folks will
			 * wait on the lock and then read from the cache when free.
			 */
			cacheModifyLock.lock();
		}

		try {
			newStartIndex = 0;
			currentRow = 0;

			//Flag of whether any data was cached as part of this query
			boolean dataCached = false;

			while(queryRunning) {

				/*
				 * Build the urlToUse for this part of the query
				 * NOTE: we do this here so that the cache check works correctly.
				 */
				String urlToUse = urlString;

				//If doing COUNT
				//Only need to request 0 results - as can use totalResults value from return
				if (isCount) {
					int indexOfResultsParam = urlToUse.indexOf(RESULTS_QUERY_PARAM);

					if(indexOfResultsParam == -1) {
						urlToUse = urlToUse + RESULTS_QUERY_PARAM + 0;
					} else {
						urlToUse = urlToUse.replaceAll("\\Q" + RESULTS_QUERY_PARAM + "\\E\\d*", RESULTS_QUERY_PARAM + 0);
					}
				}
				//If doing SEARCH
				//Need to add batch size and start index
				else if(isSearch) {
					/*
					 * Add rowBatchSize to query as &results= (overwriting any existing value).
					 */
					int indexOfResultsParam = urlToUse.indexOf(RESULTS_QUERY_PARAM);

					if(indexOfResultsParam == -1) {
						urlToUse = urlToUse + RESULTS_QUERY_PARAM + fetchBatchSize;
					} else {
						urlToUse = urlToUse.replaceAll("\\Q" + RESULTS_QUERY_PARAM + "\\E\\d*", RESULTS_QUERY_PARAM + fetchBatchSize);
					}

					/*
					 * Add startIndex to query as &start= (overwriting any existing value).
					 */
					int indexOfStartParam = urlToUse.indexOf(START_QUERY_PARAM);

					if(indexOfStartParam == -1) {
						urlToUse = urlToUse + START_QUERY_PARAM + newStartIndex;
					} else {
						urlToUse = urlToUse.replaceAll("\\Q" + START_QUERY_PARAM + "\\E\\d*", START_QUERY_PARAM + newStartIndex);
					}
				}
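				/*
				 * Paging sketch (assumed values): with fetchBatchSize = 10, successive search requests
				 * are issued with '...&results=10&start=0', then '&start=10', '&start=20', and so on,
				 * until a batch comes back smaller than fetchBatchSize.
				 */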
				/*
				 * Create a reusable array to fill with batches of results to work with.
				 */
				DataValueDescriptor[][] resultBatch = new DataValueDescriptor[fetchBatchSize][];
				for (int i=0; i < resultBatch.length; i++) {
					//Create a new 'row'
					DataValueDescriptor[] nextRow = new DataValueDescriptor[resultRowTemplate.length];

					//Fill the new row with empty copies of every DataValueDescriptor type in the rowTemplate
					for ( int j=0; j < resultRowTemplate.length; j++ ) {
						nextRow[j] = resultRowTemplate[j].getNewNull();
					}

					//Place the new holder row into the result batch
					resultBatch[i] = nextRow;
				}

				/*
				 * Get the results.
				 */
				int resultsInThisBatchBeforeFiltering = 0;
				try {
					if ( isCached( "CACHEID="+urlToUse.hashCode() ) ) {

						logger.logImportant("Data is cached - no need to run ICAREST query");

						// While more results from cache && we've not hit the batch limit
						while( resultsInThisBatchBeforeFiltering < fetchBatchSize
								&& (nextRowFromCache(resultBatch[resultsInThisBatchBeforeFiltering])) != SCAN_COMPLETED) {
							resultsInThisBatchBeforeFiltering++;
							currentRow++;
						}

						//If search, then need to inflate start index - as if we were doing a real query
						if(isSearch) {
							newStartIndex = newStartIndex + resultsInThisBatchBeforeFiltering;
						}

					} else {

						if(isSearch) {

							//If max source rows hit - don't bother to query
							//Note: there is also a check further down - as we may hit the limit in the middle of a batch of results
							if(-1 < maxSourceRows && currentRow >= maxSourceRows) {
								logger.logWarning(GDBMessages.DSWRAPPER_ICAREST_PARTIAL_RESULT, "The raw ICAREST Query has been restricted to a maximum of " + maxSourceRows + " results. (search top)");
							} else {

								logger.logImportant("Opening URL: " + urlToUse);

								URLConnection urlc = new URL(urlToUse).openConnection();
								urlc.setRequestProperty("Accept-encoding", "gzip,deflate");

								String usr = getVTIPropertyNullable("username");
								String pwd = getVTIPropertyNullable("password");

								if ( null != usr && null != pwd ) {
									// Use this code for Authentication? (NOT TESTED YET)
									String encoded = new String( new BASE64Encoder().encode(new String( usr + ':' + pwd ).getBytes()) );
									urlc.setRequestProperty("Proxy-Authorization", "Basic " + encoded);
								} else
									logger.logInfo("Unable to find username and password properties for ICAREST. No authentication parms will be passed");

								Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(urlc.getInputStream());

								itemsPerPage = Integer.parseInt( doc.getElementsByTagName("es:itemsPerPage").item(0).getTextContent() );
								totalResults = Integer.parseInt( doc.getElementsByTagName("es:totalResults").item(0).getTextContent() );

								if ( 1 > itemsPerPage ) {
									logger.logImportant("Retrieved " + itemsPerPage + " search results, totalResults = " + totalResults);
								}

								//Note: startIndex may not be present - if we've hit the end of the results
								NodeList startIndexElements = doc.getElementsByTagName("es:startIndex");
								if(startIndexElements.getLength() > 0) {
									startIndex = Integer.parseInt( doc.getElementsByTagName("es:startIndex").item(0).getTextContent() ); // inclusive
									newStartIndex = startIndex + itemsPerPage;
								}
								//Else we've hit the end of the results - no need to increment startIndex

								results = doc.getElementsByTagName("es:result");

								logger.logImportant("Retrieved " + itemsPerPage + " search results, records " + (startIndex+1) + "-" + (startIndex+itemsPerPage) + " of " + totalResults + " results.");
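								/*
								 * The loop below maps each es:result child element onto the search schema:
								 * es:title -> DTITLE, es:relevance -> RELEVANCE, es:summary -> DCONTEXT,
								 * es:id -> the document URI (used for DURL and DNUM), and an es:link /
								 * es:thumbnail href is mined for the collection name.
								 */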
								// While more results
								// AND we've not hit the batch limit
								while(resultsInThisBatchBeforeFiltering < fetchBatchSize && resultsInThisBatchBeforeFiltering < results.getLength()) {

									// If we've hit the max source rows limit - warn and stop looping
									// Note: there's also a check above to rule out queries if we can
									if(-1 < maxSourceRows && currentRow >= maxSourceRows) {
										logger.logWarning(GDBMessages.DSWRAPPER_ICAREST_PARTIAL_RESULT, "The raw ICAREST Query has been restricted to a maximum of " + maxSourceRows + " results. (search mid)");
										break;
									} else {

										Node item = results.item(resultsInThisBatchBeforeFiltering);
										NodeList iel = item.getChildNodes();

										//Set CACHEID
										resultBatch[resultsInThisBatchBeforeFiltering][5].setValue( urlToUse.hashCode() );

										int numFieldsFound = 0;
										boolean isFoundCollectionName = false;
										String icaURI = null, collectionName = null;

										for (int k = 0; k < iel.getLength() && 5 > numFieldsFound; k++) {

											if (iel.item(k).getNodeType() == Element.ELEMENT_NODE) {

												Element child = (Element) iel.item(k);
												String tagName = child.getTagName();

												if ( tagName.equals("es:title") ) resultBatch[resultsInThisBatchBeforeFiltering][0].setValue(child.getTextContent());
												else if ( tagName.equals("es:relevance") ) resultBatch[resultsInThisBatchBeforeFiltering][3].setValue(child.getTextContent());
												else if ( tagName.equals("es:summary") ) resultBatch[resultsInThisBatchBeforeFiltering][4].setValue(child.getTextContent());
												else if ( tagName.equals("es:id") ) icaURI = child.getTextContent();
												else if ( isFoundCollectionName ) continue;
												else {
													// Search for collection name
													String id =
														tagName.equals("es:link") && child.getAttribute("rel").equals("alternate") ? child.getAttribute("href") :
														tagName.equals("es:thumbnail") ? child.getAttribute("href") :
														tagName.equals("es:link") && child.getAttribute("rel").equals("via") ? child.getAttribute("href") : null;

													if ( null == id ) continue;

													int idx = id.indexOf('?');
													if ( -1 == idx ) {
														logger.logWarning(GDBMessages.DSWRAPPER_DOC_URI_NOT_FOUND, "Unable to find doc URI in an href link of search results, tag: " + tagName + ", href: " + id);
														continue;
													}

													String urlArgs = id.substring(idx+1);
													String uriTag = "&uri=";
													int idxURI = urlArgs.indexOf(uriTag);
													if ( -1 == idxURI ) {
														logger.logWarning(GDBMessages.DSWRAPPER_URI_ARG_NOT_FOUND, "Unable to find 'uri' argument in href link, tag: " + tagName + ", href: " + id);
														continue;
													}
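													/*
													 * Illustrative href shape (assumed example):
													 *   ...?collection=HUMINT&uri=file:///C:/docs/report.doc
													 * The text between 'collection=' and '&uri=' is taken as the collection name below.
													 */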
													String collectionTag = "collection=";
													collectionName = urlArgs.startsWith(collectionTag) ? urlArgs.substring(collectionTag.length(), idxURI) : null;

													if ( null == collectionName ) {
														// 'collection' is not the first argument
														collectionTag = '?' + collectionTag;
														int idxCol = urlArgs.indexOf(collectionTag);
														if ( -1 == idxCol ) {
															logger.logWarning(GDBMessages.DSWRAPPER_COLLECTION_ARG_NOT_FOUND, "Unable to find 'collection' argument in href link, tag: " + tagName + ", href: " + id);
															continue;
														}
														collectionName = urlArgs.substring( idxCol + collectionTag.length(), idxURI );
													}

													isFoundCollectionName = true;
												}

												numFieldsFound++; // increment numFields found only if we found a new field (otherwise 'continue;' will have been called)

											} // if the child is an element node
										} // for all children

										resultBatch[resultsInThisBatchBeforeFiltering][1].setValue( collectionName + "&uri=" + icaURI); // return this as URI for now - later the collection and uri should be separated
										resultBatch[resultsInThisBatchBeforeFiltering][2].setValue( icaURI.hashCode() );

										//Validate and modify the result row so it matches the resultset schema
										validateAndModifyToSchema(resultBatch[resultsInThisBatchBeforeFiltering], fieldModificationCountMap);

										cacheRow(resultBatch[resultsInThisBatchBeforeFiltering], cacheErrors);
										dataCached = true;

										resultsInThisBatchBeforeFiltering++;
										currentRow++;
									}
								}
							}
						}
						else if (isCount) {

							//Note: urlToUse will have been modified before the cache check
							logger.logImportant("Opening URL: " + urlToUse);

							URLConnection urlc = new URL(urlToUse).openConnection();
							urlc.setRequestProperty("Accept-encoding", "gzip,deflate");

							Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(urlc.getInputStream());

							int count = Integer.parseInt( doc.getElementsByTagName("es:totalResults").item(0).getTextContent() );

							resultBatch[resultsInThisBatchBeforeFiltering][0].setValue(count);
							resultBatch[resultsInThisBatchBeforeFiltering][1].setValue(urlToUse.hashCode());

							//Validate and modify the result row so it matches the resultset schema
							validateAndModifyToSchema(resultBatch[resultsInThisBatchBeforeFiltering], fieldModificationCountMap);

							cacheRow(resultBatch[resultsInThisBatchBeforeFiltering], cacheErrors);
							dataCached = true;

							resultsInThisBatchBeforeFiltering++;
							currentRow++;
						}
						else if (isDocText) {

							if(maxSourceRows == 0) {
								logger.logWarning(GDBMessages.DSWRAPPER_ICAREST_PARTIAL_RESULT, "The raw ICAREST Query has been restricted to a maximum of " + maxSourceRows + " results. (doctext)");
							} else {

								logger.logImportant("Opening URL: " + urlToUse);

								URLConnection urlc = new URL(urlToUse).openConnection();
								urlc.setRequestProperty("Accept-encoding", "gzip,deflate");

								final char[] buf = new char[0x10000];
								StringBuilder out = new StringBuilder();
								Reader in = new InputStreamReader(urlc.getInputStream());
								int numBytes;
								while ( (numBytes = in.read(buf, 0, buf.length)) > 0 ) {
									out.append(buf, 0, numBytes);
								}

								docText = out.toString();

								logger.logImportant("Retrieved doctext, num chars: " + docText.length());

								resultBatch[resultsInThisBatchBeforeFiltering][0].setValue( docText.length() );
								resultBatch[resultsInThisBatchBeforeFiltering][1].setValue( docText );
								resultBatch[resultsInThisBatchBeforeFiltering][2].setValue( urlToUse.hashCode() );

								//Validate and modify the result row so it matches the resultset schema
								validateAndModifyToSchema(resultBatch[resultsInThisBatchBeforeFiltering], fieldModificationCountMap);

								cacheRow(resultBatch[resultsInThisBatchBeforeFiltering], cacheErrors);
								dataCached = true;

								resultsInThisBatchBeforeFiltering++;
								currentRow++;
							}
						}
						else if (isDocBytes) {

							if(maxSourceRows == 0) {
								logger.logWarning(GDBMessages.DSWRAPPER_ICAREST_PARTIAL_RESULT, "The raw ICAREST Query has been restricted to a maximum of " + maxSourceRows + " results. (docbytes)");
							} else {

								logger.logImportant("Opening URL: " + urlToUse);

								URLConnection urlc = new URL(urlToUse).openConnection();
								urlc.setRequestProperty("Accept-encoding", "gzip,deflate");

								docName = null;
								String fInfo = urlc.getHeaderField("Content-Disposition");
								if ( null != fInfo ) {
									// we have a file name
									String fTag = "filename=";
									int idx = fInfo.indexOf(fTag);
									if ( -1 != idx ) {
										docName = fInfo.substring(idx+fTag.length()+1, fInfo.length()-1);
										logger.logInfo("Content-Disposition filename: " + docName);
									}
								}

								docType = urlc.getContentType();
								logger.logInfo("Content-type: " + docType);
								logger.logInfo("Content-encoding: " + urlc.getContentEncoding());

								ByteArrayOutputStream baos = new ByteArrayOutputStream();
								InputStream is = urlc.getInputStream();
								Util.copyBinaryData(is, new GZIPOutputStream(baos));
								docBytes = baos.toByteArray();
								baos.close(); // other streams are closed

								logger.logImportant("Retrieved and zipped doc bytes from ICA InputStream, numbytes = " + docBytes.length);
								logger.logInfo("Setting document docbytes row, numbytes " + docBytes.length);

								resultBatch[resultsInThisBatchBeforeFiltering][0].setValue( docName );
								resultBatch[resultsInThisBatchBeforeFiltering][1].setValue( docType );
								resultBatch[resultsInThisBatchBeforeFiltering][2].setValue( docBytes.length );
								resultBatch[resultsInThisBatchBeforeFiltering][3].setValue( docBytes );
								resultBatch[resultsInThisBatchBeforeFiltering][4].setValue( urlToUse.hashCode() );

								//Validate and modify the result row so it matches the resultset schema
								validateAndModifyToSchema(resultBatch[resultsInThisBatchBeforeFiltering], fieldModificationCountMap);

								cacheRow(resultBatch[resultsInThisBatchBeforeFiltering], cacheErrors);
								dataCached = true;

								resultsInThisBatchBeforeFiltering++;
								currentRow++;
							}
						}
					}
				} catch (Exception e) {
					logger.logException(GDBMessages.DSWRAPPER_ICAREST_ROW_FETCH_ERROR, "Unable to fetch row: ", e);
				}
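				/*
				 * Two hand-off paths for the batch just fetched: when caching is disabled the batch is
				 * policy-filtered here and offered straight onto fetchBuffer; otherwise only the CACHEID
				 * is queued for the BufferPopulator, which re-reads the rows from the cache table,
				 * filters them and offers them to fetchBuffer.
				 */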
				//Don't bother filtering if no results
				if(resultsInThisBatchBeforeFiltering != 0) {

					//If no caching
					//	put to buffer synchronously
					//	AND do filtering now
					if(cachingExplicitlyDisabled) {

						//If not a full batch - reduce the batch size to pass to the filter
						//Note: this should only happen at the tail end of the query - so no need to worry about re-expanding
						if(resultsInThisBatchBeforeFiltering < fetchBatchSize) {
							//Create temp reduced batch
							DataValueDescriptor[][] reducedBatch = new DataValueDescriptor[resultsInThisBatchBeforeFiltering][];

							//Copy just the filled rows into the reduced batch
							System.arraycopy(resultBatch, 0, reducedBatch, 0, resultsInThisBatchBeforeFiltering);

							//Re-assign the resultBatch to the reduced version
							resultBatch = reducedBatch;

							logger.logDetail("Batched Filtering: Reduced final filtering batch to size: " + resultBatch.length);
						}

						/*
						 * Filter the batch.
						 */
						DataValueDescriptor[][] rb = filterBatch(resultBatch);
						if ( null != rb ) resultBatch = rb;

						boolean offerSuccess = fetchBuffer.offerLast(resultBatch);
						while(!offerSuccess) {
							try {
								offerSuccess = fetchBuffer.offerLast(resultBatch, 100, TimeUnit.MILLISECONDS);
							} catch (InterruptedException e) {
								//Don't care
							}
						}

					} else {
						/*
						 * Notify the bufferPopulator it has work to do.
						 * Note: the buffer populator will do the filtering.
						 */
						synchronized (bufferPopulatorWorkQueue) {
							String toOffer = "" + urlToUse.hashCode();

							boolean offerSuccess = bufferPopulatorWorkQueue.offerLast(toOffer);
							while(!offerSuccess) {
								try {
									offerSuccess = bufferPopulatorWorkQueue.offerLast(toOffer, 100, TimeUnit.MILLISECONDS);
								} catch (InterruptedException e) {
									//Don't care
								}
							}
						}
					}
				}

				//If not a SEARCH (as all other queries should be complete by now)
				//OR (if search) If the batch was not filled (before filtering)
				//then we've hit the end of the results - query finished
				if(!isSearch || resultsInThisBatchBeforeFiltering < fetchBatchSize) {
					queryRunning = false;
				}
			}

			/*
			 * Warn if any of the data was modified to match the schema
			 */
			if(!fieldModificationCountMap.isEmpty()) {
				Set<Entry<String,Object[]>> entrySet = fieldModificationCountMap.entrySet();
				for (Entry<String, Object[]> entry : entrySet) {
					logger.logWarning(GDBMessages.DSWRAPPER_ICAREST_DATA_TRUNCATION_OCCURRED, "Truncation of field '" + entry.getKey() + "' to '" + entry.getValue()[0]
							+ "' was performed; for ICAREST " + getPrefix() + " query. Occurrences: " + entry.getValue()[1]);
				}
			}

			//If no caching - put end of results indicator batch to fetchbuffer
			if(cachingExplicitlyDisabled){
				putEndOfResultsIndicatorBatch();
			} else {

				/*
				 * If any result caching errors occurred - log warnings and invalidate the cache.
				 */
				if(!cacheErrors.isEmpty()) {
					//Log warning
					StringBuffer cacheErrorMessage = new StringBuffer();
					cacheErrorMessage.append("The following SQL errors occurred while trying to cache the results (SQLState - Count): ");

					Set<Entry<String,Integer>> entrySet = cacheErrors.entrySet();
					for (Entry<String, Integer> entry : entrySet) {
						cacheErrorMessage.append(entry.getKey());
						cacheErrorMessage.append(" - ");
						cacheErrorMessage.append(entry.getValue());
						cacheErrorMessage.append(',');
					}
					//Remove trailing comma
					cacheErrorMessage.setLength(cacheErrorMessage.length() - 1);

					cacheErrorMessage.append("; for ICAREST ");
					cacheErrorMessage.append(getPrefix());
					cacheErrorMessage.append(" query.");

					logger.logWarning(GDBMessages.DSWRAPPER_ICAREST_CACHE_ROWS_ERROR, cacheErrorMessage.toString());

					//Also, invalidate the cache
					invalidateCache();
				}
				/*
				 * Else caching completed successfully
				 * If data was cached - and we didn't just read from the cache - update cache expiry values.
				 */
				else if(dataCached) {
					try {
						resetCacheExpiryTime();
					} catch (SQLException e) {
						logger.logException(GDBMessages.DSWRAPPER_ICAREST_EXECUTE_ERROR, "Unable to reset cache expiration time ", e);
					}
				}

				//Notify the buffer populator that we've finished querying ICAREST
				//by putting the END_OF_RESULTS_IDENTIFIER to the buffer populator work queue
				synchronized (bufferPopulatorWorkQueue) {
					/*
					 * Loop trying to put to the queue - this is to avoid us stopping due to InterruptedException.
					 * WARNING: This 'could' potentially loop forever.
					 */
					boolean offerSuccess = bufferPopulatorWorkQueue.offerLast(END_OF_RESULTS_IDENTIFIER);
					while(!offerSuccess) {
						try {
							offerSuccess = bufferPopulatorWorkQueue.offerLast(END_OF_RESULTS_IDENTIFIER, 100, TimeUnit.MILLISECONDS);
						} catch (InterruptedException e) {
							//Don't care
						}
					}
				}
			}

		} finally {
			if(!cachingExplicitlyDisabled) {
				/*
				 * Whatever happens, exception or clean flow, we've finished with the lock now.
				 * Release it so that others can use it.
				 * Otherwise it will remain locked forever I think.
				 */
				cacheModifyLock.unlock();
			}
		}
	}
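	/*
	 * populateBuffer() below is the BufferPopulator thread body: it peeks CACHEIDs off
	 * bufferPopulatorWorkQueue, re-reads the corresponding rows from the durable cache table,
	 * filters them and offers them onto fetchBuffer, removing each work item only once it has been
	 * handled; END_OF_RESULTS_IDENTIFIER makes it push the end-of-results batch and exit.
	 */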
	private void populateBuffer() {

		/*
		 * Loop 'forever'.
		 * Note: To end the thread, we return out of this method further down.
		 */
		while(true) {

			/*
			 * Peek for stuff from the work queue, waiting until we get something.
			 *
			 * Note: we do the peek here, and the remove at the end; so that if a
			 * problem occurs in this iteration:
			 * - we leave it on the queue for next time (Could this cause infinite loops?)
			 * - The work queue doesn't become prematurely empty (as this is used as part of the test
			 *   of whether to put directly to the fetchBuffer by runQuery)
			 */
			String fromWorkQueue;
			synchronized (bufferPopulatorWorkQueue) {
				fromWorkQueue = bufferPopulatorWorkQueue.peekFirst();
			}

			while(fromWorkQueue == null) {
				synchronized (bufferPopulatorWorkQueue) {
					fromWorkQueue = bufferPopulatorWorkQueue.peekFirst();
				}

				//Sleep so that someone else has a chance to synchronize before we loop again
				try {
					Thread.sleep(10);
				} catch (InterruptedException e) {
					//Don't care
				}
			}

			/*
			 * If we've hit the end of the results.
			 */
			if(fromWorkQueue.equals(END_OF_RESULTS_IDENTIFIER)) {
				putEndOfResultsIndicatorBatch();

				//Return - to shutdown this thread
				return;
			}
			/*
			 * Else something we need to populate into the fetchBuffer.
			 */
			else {
				Statement stmt = null;
				Connection c = null;
				GaianChildRSWrapper rsWrapper = null;

				try {
					c = getPooledLocalDerbyConnection();
					stmt = c.createStatement();

					/*
					 * Query the cache for the batch.
					 */
					ArrayList<DataValueDescriptor []> resultBatchList = new ArrayList<DataValueDescriptor[]>();

					ResultSet rs = stmt.executeQuery("SELECT " + getTableMetaData().getColumnNames()
							+ " FROM " + getCacheSchemaAndTableName() + " WHERE CACHEID = " + fromWorkQueue);
					rsWrapper = new GaianChildRSWrapper(rs);
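					/*
					 * For a search cache this statement expands to roughly the following (illustrative;
					 * the hash value and the schema/table name returned by getCacheSchemaAndTableName()
					 * are assumptions):
					 *
					 *   SELECT DTITLE, DURL, DNUM, RELEVANCE, DCONTEXT, CACHEID
					 *   FROM <cacheSchema>.<cacheTable> WHERE CACHEID = -1357291089
					 */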
					DataValueDescriptor[] nextRow;

					//Create a new 'row'
					nextRow = new DataValueDescriptor[resultRowTemplate.length];

					//Fill the new row with empty copies of every DataValueDescriptor type in the rowTemplate
					for ( int j=0; j < resultRowTemplate.length; j++ ) {
						nextRow[j] = resultRowTemplate[j].getNewNull();
					}

					//While more rows from 'cache'
					while(rsWrapper.fetchNextRow(nextRow)) {

						//Add to batch
						resultBatchList.add(nextRow);

						/*
						 * Create a new 'row'
						 * AND
						 * Fill the new row with empty copies of every DataValueDescriptor type in the rowTemplate
						 *
						 * Note: as we don't know how many we are likely to need, we do it one by one. I don't think this
						 * is any more inefficient than making a batch to begin with.
						 */
						nextRow = new DataValueDescriptor[resultRowTemplate.length];
						for ( int j=0; j < resultRowTemplate.length; j++ ) {
							nextRow[j] = resultRowTemplate[j].getNewNull();
						}
					}

					//If no results - then something odd has happened
					if(resultBatchList.isEmpty()) {
						logger.logWarning(GDBMessages.DSWRAPPER_ICAREST_CACHE_READ_EMPTY, "A read request from " + fromWorkQueue + " returned no results when results were expected.");
					} else {

						DataValueDescriptor[][] resultBatch = resultBatchList.toArray(new DataValueDescriptor[][]{});

						/*
						 * Filter the data.
						 */
						DataValueDescriptor[][] rb = filterBatch(resultBatch);
						if ( null != rb ) resultBatch = rb;

						/*
						 * Put result batch onto fetchBuffer for derby to consume (through nextRow()).
						 *
						 * Use looping offer, as this must happen.
						 */
						boolean offerSuccess = fetchBuffer.offerLast(resultBatch);
						while(!offerSuccess) {
							try {
								offerSuccess = fetchBuffer.offerLast(resultBatch, 100, TimeUnit.MILLISECONDS);
							} catch (InterruptedException e) {
								//Don't care
							}
						}
					}

					/*
					 * If we've got here without exception, then everything was done successfully.
					 * Now remove the element we peeked at from the front of the queue.
					 */
					synchronized (bufferPopulatorWorkQueue) {
						fromWorkQueue = bufferPopulatorWorkQueue.pollFirst();
					}

					while(fromWorkQueue == null) {
						synchronized (bufferPopulatorWorkQueue) {
							fromWorkQueue = bufferPopulatorWorkQueue.pollFirst();
						}

						//Sleep so that someone else has a chance to synchronize before we loop again
						try {
							Thread.sleep(10);
						} catch (InterruptedException e) {
							//Don't care
						}
					}

				} catch (SQLSyntaxErrorException e) {
					// SQLSyntaxErrorException typically occurs if the cache table doesn't exist. We may have run out of disk space. Need to abort buffer population.
					logger.logException(GDBMessages.DSWRAPPER_ICAREST_CACHE_READ_ERROR, "Unable to fetch from the cache table. SQLSyntaxErrorException - Aborting rows buffer population.", e);
					putEndOfResultsIndicatorBatch();
					return;
				} catch (SQLException e) {
					logger.logException(GDBMessages.DSWRAPPER_ICAREST_CACHE_READ_ERROR, "Unable to fetch from the cache table. Warning: This may result in a partial result.", e);
				} finally {
					try {
						logger.logInfo("Closing select stmt isNull? " + (null==stmt) + ", and recycling its connection isActive? " + (null != c && !c.isClosed()) );
						if ( null != stmt ) stmt.close();
						if ( null != c && !c.isClosed() ) recyclePooledLocalDerbyConnection(c);
						if ( null != rsWrapper ) rsWrapper.close();
					} catch ( SQLException e ) {
						logger.logWarning(GDBMessages.DSWRAPPER_RECYCLE_CONNECTION_ERROR, "Unable to recycle connection after reading from cache table");
					}
				}
			}
		}
	}

	/**
	 * Applies the configured SQL result filter policy (if any) to a batch of result rows.
	 * @param resultBatch
	 */
	private DataValueDescriptor[][] filterBatch(DataValueDescriptor[][] resultBatch) {

		if(policyFilterDefined) {

			//If batch filtering available
			if(sqlResultFilterX != null) {
				//Note: use vtiArgs (the args passed into the VTI) as the datasourceid
				resultBatch = sqlResultFilterX.filterRowsBatch(this.vtiArgs, resultBatch);
			}
			//Else if single filtering available
			else if(sqlResultFilter != null) {

				//Create temp batch representing the records the user is allowed - this has max size resultBatch.length
				//Note: records are only added to this (and hence the index is only incremented) when a user is allowed to see them
				DataValueDescriptor[][] allowedBatch = new DataValueDescriptor[resultBatch.length][];
				int allowedBatchIndex = 0;

				for(int i = 0; i < resultBatch.length; i++) {
					if(sqlResultFilter.filterRow(resultBatch[i])) {
						allowedBatch[allowedBatchIndex] = resultBatch[i];
						allowedBatchIndex++;
					}
				}

				//Make resultBatch (which gets reported) a reduced copy of the allowed batch
				resultBatch = Arrays.copyOf(allowedBatch, allowedBatchIndex);
			}
			//Else no filtering - which would be an error, as policyFilterDefined should not be true in that case
		}
		//Else no filtering

		return resultBatch;
	}

	/**
	 * Puts an empty batch on the end of the fetchBuffer to signal end of results to nextRow().
	 */
	private void putEndOfResultsIndicatorBatch() {

		/*
		 * Put an empty batch on the end of the fetchBuffer to indicate end of results
		 * - in case nextRow is still blocking on take.
		 * (This works in conjunction with queryRunning being false at this point - see nextRow())
		 *
		 * Use looping offer, as this must happen.
		 */
		DataValueDescriptor[][] emptyBatch = new DataValueDescriptor[0][];

		boolean offerSuccess = fetchBuffer.offerLast(emptyBatch);
		while(!offerSuccess) {
			try {
				offerSuccess = fetchBuffer.offerLast(emptyBatch, 100, TimeUnit.MILLISECONDS);
			} catch (InterruptedException e) {
				//Don't care
			}
		}
	}
}