KernelRunner.java example

Explorer

aparapi-clone-master
- com.amd.aparapi
  - src
    - java
      - com
        amd
        aparapi
        Config.java
        EXECUTION_MODE.java
        Kernel.java
        KernelMapping.java
        ProfileInfo.java
        Range.java
        annotation
        Constant.java
        Experimental.java
        Local.java
        OpenCLDelegate.java
        OpenCLMapping.java
        package-info.java
        device
        Device.java
        JavaDevice.java
        OpenCLDevice.java
        package-info.java
        exception
        DeprecatedException.java
        package-info.java
        internal
        annotation
        DocMe.java
        RemoveMe.java
        Unused.java
        UsedByJNICode.java
        exception
        AparapiException.java
        ClassParseException.java
        CodeGenException.java
        RangeException.java
        instruction
        BranchSet.java
        ExpressionList.java
        Instruction.java
        InstructionPattern.java
        InstructionSet.java
        InstructionTransformer.java
        jni
        ConfigJNI.java
        KernelArgJNI.java
        KernelRunnerJNI.java
        OpenCLJNI.java
        RangeJNI.java
        kernel
        KernelArg.java
        KernelRunner.java
        model
        ClassModel.java
        Entrypoint.java
        MethodModel.java
        opencl
        OpenCLArgDescriptor.java
        OpenCLKernel.java
        OpenCLLoader.java
        OpenCLMem.java
        OpenCLPlatform.java
        OpenCLProgram.java
        reader
        ByteBuffer.java
        ByteReader.java
        tool
        InstructionHelper.java
        InstructionViewer.java
        package-info.java
        util
        OpenCLUtil.java
        UnsafeWrapper.java
        writer
        BlockWriter.java
        KernelWriter.java
        opencl
        OpenCL.java
        OpenCLAdapter.java
        package-info.java
        package-info.java
- examples
  - effects
    - src
      - com
        amd
        aparapi
        examples
        effects
        Main.java
  - javaonedemo
    - src
      - com
        amd
        aparapi
        examples
        javaonedemo
        Life.java
        Mandel.java
        NBody.java
  - movie
    - src
      - com
        amd
        aparapi
        examples
        movie
        AparapiSolution.java
        ConvMatrix3x3Editor.java
        JJMPEGPlayer.java
        PureJavaSolution.java
        ReferenceSolution.java
  - nbody
    - src
      - com
        amd
        aparapi
        examples
        nbody
        Local.java
        Main.java
        Seq.java
  - oopnbody
    - src
      - com
        amd
        aparapi
        examples
        oopnbody
        Body.java
        Main.java
- samples
  - add
    - src
      - com
        amd
        aparapi
        sample
        add
        Main.java
  - blackscholes
    - src
      - com
        amd
        aparapi
        samples
        blackscholes
        Main.java
  - convolution
    - src
      - com
        amd
        aparapi
        sample
        convolution
        ConvMatrix3x3Editor.java
        Convolution.java
        ConvolutionOpenCL.java
        ConvolutionViewer.java
        PureJava.java
  - extension
    - src
      - com
        amd
        aparapi
        sample
        extension
        FFTExample.java
        Histogram.java
        HistogramIdeal.java
        MandelExample.java
        MandelSimple.java
        Pow4Example.java
        SquareExample.java
        StopWatch.java
        SwapExample.java
  - info
    - src
      - com
        amd
        aparapi
        sample
        info
        Main.java
  - life
    - src
      - com
        amd
        aparapi
        sample
        life
        Main.java
  - mandel
    - src
      - com
        amd
        aparapi
        sample
        mandel
        Main.java
        Main2D.java
  - mdarray
    - src
      - gov
        pnnl
        aparapi
        sample
        mdarray
        BMatMul1D.java
        BMatMul2D.java
        BMatMul3D.java
        DMatMul1D.java
        DMatMul2D.java
        DMatMul3D.java
        FMatMul1D.java
        FMatMul2D.java
        FMatMul3D.java
        IMatMul1D.java
        IMatMul2D.java
        IMatMul3D.java
        LMatMul1D.java
        LMatMul2D.java
        LMatMul3D.java
        MDArray.java
        SMatMul1D.java
        SMatMul2D.java
        SMatMul3D.java
        ZMatMul1D.java
        ZMatMul2D.java
        ZMatMul3D.java
  - squares
    - src
      - com
        amd
        aparapi
        sample
        squares
        Main.java
- test
  - codegen
    - src
      - java
        com
        amd
        aparapi
        CodeGenJUnitBase.java
        CreateJUnitTests.java
        Diff.java
        KernelHelper.java
        Source.java
        SwingDiff.java
        test
        Access2DIntArray.java
        AccessBooleanArray.java
        AccessByteArray.java
        AccessDoubleArray.java
        AccessFloatArray.java
        AccessIntArray.java
        AccessLongArray.java
        AccessNested2DIntArray.java
        AccessShortArray.java
        AndOrAndPrecedence.java
        AndOrPrecedence.java
        AndOrPrecedence2.java
        ArbitraryScope.java
        ArbitraryScope2.java
        ArbitraryScopeSimple.java
        ArrayTortureIssue35.java
        Assign2DIntArray.java
        AssignAndPassAsParameter.java
        AssignAndPassAsParameterSimple.java
        AssignField.java
        Atomic32Pragma.java
        BooleanToggle.java
        Break.java
        ByteParams.java
        ByteParamsSimple.java
        CallGetPassId.java
        CallObject.java
        CallObjectStatic.java
        CallRunSuper.java
        CallStaticInAnotherClass.java
        CallSuper.java
        CallTwice.java
        CharArrayField.java
        CharAsParameter.java
        CharType.java
        ClassHasStaticFieldAccess.java
        ClassHasStaticMethod.java
        ClassHasStaticMethodSimple.java
        CompositeArbitraryScope.java
        ConstantAssignInExpression.java
        Continue.java
        ContinueTorture.java
        DirectRecursion.java
        DoWhile.java
        Drem.java
        EarlyReturn.java
        EmptyWhileWithInc.java
        EntrypointRecursion.java
        Ex.java
        FirstAssignInExpression.java
        FirstAssignInExpression2.java
        FloatParams.java
        FloatParamsSimple.java
        For.java
        ForAnd.java
        ForAndMandel.java
        ForAndMandelNoInitialize.java
        ForAsFirst.java
        ForBooleanToggle.java
        ForBreak.java
        ForEach.java
        ForIf.java
        ForIfMandel.java
        Frem.java
        IEEERemainderDouble.java
        IEEERemainderFloat.java
        If.java
        IfAnd.java
        IfAndAnd.java
        IfAndAndAnd.java
        IfAndOrAnd.java
        IfBooleanAndAndAnd.java
        IfBooleanAndAndOr.java
        IfBooleanAndOrAnd.java
        IfBooleanAndOrOr.java
        IfBooleanOrAndAnd.java
        IfBooleanOrAndOr.java
        IfBooleanOrOrAnd.java
        IfBooleanOrOrOr.java
        IfElse.java
        IfElseAnd.java
        IfElseAndAndAnd.java
        IfElseIfElseIfElse.java
        IfElseNot__OrOr_And_.java
        IfElseOrOrAnd.java
        IfElseOrOrOr.java
        IfElse_And_Or_And.java
        IfElse_OrOr_And.java
        IfElse_Or_And_Or.java
        IfOr.java
        IfOrAndOr.java
        IfOrOr.java
        IfOrOrAnd.java
        IfOrOrOr.java
        If_IfElseIfElseElse_Else.java
        If_IfElse_Else.java
        If_IfElse_Else_IfElse_.java
        If_If_Else.java
        If_If_Else2.java
        If_If_Else_If_.java
        If_OrOr_And.java
        If_While_Else.java
        If_While_Else_While.java
        ImplementsInterface.java
        IncArrayArgContent.java
        IncField.java
        IndirectRecursion.java
        Interface.java
        LongCompare.java
        LongCompares.java
        Loops.java
        MathAbs.java
        MathDegRad.java
        MathFallThru.java
        MathMax.java
        MathMin.java
        MathRemainder.java
        MultiContinue.java
        MultipleAssign.java
        MultipleAssignExpr.java
        NaN.java
        NewLocalArray.java
        NonNullCheck.java
        NullCheck.java
        ObjectArrayCallHierarchy.java
        ObjectArrayCommonSuper.java
        ObjectArrayMemberAccess.java
        ObjectArrayMemberBadGetter.java
        ObjectArrayMemberBadSetter.java
        ObjectArrayMemberCall.java
        ObjectArrayMemberGetterSetter.java
        ObjectArrayMemberHierarchy.java
        ObjectArrayMemberNotFinal.java
        ObjectRefCopy.java
        ObjectWithinObject.java
        OrAndOrPrecedence.java
        OverloadMethod.java
        OverriddenKernelField.java
        PlayPen.java
        PostDecArrayItem.java
        PostDecByte.java
        PostDecLocal.java
        PostDecPostInc.java
        PostIncArrayIndexAndElement.java
        PostIncArrayItem.java
        PostIncArrayItemAsParameter.java
        PostIncArrayItemFieldIndex.java
        PostIncByte.java
        PostIncByteField.java
        PostIncInt.java
        PostIncIntField.java
        PostIncLocal.java
        PostIncLocalStandalone.java
        PostIncLocalTwice.java
        PreDecArrayIndexAndElement.java
        PreDecArrayItem.java
        PreDecPostInc.java
        PreIncArrayIndexAndElement.java
        PreIncArrayItem.java
        PreIncByte.java
        PreIncByteField.java
        PreIncInt.java
        PreIncIntField.java
        PreIncLocal.java
        PreIncLocalStandalone.java
        PreIncLocalTwice.java
        ReturnBooleanNewArray.java
        ReturnBooleanVarArray.java
        ReturnByteArrayNew.java
        ReturnByteArrayVar.java
        ReturnDoubleArrayNew.java
        ReturnDoubleArrayVar.java
        ReturnFloatArrayNew.java
        ReturnFloatArrayVar.java
        ReturnIntArrayNew.java
        ReturnIntArrayVar.java
        ReturnLongArrayNew.java
        ReturnLongArrayVar.java
        ReturnPostIncInt.java
        ReturnPreIncInt.java
        ReturnShortArrayNew.java
        ReturnShortArrayVar.java
        RightShifts.java
        Sequence.java
        StaticFieldStore.java
        StaticMethodCall.java
        SynchronizedMethods.java
        Ternary.java
        TernaryAnd.java
        TernaryAndOr.java
        TernaryNested.java
        TernaryOr.java
        TwoForLoops.java
        UnrelatedIfElsesWithCommonEndByte.java
        UnrelatedIfsWithCommonEndByte.java
        UnrelatedNestedIfElses.java
        UseObject.java
        UseObjectArrayLength.java
        UsesArrayLength.java
        UsesNew.java
        UsesThrow.java
        VarargsForEach.java
        VarargsSimple.java
        While.java
        WhileAndMandel.java
        WhileEmptyLoop.java
        WhileFloatCompound.java
        WhileIf.java
        WhileIfElse.java
        WhileWithoutMutator.java
        While_If_IfElseElse.java
        WideInc.java
        WideLoad.java
  - runtime
    - src
      - java
        com
        amd
        aparapi
        test
        runtime
        BufferTransfer.java
        CallStaticFromAnonymousKernel.java
        ExplicitBoolean.java
        ExplicitTransferTest.java
        Issue102.java
        Issue103.java
        Issue68.java
        Issue69.java
        LoadCL.java
        MultipleKernelCall.java
        RangeSize.java
        Test12x4_4x2.java
        UseStaticArray.java
        Util.java

/*
Copyright (c) 2010-2011, Advanced Micro Devices, Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
following conditions are met:

Redistributions of source code must retain the above copyright notice, this list of conditions and the following
disclaimer. 

Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided with the distribution. 

Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission. 

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export
laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 through
774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000.  Further, pursuant to Section 740.6 of the EAR,
you hereby certify that, except pursuant to a license granted by the United States Department of Commerce Bureau of 
Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export Administration 
Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in Country Groups D:1,
E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) export to Country Groups
D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced direct product is subject
to national security controls as identified on the Commerce Control List (currently found in Supplement 1 to Part 774
of EAR).  For the most current Country Group listings, or for additional information about the EAR or your obligations
under those regulations, please refer to the U.S. Bureau of Industry and Security's website at http://www.bis.doc.gov/. 

*/
package com.amd.aparapi.internal.kernel;

import com.amd.aparapi.*;
import com.amd.aparapi.Kernel.KernelState;
import com.amd.aparapi.annotation.Constant;
import com.amd.aparapi.annotation.Local;
import com.amd.aparapi.device.Device;
import com.amd.aparapi.device.OpenCLDevice;
import com.amd.aparapi.internal.exception.AparapiException;
import com.amd.aparapi.internal.exception.CodeGenException;
import com.amd.aparapi.internal.instruction.InstructionSet.TypeSpec;
import com.amd.aparapi.internal.jni.KernelRunnerJNI;
import com.amd.aparapi.internal.model.ClassModel;
import com.amd.aparapi.internal.model.Entrypoint;
import com.amd.aparapi.internal.util.UnsafeWrapper;
import com.amd.aparapi.internal.writer.KernelWriter;
import com.amd.aparapi.opencl.OpenCL;

import java.lang.reflect.Array;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.*;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * The class is responsible for executing <code>Kernel</code> implementations. <br/>
 *
 * The <code>KernelRunner</code> is the real workhorse for Aparapi.  Each <code>Kernel</code> instance creates a single
 * <code>KernelRunner</code> to encapsulate state and to help coordinate interactions between the <code>Kernel</code>
 * and its execution logic.<br/>
 *
 * The <code>KernelRunner</code> is created <i>lazily</i> as a result of calling <code>Kernel.execute()</code>. At this
 * time the <code>ExecutionMode</code> is consulted to determine the default requested mode.  This will dictate how 
 * the <code>KernelRunner</code> will attempt to execute the <code>Kernel</code>
 *
 * @author gfrost
 *
 */
public class KernelRunner extends KernelRunnerJNI {

   // Logger shared by every KernelRunner; its name is whatever Config.getLoggerName() returns.
   private static Logger logger = Logger.getLogger(Config.getLoggerName());

   // NOTE(review): presumably the device used by the most recent GPU execution; it is not referenced in the
   // visible part of this class -- confirm against the OpenCL execution path.
   private OpenCLDevice lastGPUExecutionDevice = null;
   // Keyed by Kernel subclass. NOTE(review): where entries are added/looked up is not visible here.
   private Map<Class<? extends Kernel>, KernelMapping> kernelMappingMap = new HashMap<Class<? extends Kernel>, KernelMapping>();

   // Pool that executeJava() submits its per-thread Runnables to; dispose() shuts it down with shutdownNow().
   private final ExecutorService threadPool = Executors.newCachedThreadPool();

   // The next three initializers depend on declaration order: the iterator is taken from executionModes and
   // executionMode starts out as the first element that iterator returns.
   // NOTE(review): currentMode.next() assumes getDefaultExecutionModes() never returns an empty set -- confirm.
   private final LinkedHashSet<EXECUTION_MODE> executionModes = EXECUTION_MODE.getDefaultExecutionModes();
   private Iterator<EXECUTION_MODE> currentMode = executionModes.iterator();
   private EXECUTION_MODE executionMode = currentMode.next();

   // Capability names tested by the hasXXXSupport() methods. There is no initializer, so this stays null until
   // other code assigns it; the hasXXXSupport() methods throw IllegalStateException while it is null.
   private Set<String> capabilitiesSet;

   // NOTE(review): presumably timing counters; their units and the places that update them are not visible in
   // this part of the file.
   private long accumulatedExecutionTime = 0;
   private long conversionTime = 0;
   private long executionTime = 0;

   // NOTE(review): presumably the objects the user has explicitly 'put' -- not referenced in the visible code; confirm.
   private final Set<Object> puts = new HashSet<Object>();

   // Handle handed to disposeKernelRunnerJNI() by dispose(); starts out as 0.
   private long kernelRunnerContextHandle = 0;


   /**
    * Releases the resources held by this <code>KernelRunner</code>.<br/>
    *
    * <code>Kernel.dispose()</code> delegates to <code>KernelRunner.dispose()</code>. When the current execution mode
    * is an OpenCL mode this delegates to <code>disposeKernelRunnerJNI()</code> to actually close JNI data structures;
    * the Java thread pool is shut down in every case.<br/>
    *
    * The pool shutdown sits in a <code>finally</code> block so that a failure while tearing down the JNI side
    * propagates to the caller without leaving the pool's worker threads behind.
    *
    * @see #disposeKernelRunnerJNI
    */
   public void dispose() {
      try {
         if (getExecutionMode().isOpenCL()) {
            disposeKernelRunnerJNI(kernelRunnerContextHandle);
         }
      } finally {
         // shutdownNow() rather than shutdown(): it also attempts to stop tasks that are still executing.
         threadPool.shutdownNow();
      }
   }

   /**
    * Looks up a single capability name in <code>capabilitiesSet</code>.<br/>
    *
    * Every <code>hasXXXSupport()</code> query below funnels through here so that the "not yet initialized" check
    * and its message exist in exactly one place.
    *
    * @param _capability
    *          The capability name to look for (callers pass one of the <code>OpenCL.CL_KHR_*</code> constants)
    * @return <code>true</code> if <code>capabilitiesSet</code> contains <code>_capability</code>
    * @throws IllegalStateException
    *          if <code>capabilitiesSet</code> is still <code>null</code>
    */
   private boolean hasCapability(String _capability) {
      if (capabilitiesSet == null) {
         throw new IllegalStateException("Capabilities queried before they were initialized");
      }
      return capabilitiesSet.contains(_capability);
   }

   /**
    * @return <code>true</code> if the capabilities set contains <code>OpenCL.CL_KHR_FP64</code>
    * @throws IllegalStateException if the capabilities set has not been initialized
    */
   boolean hasFP64Support() {
      return hasCapability(OpenCL.CL_KHR_FP64);
   }

   /**
    * @return <code>true</code> if the capabilities set contains <code>OpenCL.CL_KHR_SELECT_FPROUNDING_MODE</code>
    * @throws IllegalStateException if the capabilities set has not been initialized
    */
   boolean hasSelectFPRoundingModeSupport() {
      return hasCapability(OpenCL.CL_KHR_SELECT_FPROUNDING_MODE);
   }

   /**
    * @return <code>true</code> if the capabilities set contains <code>OpenCL.CL_KHR_GLOBAL_INT32_BASE_ATOMICS</code>
    * @throws IllegalStateException if the capabilities set has not been initialized
    */
   boolean hasGlobalInt32BaseAtomicsSupport() {
      return hasCapability(OpenCL.CL_KHR_GLOBAL_INT32_BASE_ATOMICS);
   }

   /**
    * @return <code>true</code> if the capabilities set contains <code>OpenCL.CL_KHR_GLOBAL_INT32_EXTENDED_ATOMICS</code>
    * @throws IllegalStateException if the capabilities set has not been initialized
    */
   boolean hasGlobalInt32ExtendedAtomicsSupport() {
      return hasCapability(OpenCL.CL_KHR_GLOBAL_INT32_EXTENDED_ATOMICS);
   }

   /**
    * @return <code>true</code> if the capabilities set contains <code>OpenCL.CL_KHR_LOCAL_INT32_BASE_ATOMICS</code>
    * @throws IllegalStateException if the capabilities set has not been initialized
    */
   boolean hasLocalInt32BaseAtomicsSupport() {
      return hasCapability(OpenCL.CL_KHR_LOCAL_INT32_BASE_ATOMICS);
   }

   /**
    * @return <code>true</code> if the capabilities set contains <code>OpenCL.CL_KHR_LOCAL_INT32_EXTENDED_ATOMICS</code>
    * @throws IllegalStateException if the capabilities set has not been initialized
    */
   boolean hasLocalInt32ExtendedAtomicsSupport() {
      return hasCapability(OpenCL.CL_KHR_LOCAL_INT32_EXTENDED_ATOMICS);
   }

   /**
    * @return <code>true</code> if the capabilities set contains <code>OpenCL.CL_KHR_INT64_BASE_ATOMICS</code>
    * @throws IllegalStateException if the capabilities set has not been initialized
    */
   boolean hasInt64BaseAtomicsSupport() {
      return hasCapability(OpenCL.CL_KHR_INT64_BASE_ATOMICS);
   }

   /**
    * @return <code>true</code> if the capabilities set contains <code>OpenCL.CL_KHR_INT64_EXTENDED_ATOMICS</code>
    * @throws IllegalStateException if the capabilities set has not been initialized
    */
   boolean hasInt64ExtendedAtomicsSupport() {
      return hasCapability(OpenCL.CL_KHR_INT64_EXTENDED_ATOMICS);
   }

   /**
    * @return <code>true</code> if the capabilities set contains <code>OpenCL.CL_KHR_3D_IMAGE_WRITES</code>
    * @throws IllegalStateException if the capabilities set has not been initialized
    */
   boolean has3DImageWritesSupport() {
      return hasCapability(OpenCL.CL_KHR_3D_IMAGE_WRITES);
   }

   /**
    * @return <code>true</code> if the capabilities set contains <code>OpenCL.CL_KHR_BYTE_ADDRESSABLE_SUPPORT</code>
    * @throws IllegalStateException if the capabilities set has not been initialized
    */
   boolean hasByteAddressableStoreSupport() {
      return hasCapability(OpenCL.CL_KHR_BYTE_ADDRESSABLE_SUPPORT);
   }

   /**
    * @return <code>true</code> if the capabilities set contains <code>OpenCL.CL_KHR_FP16</code>
    * @throws IllegalStateException if the capabilities set has not been initialized
    */
   boolean hasFP16Support() {
      return hasCapability(OpenCL.CL_KHR_FP16);
   }

   /**
    * @return <code>true</code> if the capabilities set contains <code>OpenCL.CL_KHR_GL_SHARING</code>
    * @throws IllegalStateException if the capabilities set has not been initialized
    */
   boolean hasGLSharingSupport() {
      return hasCapability(OpenCL.CL_KHR_GL_SHARING);
   }

   /**
    * Execute using a Java thread pool. Either because we were explicitly asked to do so, or because we 'fall back' after discovering an OpenCL issue.
    *
    * @param _range
    *          The globalSize requested by the user (via <code>Kernel.execute(globalSize)</code>)
    * @param _passes
    *          The # of passes requested by the user (via <code>Kernel.execute(globalSize, passes)</code>). Note this is usually defaulted to 1 via <code>Kernel.execute(globalSize)</code>.
    */
   private void executeJava(Kernel kernel, final Range _range, final int _passes) {
      if (logger.isLoggable(Level.FINE)) {
         logger.fine("executeJava: range = " + _range);
      }

      if (getExecutionMode().equals(EXECUTION_MODE.SEQ)) {
         /**
          * SEQ mode is useful for testing trivial logic, but kernels which use SEQ mode cannot be used if the
          * product of localSize(0..3) is >1.  So we can use multi-dim ranges but only if the local size is 1 in all dimensions. 
          *
          * As a result of this barrier is only ever 1 work item wide and probably should be turned into a no-op. 
          *
          * So we need to check if the range is valid here. If not we have no choice but to punt.
          */
         if ((_range.getLocalSize(0) * _range.getLocalSize(1) * _range.getLocalSize(2)) > 1) {
            throw new IllegalStateException("Can't run range with group size >1 sequentially. Barriers would deadlock!");
         }

         final Kernel kernelClone = kernel.clone();
         final KernelState kernelState = kernelClone.getKernelState();

         kernelState.setRange(_range);
         kernelState.setGroupId(0, 0);
         kernelState.setGroupId(1, 0);
         kernelState.setGroupId(2, 0);
         kernelState.setLocalId(0, 0);
         kernelState.setLocalId(1, 0);
         kernelState.setLocalId(2, 0);
         kernelState.setLocalBarrier(new CyclicBarrier(1));

         for (int passId = 0; passId < _passes; passId++) {
            kernelState.setPassId(passId);

            if (_range.getDims() == 1) {
               for (int id = 0; id < _range.getGlobalSize(0); id++) {
                  kernelState.setGlobalId(0, id);
                  kernelClone.run();
               }
            } else if (_range.getDims() == 2) {
               for (int x = 0; x < _range.getGlobalSize(0); x++) {
                  kernelState.setGlobalId(0, x);

                  for (int y = 0; y < _range.getGlobalSize(1); y++) {
                     kernelState.setGlobalId(1, y);
                     kernelClone.run();
                  }
               }
            } else if (_range.getDims() == 3) {
               for (int x = 0; x < _range.getGlobalSize(0); x++) {
                  kernelState.setGlobalId(0, x);

                  for (int y = 0; y < _range.getGlobalSize(1); y++) {
                     kernelState.setGlobalId(1, y);

                     for (int z = 0; z < _range.getGlobalSize(2); z++) {
                        kernelState.setGlobalId(2, z);
                        kernelClone.run();
                     }

                     kernelClone.run();
                  }
               }
            }
         }
      } else {
         final int threads = _range.getLocalSize(0) * _range.getLocalSize(1) * _range.getLocalSize(2);
         final int globalGroups = _range.getNumGroups(0) * _range.getNumGroups(1) * _range.getNumGroups(2);
         /**
          * This joinBarrier is the barrier that we provide for the kernel threads to rendezvous with the current dispatch thread.
          * So this barrier is threadCount+1 wide (the +1 is for the dispatch thread)
          */
         final CyclicBarrier joinBarrier = new CyclicBarrier(threads + 1);

         /**
          * This localBarrier is only ever used by the kernels.  If the kernel does not use the barrier the threads 
          * can get out of sync, we promised nothing in JTP mode.
          *
          * As with OpenCL all threads within a group must wait at the barrier or none.  It is a user error (possible deadlock!)
          * if the barrier is in a conditional that is only executed by some of the threads within a group.
          *
          * Kernel developer must understand this.
          *
          * This barrier is threadCount wide.  We never hit the barrier from the dispatch thread.
          */
         final CyclicBarrier localBarrier = new CyclicBarrier(threads);

         for (int passId = 0; passId < _passes; passId++) {
            /**
              * Note that we emulate OpenCL by creating one thread per localId (across the group).
              *
              * So threadCount == range.getLocalSize(0)*range.getLocalSize(1)*range.getLocalSize(2);
              *
              * For a 1D range of 12 groups of 4 we create 4 threads. One per localId(0).
              *
              * We also clone the kernel 4 times. One per thread.
              *
              * We create local barrier which has a width of 4
              *
              *    Thread-0 handles localId(0) (global 0,4,8)
              *    Thread-1 handles localId(1) (global 1,5,9)
              *    Thread-2 handles localId(2) (global 2,6,10)
              *    Thread-3 handles localId(3) (global 3,7,11)
              *
              * This allows all threads to synchronize using the local barrier.
              *
              * Initially the use of local buffers seems broken as the buffers appears to be per Kernel.
              * Thankfully Kernel.clone() performs a shallow clone of all buffers (local and global)
              * So each of the cloned kernels actually still reference the same underlying local/global buffers.
              *
              * If the kernel uses local buffers but does not use barriers then it is possible for different groups
              * to see mutations from each other (unlike OpenCL), however if the kernel does not use barriers then it
              * cannot assume any coherence in OpenCL mode either (the failure mode will be different but still wrong)
              *
              * So even JTP mode use of local buffers will need to use barriers. Not for the same reason as OpenCL but to keep groups in lockstep.
              *
              **/
            for (int id = 0; id < threads; id++) {
               final int threadId = id;

               /**
                *  We clone one kernel for each thread.
                *
                *  They will all share references to the same range, localBarrier and global/local buffers because the clone is shallow.
                *  We need clones so that each thread can assign 'state' (localId/globalId/groupId) without worrying 
                *  about other threads.   
                */
               final Kernel kernelClone = kernel.clone();
               final KernelState kernelState = kernelClone.getKernelState();

               kernelState.setRange(_range);
               kernelState.setLocalBarrier(localBarrier);
               kernelState.setPassId(passId);

               threadPool.submit(new Runnable(){
                  @Override public void run() {
                     for (int globalGroupId = 0; globalGroupId < globalGroups; globalGroupId++) {

                        if (_range.getDims() == 1) {
                           kernelState.setLocalId(0, (threadId % _range.getLocalSize(0)));
                           kernelState.setGlobalId(0, (threadId + (globalGroupId * threads)));
                           kernelState.setGroupId(0, globalGroupId);
                        } else if (_range.getDims() == 2) {

                           /**
                            * Consider a 12x4 grid of 4*2 local groups
                            * <pre>
                            *                                             threads = 4*2 = 8
                            *                                             localWidth=4
                            *                                             localHeight=2
                            *                                             globalWidth=12
                            *                                             globalHeight=4
                            *
                            *    00 01 02 03 | 04 05 06 07 | 08 09 10 11  
                            *    12 13 14 15 | 16 17 18 19 | 20 21 22 23
                            *    ------------+-------------+------------
                            *    24 25 26 27 | 28 29 30 31 | 32 33 34 35
                            *    36 37 38 39 | 40 41 42 43 | 44 45 46 47  
                            *
                            *    00 01 02 03 | 00 01 02 03 | 00 01 02 03  threadIds : [0..7]*6
                            *    04 05 06 07 | 04 05 06 07 | 04 05 06 07
                            *    ------------+-------------+------------
                            *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
                            *    04 05 06 07 | 04 05 06 07 | 04 05 06 07  
                            *
                            *    00 00 00 00 | 01 01 01 01 | 02 02 02 02  groupId[0] : 0..2
                            *    00 00 00 00 | 01 01 01 01 | 02 02 02 02   
                            *    ------------+-------------+------------
                            *    00 00 00 00 | 01 01 01 01 | 02 02 02 02  
                            *    00 00 00 00 | 01 01 01 01 | 02 02 02 02
                            *
                            *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  groupId[1] : 0..1
                            *    00 00 00 00 | 00 00 00 00 | 00 00 00 00   
                            *    ------------+-------------+------------
                            *    01 01 01 01 | 01 01 01 01 | 01 01 01 01 
                            *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
                            *
                            *    00 01 02 03 | 08 09 10 11 | 16 17 18 19  globalThreadIds == threadId + groupId * threads;
                            *    04 05 06 07 | 12 13 14 15 | 20 21 22 23
                            *    ------------+-------------+------------
                            *    24 25 26 27 | 32[33]34 35 | 40 41 42 43
                            *    28 29 30 31 | 36 37 38 39 | 44 45 46 47   
                            *
                            *    00 01 02 03 | 00 01 02 03 | 00 01 02 03  localX = threadId % localWidth; (for globalThreadId 33 = threadId = 01 : 01%4 =1)
                            *    00 01 02 03 | 00 01 02 03 | 00 01 02 03   
                            *    ------------+-------------+------------
                            *    00 01 02 03 | 00[01]02 03 | 00 01 02 03 
                            *    00 01 02 03 | 00 01 02 03 | 00 01 02 03
                            *
                            *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  localY = threadId /localWidth  (for globalThreadId 33 = threadId = 01 : 01/4 =0)
                            *    01 01 01 01 | 01 01 01 01 | 01 01 01 01   
                            *    ------------+-------------+------------
                            *    00 00 00 00 | 00[00]00 00 | 00 00 00 00 
                            *    01 01 01 01 | 01 01 01 01 | 01 01 01 01
                            *
                            *    00 01 02 03 | 04 05 06 07 | 08 09 10 11  globalX=
                            *    00 01 02 03 | 04 05 06 07 | 08 09 10 11     groupsPerLineWidth=globalWidth/localWidth (=12/4 =3)
                            *    ------------+-------------+------------     groupInset =groupId%groupsPerLineWidth (=4%3 = 1)
                            *    00 01 02 03 | 04[05]06 07 | 08 09 10 11 
                            *    00 01 02 03 | 04 05 06 07 | 08 09 10 11     globalX = groupInset*localWidth+localX (= 1*4+1 = 5)
                            *
                            *    00 00 00 00 | 00 00 00 00 | 00 00 00 00  globalY
                            *    01 01 01 01 | 01 01 01 01 | 01 01 01 01      
                            *    ------------+-------------+------------
                            *    02 02 02 02 | 02[02]02 02 | 02 02 02 02 
                            *    03 03 03 03 | 03 03 03 03 | 03 03 03 03
                            *
                            * </pre>
                            * Assume we are trying to locate the id's for #33 
                            *
                            */

                           kernelState.setLocalId(0, (threadId % _range.getLocalSize(0))); // threadId % localWidth =  (for 33 = 1 % 4 = 1)
                           kernelState.setLocalId(1, (threadId / _range.getLocalSize(0))); // threadId / localWidth = (for 33 = 1 / 4 == 0)

                           final int groupInset = globalGroupId % _range.getNumGroups(0); // 4%3 = 1
                           kernelState.setGlobalId(0, ((groupInset * _range.getLocalSize(0)) + kernelState.getLocalIds()[0])); // 1*4+1=5

                           final int completeLines = (globalGroupId / _range.getNumGroups(0)) * _range.getLocalSize(1);// (4/3) * 2
                           kernelState.setGlobalId(1, (completeLines + kernelState.getLocalIds()[1])); // 2+0 = 2
                           kernelState.setGroupId(0, (globalGroupId % _range.getNumGroups(0)));
                           kernelState.setGroupId(1, (globalGroupId / _range.getNumGroups(0)));
                        } else if (_range.getDims() == 3) {

                           //Same as 2D actually turns out that localId[0] is identical for all three dims so could be hoisted out of conditional code

                           kernelState.setLocalId(0, (threadId % _range.getLocalSize(0)));

                           kernelState.setLocalId(1, ((threadId / _range.getLocalSize(0)) % _range.getLocalSize(1)));

                           // the thread id's span WxHxD so threadId/(WxH) should yield the local depth  
                           kernelState.setLocalId(2, (threadId / (_range.getLocalSize(0) * _range.getLocalSize(1))));

                           kernelState.setGlobalId(
                                 0,
                                 (((globalGroupId % _range.getNumGroups(0)) * _range.getLocalSize(0)) + kernelState.getLocalIds()[0]));

                           kernelState.setGlobalId(
                                 1,
                                 ((((globalGroupId / _range.getNumGroups(0)) * _range.getLocalSize(1)) % _range.getGlobalSize(1)) + kernelState
                                       .getLocalIds()[1]));

                           kernelState.setGlobalId(
                                 2,
                                 (((globalGroupId / (_range.getNumGroups(0) * _range.getNumGroups(1))) * _range.getLocalSize(2)) + kernelState
                                       .getLocalIds()[2]));

                           kernelState.setGroupId(0, (globalGroupId % _range.getNumGroups(0)));
                           kernelState.setGroupId(1, ((globalGroupId / _range.getNumGroups(0)) % _range.getNumGroups(1)));
                           kernelState.setGroupId(2, (globalGroupId / (_range.getNumGroups(0) * _range.getNumGroups(1))));
                        }

                        kernelClone.run();
                     }

                     await(joinBarrier); // This thread will rendezvous with dispatch thread here. This is effectively a join.                  
                  }
               });
            }

            await(joinBarrier); // This dispatch thread waits for all worker threads here. 
         }
      } // execution mode == JTP
   }

   /**
    * Waits on {@code _barrier} and absorbs the two checked exceptions {@link CyclicBarrier#await()} can
    * throw, so callers can use it as a plain statement.
    * <p>
    * If the wait is interrupted the stack trace is printed and the calling thread's interrupt status is
    * set again, so code further up the stack can still notice the interruption. A broken barrier is only
    * reported.
    *
    * @param _barrier the barrier to rendezvous on
    */
   private static void await(CyclicBarrier _barrier) {
      try {
         _barrier.await();
      } catch (final InterruptedException e) {
         e.printStackTrace();
         // await() clears the interrupt status when it throws InterruptedException; put it back
         // instead of silently losing the interruption.
         Thread.currentThread().interrupt();
      } catch (final BrokenBarrierException e) {
         // the barrier was broken (e.g. another party was interrupted); there is nothing left to wait for
         e.printStackTrace();
      }
   }

   // Set to true by prepareOopConversionBuffer(); executeOpenCL() checks it after a run to decide
   // whether restoreObjects() has to copy struct data back into the Java objects.
   private boolean usesOopConversion = false;

   /**
    * Packs the object array held in the kernel field behind {@code arg} into a little-endian byte buffer,
    * one fixed-size struct per element, and publishes that buffer through the {@code arg} fields the JNI
    * layer reads (java array, element count, size in bytes).
    * <p>
    * Each struct is written member by member in the order given by the element {@link ClassModel} and is
    * then padded with {@code (byte) -1} up to {@code getTotalStructSize()}.
    *
    * @param entryPoint used to look up the element {@link ClassModel} the first time this arg is prepared
    * @param kernel the kernel instance whose field is read
    * @param arg the object-array argument to prepare
    * @return {@code true} if a new backing buffer was allocated (no buffer yet, or the array reference
    *         differs from {@code arg.getArray()}), {@code false} if the existing buffer was reused
    * @throws AparapiException if the field cannot be read, no element model is registered for the array
    *         type, an array element is {@code null}, or a struct member has an unsupported type
    */
   private boolean prepareOopConversionBuffer(Entrypoint entryPoint, Kernel kernel, KernelArg arg) throws AparapiException {
      usesOopConversion = true;
      final Class<?> arrayClass = arg.getField().getType();
      boolean didReallocate = false;

      ClassModel c = arg.getObjArrayElementModel();
      if (c == null) {
         // drop the two leading characters and the trailing character of the array class name
         // (for an object array these are the "[L" prefix and the ';' suffix)
         final String tmp = arrayClass.getName().substring(2).replace("/", ".");
         final String arrayClassInDotForm = tmp.substring(0, tmp.length() - 1);

         if (logger.isLoggable(Level.FINE)) {
            logger.fine("looking for type = " + arrayClassInDotForm);
         }

         // get ClassModel of obj array from entrypt.objectArrayFieldsClasses
         c = entryPoint.getObjectArrayFieldsClasses().get(arrayClassInDotForm);
         arg.setObjArrayElementModel(c);
      }
      if (c == null) {
         // Previously only an assert: with assertions disabled this surfaced as a NullPointerException below.
         throw new AparapiException("should find class for elements " + arrayClass.getName());
      }

      final int arrayBaseOffset = UnsafeWrapper.arrayBaseOffset(arrayClass);
      final int arrayScale = UnsafeWrapper.arrayIndexScale(arrayClass);

      if (logger.isLoggable(Level.FINEST)) {
         logger.finest("Syncing obj array type = " + arrayClass + " cvtd= " + c.getClassWeAreModelling().getName()
               + " arrayBaseOffset=" + arrayBaseOffset + " arrayScale=" + arrayScale);
      }

      int objArraySize = 0;
      Object newRef = null;
      try {
         newRef = arg.getField().get(kernel);
         objArraySize = Array.getLength(newRef);
      } catch (final IllegalAccessException e) {
         throw new AparapiException(e);
      }

      assert (newRef != null) && (objArraySize != 0) : "no data";

      final int totalStructSize = c.getTotalStructSize();
      final int totalBufferSize = objArraySize * totalStructSize;

      // allocate ByteBuffer if first time or array changed
      if ((arg.getObjArrayBuffer() == null) || (newRef != arg.getArray())) {
         final ByteBuffer structBuffer = ByteBuffer.allocate(totalBufferSize);
         arg.setObjArrayByteBuffer(structBuffer.order(ByteOrder.LITTLE_ENDIAN));
         arg.setObjArrayBuffer(arg.getObjArrayByteBuffer().array());
         didReallocate = true;
         if (logger.isLoggable(Level.FINEST)) {
            logger.finest("objArraySize = " + objArraySize + " totalStructSize= " + totalStructSize + " totalBufferSize="
                  + totalBufferSize);
         }
      } else {
         arg.getObjArrayByteBuffer().clear();
      }

      // one lookup instead of one per put()
      final ByteBuffer buffer = arg.getObjArrayByteBuffer();

      // copy the fields that the JNI uses
      arg.setJavaArray(arg.getObjArrayBuffer());
      arg.setNumElements(objArraySize);
      arg.setSizeInBytes(totalBufferSize);

      for (int j = 0; j < objArraySize; j++) {
         int sizeWritten = 0;

         final Object object = UnsafeWrapper.getObject(newRef, arrayBaseOffset + (arrayScale * j));
         if (object == null) {
            // NOTE(review): UnsafeWrapper presumably forwards to sun.misc.Unsafe, where a null base object
            // makes the offset an absolute address - confirm. Either way a null element has no fields to copy.
            throw new AparapiException("null element at index " + j + " of object array " + arg.getName());
         }

         for (int i = 0; i < c.getStructMemberTypes().size(); i++) {
            final TypeSpec t = c.getStructMemberTypes().get(i);
            final long offset = c.getStructMemberOffsets().get(i);

            if (logger.isLoggable(Level.FINEST)) {
               logger.finest("name = " + c.getStructMembers().get(i).getNameAndTypeEntry().getNameUTF8Entry().getUTF8() + " t= "
                     + t);
            }

            switch (t) {
               case I: {
                  buffer.putInt(UnsafeWrapper.getInt(object, offset));
                  sizeWritten += t.getSize();
                  break;
               }
               case F: {
                  buffer.putFloat(UnsafeWrapper.getFloat(object, offset));
                  sizeWritten += t.getSize();
                  break;
               }
               case J: {
                  buffer.putLong(UnsafeWrapper.getLong(object, offset));
                  sizeWritten += t.getSize();
                  break;
               }
               case Z: {
                  final boolean x = UnsafeWrapper.getBoolean(object, offset);
                  buffer.put(x ? (byte) 1 : (byte) 0);
                  // Booleans converted to 1 byte C chars for opencl
                  sizeWritten += TypeSpec.B.getSize();
                  break;
               }
               case B: {
                  buffer.put(UnsafeWrapper.getByte(object, offset));
                  sizeWritten += t.getSize();
                  break;
               }
               case D: {
                  throw new AparapiException("Double not implemented yet");
               }
               default:
                  assert false : "typespec did not match anything";
                  throw new AparapiException("Unhandled type in buffer conversion");
            }
         }

         // add padding here if needed
         if (logger.isLoggable(Level.FINEST)) {
            logger.finest("sizeWritten = " + sizeWritten + " totalStructSize= " + totalStructSize);
         }

         assert sizeWritten <= totalStructSize : "wrote too much into buffer";

         while (sizeWritten < totalStructSize) {
            if (logger.isLoggable(Level.FINEST)) {
               logger.finest(arg.getName() + " struct pad byte = " + sizeWritten + " totalStructSize= " + totalStructSize);
            }
            buffer.put((byte) -1);
            sizeWritten++;
         }
      }

      assert buffer.arrayOffset() == 0 : "should be zero";

      return didReallocate;
   }

   /**
    * Copies struct data from {@code arg}'s conversion buffer back into the fields of the Java objects
    * stored in {@code arg.getArray()}; the inverse of the packing done by
    * {@code prepareOopConversionBuffer}.
    * <p>
    * The buffer is rewound and then read sequentially: for every array element each struct member is
    * read in model order and stored into the object, after which the remaining pad bytes of the struct
    * are skipped.
    *
    * @param kernel the kernel that owns the field; not read here, the array is taken from {@code arg}
    * @param arg the object-array argument whose buffer is unpacked
    * @throws AparapiException if an array element is {@code null} or a struct member has an unsupported type
    */
   private void extractOopConversionBuffer(Kernel kernel, KernelArg arg) throws AparapiException {
      final Class<?> arrayClass = arg.getField().getType();
      final ClassModel c = arg.getObjArrayElementModel();
      assert c != null : "should find class for elements: " + arrayClass.getName();
      final Object array = arg.getArray();
      assert array != null : "array is null";

      final int arrayBaseOffset = UnsafeWrapper.arrayBaseOffset(arrayClass);
      final int arrayScale = UnsafeWrapper.arrayIndexScale(arrayClass);
      final ByteBuffer buffer = arg.getObjArrayByteBuffer();
      if (logger.isLoggable(Level.FINEST)) {
         logger.finest("Syncing field:" + arg.getName() + ", bb=" + buffer + ", type = " + arrayClass);
      }

      // Bound the loop by the array whose elements are written below. Re-reading the kernel field
      // instead could yield a different array (and length) than the one being indexed.
      final int objArraySize = Array.getLength(array);

      assert objArraySize > 0 : "should be > 0";

      final int totalStructSize = c.getTotalStructSize();

      buffer.rewind();

      for (int j = 0; j < objArraySize; j++) {
         int sizeWritten = 0; // bytes of the current struct consumed so far
         final Object object = UnsafeWrapper.getObject(array, arrayBaseOffset + (arrayScale * j));
         if (object == null) {
            // NOTE(review): UnsafeWrapper presumably forwards to sun.misc.Unsafe, where a null base object
            // makes the offset an absolute address - confirm. Either way there is no object to store into.
            throw new AparapiException("null element at index " + j + " of object array " + arg.getName());
         }

         for (int i = 0; i < c.getStructMemberTypes().size(); i++) {
            final TypeSpec t = c.getStructMemberTypes().get(i);
            final long offset = c.getStructMemberOffsets().get(i);
            switch (t) {
               case I: {
                  // read int value from buffer and store into obj in the array
                  final int x = buffer.getInt();
                  if (logger.isLoggable(Level.FINEST)) {
                     logger.finest("fType = " + t.getShortName() + " x= " + x);
                  }
                  UnsafeWrapper.putInt(object, offset, x);
                  sizeWritten += t.getSize();
                  break;
               }
               case F: {
                  final float x = buffer.getFloat();
                  if (logger.isLoggable(Level.FINEST)) {
                     logger.finest("fType = " + t.getShortName() + " x= " + x);
                  }
                  UnsafeWrapper.putFloat(object, offset, x);
                  sizeWritten += t.getSize();
                  break;
               }
               case J: {
                  final long x = buffer.getLong();
                  if (logger.isLoggable(Level.FINEST)) {
                     logger.finest("fType = " + t.getShortName() + " x= " + x);
                  }
                  UnsafeWrapper.putLong(object, offset, x);
                  sizeWritten += t.getSize();
                  break;
               }
               case Z: {
                  final byte x = buffer.get();
                  if (logger.isLoggable(Level.FINEST)) {
                     logger.finest("fType = " + t.getShortName() + " x= " + x);
                  }
                  UnsafeWrapper.putBoolean(object, offset, x == 1);
                  // Booleans converted to 1 byte C chars for open cl
                  sizeWritten += TypeSpec.B.getSize();
                  break;
               }
               case B: {
                  final byte x = buffer.get();
                  if (logger.isLoggable(Level.FINEST)) {
                     logger.finest("fType = " + t.getShortName() + " x= " + x);
                  }
                  UnsafeWrapper.putByte(object, offset, x);
                  sizeWritten += t.getSize();
                  break;
               }
               case D: {
                  throw new AparapiException("Double not implemented yet");
               }
               default:
                  assert false : "typespec did not match anything";
                  throw new AparapiException("Unhandled type in buffer conversion");
            }
         }

         // add padding here if needed
         if (logger.isLoggable(Level.FINEST)) {
            logger.finest("sizeWritten = " + sizeWritten + " totalStructSize= " + totalStructSize);
         }

         assert sizeWritten <= totalStructSize : "wrote too much into buffer";

         while (sizeWritten < totalStructSize) {
            // skip over pad bytes
            buffer.get();
            sizeWritten++;
         }
      }
   }

   /**
    * Unpacks the conversion buffer of every argument in {@code kernelMapping} whose type has the
    * {@code ARG_OBJ_ARRAY_STRUCT} bit set; all other arguments are left alone.
    *
    * @param kernelMapping mapping whose kernel args are scanned
    * @param kernel kernel instance handed on to {@code extractOopConversionBuffer}
    * @throws AparapiException propagated from {@code extractOopConversionBuffer}
    */
   private void restoreObjects(KernelMapping kernelMapping, Kernel kernel) throws AparapiException {
      for (final KernelArg candidate : kernelMapping.kernelArgs) {
         final boolean isObjectArrayStruct = (candidate.getType() & ARG_OBJ_ARRAY_STRUCT) != 0;
         if (!isObjectArrayStruct) {
            continue;
         }
         extractOopConversionBuffer(kernel, candidate);
      }
   }

   /**
    * Re-reads every array-typed kernel field and refreshes the matching {@link KernelArg}.
    * <p>
    * Object-struct arrays are handed to {@code prepareOopConversionBuffer}; for all other arrays the JNI
    * fields (java array, element count, byte size) are set directly and, for explicit args whose array is
    * in {@code puts}, the {@code ARG_EXPLICIT_WRITE} bit is set and the array is removed from {@code puts}.
    * Reflection failures are logged at SEVERE and the loop moves on to the next argument.
    *
    * @param kernelMapping mapping that supplies the kernel args and the entry point
    * @param kernel kernel instance whose fields are read
    * @return {@code true} if at least one array reference differs from the one stored in its arg
    * @throws AparapiException propagated from {@code prepareOopConversionBuffer}
    * @throws IllegalStateException if an array field is {@code null}
    */
   private boolean updateKernelArrayRefs(KernelMapping kernelMapping, Kernel kernel) throws AparapiException {
      boolean anyRefChanged = false;

      for (final KernelArg arg : kernelMapping.kernelArgs) {
         try {
            if ((arg.getType() & ARG_ARRAY) == 0) {
               continue; // only array arguments carry a reference to refresh
            }

            final Object currentRef = arg.getField().get(kernel);
            if (currentRef == null) {
               throw new IllegalStateException("Cannot send null refs to kernel, reverting to java");
            }

            final boolean isObjectStructArray = (arg.getType() & ARG_OBJ_ARRAY_STRUCT) != 0;
            if (isObjectStructArray) {
               prepareOopConversionBuffer(kernelMapping.entryPoint, kernel, arg);
            } else {
               // set up JNI fields for normal arrays
               arg.setJavaArray(currentRef);
               arg.setNumElements(Array.getLength(currentRef));
               arg.setSizeInBytes(arg.getNumElements() * arg.getPrimitiveSize());

               final boolean explicitlyManaged = (arg.getType() & ARG_EXPLICIT) != 0;
               if (explicitlyManaged && puts.contains(currentRef)) {
                  // a pending put() for this array: flag the arg and consume the request
                  arg.setType(arg.getType() | ARG_EXPLICIT_WRITE);
                  puts.remove(currentRef);
               }
            }

            // identity comparison against the reference stored previously
            if (arg.getArray() != currentRef) {
               anyRefChanged = true;

               if (logger.isLoggable(Level.FINE)) {
                  logger.fine("saw newArrayRef for " + arg.getName() + " = " + currentRef + ", newArrayLen = "
                        + Array.getLength(currentRef));
               }
            }

            arg.setArray(currentRef);
            assert arg.getArray() != null : "null array ref";
         } catch (final IllegalArgumentException e) {
            logger.log(Level.SEVERE, "IllegalArgumentException during update kernel refs", e);
         } catch (final IllegalAccessException e) {
            logger.log(Level.SEVERE, "IllegalAccessException during update kernel refs", e);
         }
      }

      return anyRefChanged;
   }

   /**
    * Performs one OpenCL run of {@code kernel} using an already initialised {@code kernelMapping}.
    * <p>
    * Steps: push the kernel instance to the native side if it is not the mapping's last kernel, refresh
    * the array references, launch via {@code runKernelJNI}, and - if object-array conversion is in use -
    * copy struct data back into the Java objects. If the launch reports a non-zero status the runner
    * switches to the fallback execution mode and re-enters {@code execute}.
    *
    * @param kernel the kernel instance to run
    * @param kernelMapping mapping holding the native kernel context and the kernel args
    * @param _range the range to execute over
    * @param _passes number of passes handed to the native launch
    * @return this runner, or the result of the fallback {@code execute} call
    * @throws AparapiException propagated from refreshing or restoring the arguments
    */
   private KernelRunner executeOpenCL(final Kernel kernel, KernelMapping kernelMapping,
                                      final Range _range, final int _passes) throws AparapiException {
      // NOTE: checks of _range against the device limits (max work-item dimensions, max work-group size,
      // per-dimension max work-item size) and FINE logging of those limits used to sit here; they are
      // currently disabled.

      // Identity comparison on purpose, no equals(): any other instance triggers updateKernelJNI.
      final boolean sameKernelInstance = (kernelMapping.getLastKernel() == kernel);
      if (!sameKernelInstance) {
         updateKernelJNI(kernelMapping.kernelContextHandle, kernel);
      }

      // Read the array refs after kernel may have changed them
      // We need to do this as input to computing the localSize
      final boolean arraysNeedResync = updateKernelArrayRefs(kernelMapping, kernel);
      if (arraysNeedResync && logger.isLoggable(Level.FINE)) {
         logger.fine("Need to resync arrays on " + kernel.getClass().getName());
      }

      // native side will reallocate array buffers if necessary
      final boolean launchFailed =
            runKernelJNI(kernelRunnerContextHandle, kernelMapping.kernelContextHandle, _range, arraysNeedResync, _passes) != 0;
      if (launchFailed) {
         logger.warning("### CL exec seems to have failed. Trying to revert to Java ###");
         setFallbackExecutionMode();
         return execute(kernel, _range, _passes);
      }

      if (usesOopConversion) {
         restoreObjects(kernelMapping, kernel);
      }

      if (logger.isLoggable(Level.FINE)) {
         logger.fine("executeOpenCL completed. " + _range);
      }

      return this;
   }

   /**
    * Moves the runner to another execution mode and runs the kernel again: the next mode if
    * {@code hasNextExecutionMode()} reports one, otherwise the fallback mode.
    *
    * @param kernel the kernel to execute
    * @param _range the range to execute over
    * @param _passes number of passes
    * @return the result of the re-entrant {@code execute} call
    */
   private synchronized KernelRunner fallBackAndExecute(Kernel kernel, final Range _range, final int _passes) {
      if (!hasNextExecutionMode()) {
         setFallbackExecutionMode();
      } else {
         tryNextExecutionMode();
      }

      return execute(kernel, _range, _passes);
   }

   /**
    * Logs a warning that names the kernel class and the exception that made the current execution
    * mode unusable, then delegates to {@code fallBackAndExecute}.
    * <p>
    * The exception is attached to the log record so its stack trace goes through the logger's handlers
    * rather than being printed straight to standard error.
    *
    * @param kernel the kernel to execute
    * @param _range the range to execute over
    * @param _passes number of passes
    * @param _exception the failure that triggered the fallback
    * @return the result of {@code fallBackAndExecute}
    */
   synchronized private KernelRunner warnFallBackAndExecute(Kernel kernel, final Range _range, final int _passes,
         Exception _exception) {
      if (logger.isLoggable(Level.WARNING)) {
         logger.log(Level.WARNING,
               "Reverting to Java Thread Pool (JTP) for " + kernel.getClass() + ": " + _exception.getMessage(), _exception);
      }
      return fallBackAndExecute(kernel, _range, _passes);
   }

   /**
    * Logs a warning that names the kernel class and the given reason, then delegates to
    * {@code fallBackAndExecute}.
    *
    * @param kernel the kernel to execute
    * @param _range the range to execute over
    * @param _passes number of passes
    * @param _excuse human-readable reason for the fallback
    * @return the result of {@code fallBackAndExecute}
    */
   private synchronized KernelRunner warnFallBackAndExecute(Kernel kernel, final Range _range, final int _passes, String _excuse) {
      final String warning = "Reverting to Java Thread Pool (JTP) for " + kernel.getClass() + ": " + _excuse;
      logger.warning(warning);
      return fallBackAndExecute(kernel, _range, _passes);
   }

   /**
    * Convenience overload: executes {@code kernel} over {@code globalSize} with a single pass.
    *
    * @param kernel the kernel to execute
    * @param globalSize global size forwarded to {@code execute(Kernel, int, int)}
    * @return the result of the delegated call
    */
   public synchronized KernelRunner execute(Kernel kernel, int globalSize) {
      final int singlePass = 1;
      return execute(kernel, globalSize, singlePass);
   }

   /**
    * Convenience overload: executes {@code kernel} over {@code range} with a single pass.
    *
    * @param kernel the kernel to execute
    * @param range the range forwarded to {@code execute(Kernel, Range, int)}
    * @return the result of the delegated call
    */
   public synchronized KernelRunner execute(Kernel kernel, Range range) {
      final int singlePass = 1;
      return execute(kernel, range, singlePass);
   }

   /**
    * Convenience overload: builds a {@link Range} from {@code Device.best()} and {@code globalSize}
    * and executes {@code kernel} over it.
    *
    * @param kernel the kernel to execute
    * @param globalSize global size handed to {@code Range.create}
    * @param passes number of passes
    * @return the result of the delegated call
    */
   public synchronized KernelRunner execute(Kernel kernel, int globalSize, int passes) {
      final Range range = Range.create(Device.best(), globalSize);
      return execute(kernel, range, passes);
   }

   /**
    * Creates the native runner context through {@code initKernelRunnerJNI} unless
    * {@code kernelRunnerContextHandle} is already non-zero, in which case nothing happens.
    *
    * @param device device passed to the native initialiser
    * @param flags JNI flags passed to the native initialiser
    */
   private void initKernelRunnerContextHandle(OpenCLDevice device, int flags) {
      if (kernelRunnerContextHandle == 0) {
         kernelRunnerContextHandle = initKernelRunnerJNI(device, flags);
      }
   }

   /**
    * Executes {@code kernel} over {@code _range} for {@code _passes} passes.
    * <p>
    * When the current execution mode is an OpenCL mode and the range carries either no device or an
    * {@link OpenCLDevice}, the kernel is run through OpenCL. The first time a kernel class is seen this
    * method builds its {@link KernelMapping}: it parses the class, picks a device, initialises the
    * native contexts, checks the required extensions, generates and compiles the OpenCL source and
    * registers the kernel arguments. Later calls for the same class go straight to
    * {@code executeOpenCL}. Any failure along the way is routed to one of the fallback helpers, which
    * re-enter this method under a different execution mode. In every other case the kernel is run via
    * {@code executeJava}.
    * <p>
    * On the paths that reach the end of the method, {@code executionTime} and
    * {@code accumulatedExecutionTime} are updated; the early {@code return} paths leave that to the
    * re-entrant call.
    *
    * @param kernel the kernel to execute
    * @param _range the range to execute over; must not be {@code null}
    * @param _passes number of passes
    * @return this runner, or the result of the re-entrant call made by a fallback helper
    * @throws IllegalStateException if {@code _range} is {@code null}
    * @throws IllegalArgumentException if the range names an OpenCL device different from the one this
    *         runner used before
    */
   public synchronized KernelRunner execute(Kernel kernel, final Range _range, final int _passes) {


      long executeStartTime = System.currentTimeMillis();

      if (_range == null) {
         throw new IllegalStateException("range can't be null");
      }

      /* for backward compatibility reasons we still honor execution mode */
      if (getExecutionMode().isOpenCL()) {

         // mappings are cached per kernel class
         KernelMapping currentKernelMapping = kernelMappingMap.get(kernel.getClass());

         // See if user supplied a Device
         Device device = _range.getDevice();

         if ((device == null) || (device instanceof OpenCLDevice)) {
            if (currentKernelMapping == null) {
               // first execution of this kernel class: build and register the mapping
               try {
                  final ClassModel classModel = new ClassModel(kernel.getClass());
                  Entrypoint entryPoint = classModel.getEntrypoint(kernel);
                  currentKernelMapping = new KernelMapping(kernel.getClass(), entryPoint, kernel);
                  // NOTE(review): the mapping is registered here, before the native setup below has
                  // succeeded. If a later step falls back while the runner stays in an OpenCL mode, the
                  // re-entrant call appears to take the "already mapped" branch with a partially
                  // initialised mapping - confirm.
                  kernelMappingMap.put(kernel.getClass(), currentKernelMapping);
               } catch (final Exception exception) {
                  return warnFallBackAndExecute(kernel, _range, _passes, exception);
               }

               Entrypoint entryPoint = currentKernelMapping.entryPoint;
               if ((entryPoint != null) && !entryPoint.shouldFallback()) {
                  synchronized (Kernel.class) { // This seems to be needed because of a race condition uncovered with issue #68 http://code.google.com/p/aparapi/issues/detail?id=68

                     OpenCLDevice openCLDevice = (OpenCLDevice) device; // still might be null! 
                     if (openCLDevice == null && lastGPUExecutionDevice != null) {
                        // no device requested: reuse the one from the previous execution
                        openCLDevice = lastGPUExecutionDevice;
                     } else if (openCLDevice != null && lastGPUExecutionDevice != null && ! openCLDevice.equals(lastGPUExecutionDevice)) {
                        // a runner is tied to the first device it executed on
                        logger.severe("expected execution device: " + lastGPUExecutionDevice.toString());
                        logger.severe("current execution device: " + openCLDevice.toString());
                        throw new IllegalArgumentException("GPU device can only be set once! Please always " +
                            "use the same device!");
                     }

                     int jniFlags = 0;
                     if (openCLDevice == null) {
                        // neither a requested nor a previous device: pick one from the execution mode
                        if (getExecutionMode().equals(EXECUTION_MODE.GPU)) {
                           // We used to treat as before by getting first GPU device
                           // now we get the best GPU
                           openCLDevice = (OpenCLDevice) OpenCLDevice.best();
                           jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. 
                        } else {
                           // We fetch the first CPU device 
                           openCLDevice = (OpenCLDevice) OpenCLDevice.firstCPU();
                           if (openCLDevice == null) {
                              return warnFallBackAndExecute(kernel, _range, _passes,
                                    "CPU request can't be honored not CPU device");
                           }
                        }
                     } else {
                        if (openCLDevice.getType() == Device.TYPE.GPU) {
                           jniFlags |= JNI_FLAG_USE_GPU; // this flag might be redundant now. 
                        }
                     }

                     // no-op when the runner context already exists
                     initKernelRunnerContextHandle(openCLDevice, jniFlags);
                     lastGPUExecutionDevice = openCLDevice;

                     //  jniFlags |= (Config.enableProfiling ? JNI_FLAG_ENABLE_PROFILING : 0);
                     //  jniFlags |= (Config.enableProfilingCSV ? JNI_FLAG_ENABLE_PROFILING_CSV | JNI_FLAG_ENABLE_PROFILING : 0);
                     //  jniFlags |= (Config.enableVerboseJNI ? JNI_FLAG_ENABLE_VERBOSE_JNI : 0);
                     // jniFlags |= (Config.enableVerboseJNIOpenCLResourceTracking ? JNI_FLAG_ENABLE_VERBOSE_JNI_OPENCL_RESOURCE_TRACKING :0);
                     // jniFlags |= (kernel.getExecutionMode().equals(EXECUTION_MODE.GPU) ? JNI_FLAG_USE_GPU : 0);
                     // Init the device to check capabilities before emitting the
                     // code that requires the capabilities.

                     // synchronized(Kernel.class){
                     currentKernelMapping.kernelContextHandle = initKernelJNI(kernelRunnerContextHandle, kernel);
                  } // end of synchronized! issue 68

                  // a zero handle means the native kernel context could not be created
                  if (currentKernelMapping.kernelContextHandle == 0) {
                     return warnFallBackAndExecute(kernel, _range, _passes, "initJNI failed to return a valid handle");
                  }

                  // the extension string is split on whitespace into the capability set
                  final String extensions = getExtensionsJNI(kernelRunnerContextHandle);
                  capabilitiesSet = new HashSet<String>();

                  final StringTokenizer strTok = new StringTokenizer(extensions);
                  while (strTok.hasMoreTokens()) {
                     capabilitiesSet.add(strTok.nextToken());
                  }

                  if (logger.isLoggable(Level.FINE)) {
                     logger.fine("Capabilities initialized to :" + capabilitiesSet.toString());
                  }

                  // fall back if the entry point needs a capability that is not available
                  if (entryPoint.requiresDoublePragma() && !hasFP64Support()) {
                     return warnFallBackAndExecute(kernel, _range, _passes, "FP64 required but not supported");
                  }

                  if (entryPoint.requiresByteAddressableStorePragma() && !hasByteAddressableStoreSupport()) {
                     return warnFallBackAndExecute(kernel, _range, _passes,
                           "Byte addressable stores required but not supported");
                  }

                  final boolean all32AtomicsAvailable = hasGlobalInt32BaseAtomicsSupport()
                        && hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport()
                        && hasLocalInt32ExtendedAtomicsSupport();

                  if (entryPoint.requiresAtomic32Pragma() && !all32AtomicsAvailable) {

                     return warnFallBackAndExecute(kernel, _range, _passes, "32 bit Atomics required but not supported");
                  }

                  // generate the OpenCL source for the entry point
                  String openCL;
                  try {
                     openCL = KernelWriter.writeToString(entryPoint);
                  } catch (final CodeGenException codeGenException) {
                     return warnFallBackAndExecute(kernel, _range, _passes, codeGenException);
                  }

                  if (Config.enableShowGeneratedOpenCL) {
                     System.out.println(openCL);
                  }

                  if (logger.isLoggable(Level.INFO)) {
                     logger.info(openCL);
                  }

                  // Send the string to OpenCL to compile it
                  if (buildProgramJNI(kernelRunnerContextHandle, currentKernelMapping.kernelContextHandle, openCL) == 0) {
                     return warnFallBackAndExecute(kernel, _range, _passes, "OpenCL compile failed");
                  }

                  // a null list means at least one referenced field could not be turned into a KernelArg
                  List<KernelArg> kernelArgs = findOutKernelArgsIn(entryPoint, kernel);
                  if (kernelArgs == null) {
                     return fallBackAndExecute(kernel, _range, _passes);
                  }

                  currentKernelMapping.kernelArgs.addAll(kernelArgs);

                  // at this point, i = the actual used number of arguments
                  // (private buffers do not get treated as arguments)

                  KernelArg[] kernelArgsArray = currentKernelMapping.kernelArgsAsArray();
                  setArgsJNI(kernelRunnerContextHandle, currentKernelMapping.kernelContextHandle,
                        kernelArgsArray, kernelArgsArray.length);

                  // time spent from method entry up to here, i.e. the first-use setup
                  conversionTime = System.currentTimeMillis() - executeStartTime;

                  try {
                     executeOpenCL(kernel, currentKernelMapping, _range, _passes);
                  } catch (final AparapiException e) {
                     // no return here: execution continues with the timing code at the end of the method
                     warnFallBackAndExecute(kernel, _range, _passes, e);
                  }
               } else {
                  warnFallBackAndExecute(kernel, _range, _passes, "failed to locate entrypoint");
               }
            } else {
               // mapping already exists for this kernel class: run directly
               try {
                  executeOpenCL(kernel, currentKernelMapping, _range, _passes);
               } catch (final AparapiException e) {
                  warnFallBackAndExecute(kernel, _range, _passes, e);
               }
            }
         } else {
            warnFallBackAndExecute(kernel, _range, _passes,
                  "OpenCL was requested but Device supplied was not an OpenCLDevice");
         }
      } else {
         executeJava(kernel, _range, _passes);
      }

      if (Config.enableExecutionModeReporting) {
         System.out.println(kernel.getClass().getCanonicalName() + ":" + getExecutionMode());
      }

      executionTime = System.currentTimeMillis() - executeStartTime;
      accumulatedExecutionTime += executionTime;

      return this;
   }

   /**
    * Builds one {@link KernelArg} per field referenced by {@code entryPoint}, making each field
    * accessible first.
    * <p>
    * An {@link IllegalArgumentException} raised while handling a field is logged at SEVERE and that
    * field is left out of the result.
    *
    * @param entryPoint entry point that supplies the referenced fields
    * @param kernel kernel instance passed on to {@code fieldToKernelArg}
    * @return the kernel args in field iteration order, or {@code null} as soon as
    *         {@code fieldToKernelArg} returns {@code null} for a field
    */
   private List<KernelArg> findOutKernelArgsIn(Entrypoint entryPoint, Kernel kernel) {
      final List<KernelArg> collected = new ArrayList<KernelArg>();

      for (final Field referenced : entryPoint.getReferencedFields()) {
         try {
            referenced.setAccessible(true);
            final KernelArg converted = fieldToKernelArg(entryPoint, kernel, referenced);
            if (converted != null) {
               collected.add(converted);
            } else {
               // one unconvertible field invalidates the whole argument list
               return null;
            }
         } catch (final IllegalArgumentException e) {
            logger.log(Level.SEVERE, "IllegalArgumentException encountered during handling of field " + referenced.toString(), e);
         }
      }

      return collected;
   }

   /**
    * Creates a {@link KernelArg} describing the given field.
    * <p>
    * The argument receives the field's name and {@link Field} reference, the static flag when the field is
    * static, the type flags matching the field type (array handling is delegated to
    * {@code handleArrayTypeKernelArg}) and finally the primitive size derived from those flags.
    *
    * @param entryPoint entry point currently being processed
    * @param kernel kernel instance owning the field
    * @param field field to describe
    * @return the populated {@link KernelArg}, or {@code null} if the field is an array that could not be handled
    */
   private KernelArg fieldToKernelArg(Entrypoint entryPoint, Kernel kernel, Field field) {
      final KernelArg arg = new KernelArg();
      arg.setName(field.getName());
      arg.setField(field);
      if (Modifier.isStatic(field.getModifiers())) {
         arg.setType(arg.getType() | ARG_STATIC);
      }

      final Class<?> fieldType = field.getType();
      if (fieldType.isArray()) {
         if (!handleArrayTypeKernelArg(entryPoint, kernel, field, arg, fieldType)) {
            return null;
         }
      } else if (fieldType.isAssignableFrom(float.class)) {
         arg.setType(arg.getType() | ARG_PRIMITIVE | ARG_FLOAT);
      } else if (fieldType.isAssignableFrom(int.class)) {
         arg.setType(arg.getType() | ARG_PRIMITIVE | ARG_INT);
      } else if (fieldType.isAssignableFrom(double.class)) {
         arg.setType(arg.getType() | ARG_PRIMITIVE | ARG_DOUBLE);
      } else if (fieldType.isAssignableFrom(long.class)) {
         arg.setType(arg.getType() | ARG_PRIMITIVE | ARG_LONG);
      } else if (fieldType.isAssignableFrom(boolean.class)) {
         arg.setType(arg.getType() | ARG_PRIMITIVE | ARG_BOOLEAN);
      } else if (fieldType.isAssignableFrom(byte.class)) {
         arg.setType(arg.getType() | ARG_PRIMITIVE | ARG_BYTE);
      } else if (fieldType.isAssignableFrom(char.class)) {
         arg.setType(arg.getType() | ARG_PRIMITIVE | ARG_CHAR);
      } else if (fieldType.isAssignableFrom(short.class)) {
         arg.setType(arg.getType() | ARG_PRIMITIVE | ARG_SHORT);
      }

      // size is looked up from the element-type flag set above (0 when none was set)
      arg.setPrimitiveSize(getPrimitiveSize(arg.getType()));

      if (logger.isLoggable(Level.FINE)) {
         final String typeHex = Integer.toHexString(arg.getType());
         logger.fine("arg " + arg.getName() + ", type=" + typeHex + ", primitiveSize=" + arg.getPrimitiveSize());
      }
      return arg;
   }

   /**
    * Fills the given {@link KernelArg} with the type flags describing the array held in the given field.
    * <p>
    * The flags are set in this order:
    * <ol>
    * <li>address space: local, constant or global, chosen from the {@link Local}/{@link Constant} annotations
    * or the corresponding field-name suffixes;</li>
    * <li>the explicit flag when this runner is in explicit mode;</li>
    * <li>read/write access, from the entry point's recorded array assignments and accesses;</li>
    * <li>the array kind: object array ({@code "[L"}), multi-dimensional array ({@code "[["}) or
    * one-dimensional primitive array.</li>
    * </ol>
    *
    * @param entryPoint entry point currently handled
    * @param kernel current kernel
    * @param field current kernel field
    * @param kernelArg current kernel arg (receives the flags)
    * @param type field type; expected to be an array type
    * @return true if the array was successfully handled, false if the multi-dimensional array could not be set up
    */
   private boolean handleArrayTypeKernelArg(Entrypoint entryPoint, Kernel kernel, Field field, KernelArg kernelArg, Class<?> type) {
      if (field.getAnnotation(Local.class) != null || kernelArg.getName().endsWith(Local.LOCAL_SUFFIX)) {
         kernelArg.setType(kernelArg.getType() | ARG_LOCAL);
      } else if ((field.getAnnotation(Constant.class) != null)
          || kernelArg.getName().endsWith(Constant.CONSTANT_SUFFIX)) {
         kernelArg.setType(kernelArg.getType() | ARG_CONSTANT);
      } else {
         kernelArg.setType(kernelArg.getType() | ARG_GLOBAL);
      }
      if (isExplicit()) {
         kernelArg.setType(kernelArg.getType() | ARG_EXPLICIT);
      }
      // for now, treat all write arrays as read-write, see bugzilla issue 4859
      // we might come up with a better solution later
      if (entryPoint.getArrayFieldAssignments().contains(field.getName())) {
         kernelArg.setType(kernelArg.getType() | (ARG_WRITE | ARG_READ));
      }
      if (entryPoint.getArrayFieldAccesses().contains(field.getName())) {
         kernelArg.setType(kernelArg.getType() | ARG_READ);
      }

      final String typeName = type.getName();
      if (typeName.startsWith("[L")) {
         // one-dimensional array of objects
         kernelArg.setType(kernelArg.getType()
             | (ARG_OBJ_ARRAY_STRUCT |
             ARG_WRITE |
             ARG_READ |
             ARG_APARAPI_BUFFER));

         if (logger.isLoggable(Level.FINE)) {
            logger.fine("tagging " + kernelArg.getName() + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)");
         }
      } else if (typeName.startsWith("[[")) {
         // array of arrays
         try {
            setMultiArrayType(kernel, kernelArg, type);
         } catch (final AparapiException e) {
            // pass the exception to the logger so the real reason is not lost
            logger.log(Level.SEVERE,
                "failed to set kernel arguement " + kernelArg.getName() + ".  Aparapi only supports 2D and 3D arrays.", e);
            return false;
         }
      } else {
         // one-dimensional primitive array; object arrays ("[L") were handled by the first branch,
         // so no further object-array check is needed here
         kernelArg.setArray(null); // will get updated in updateKernelArrayRefs
         kernelArg.setType(kernelArg.getType() | ARG_ARRAY);

         if (type.isAssignableFrom(float[].class)) {
            kernelArg.setType(kernelArg.getType() | ARG_FLOAT);
         }
         if (type.isAssignableFrom(int[].class)) {
            kernelArg.setType(kernelArg.getType() | ARG_INT);
         }
         if (type.isAssignableFrom(boolean[].class)) {
            kernelArg.setType(kernelArg.getType() | ARG_BOOLEAN);
         }
         if (type.isAssignableFrom(byte[].class)) {
            kernelArg.setType(kernelArg.getType() | ARG_BYTE);
         }
         if (type.isAssignableFrom(char[].class)) {
            kernelArg.setType(kernelArg.getType() | ARG_CHAR);
         }
         if (type.isAssignableFrom(double[].class)) {
            kernelArg.setType(kernelArg.getType() | ARG_DOUBLE);
         }
         if (type.isAssignableFrom(long[].class)) {
            kernelArg.setType(kernelArg.getType() | ARG_LONG);
         }
         if (type.isAssignableFrom(short[].class)) {
            kernelArg.setType(kernelArg.getType() | ARG_SHORT);
         }

         // arrays whose length is used will have an int arg holding
         // the length as a kernel param
         if (entryPoint.getArrayFieldArrayLengthUsed().contains(kernelArg.getName())) {
            kernelArg.setType(kernelArg.getType() | ARG_ARRAYLENGTH);
         }
      }
      return true;
   }


   /**
    * Maps the element-type flag contained in an argument type mask to the element size in bytes.
    * <p>
    * The flags are tested in a fixed order (float, int, byte, char, boolean, short, long, double) and the
    * first one present decides the result.
    *
    * @param type argument type bit mask
    * @return 4 for float/int, 1 for byte/boolean, 2 for char/short, 8 for long/double, 0 if no element-type flag is set
    */
   private int getPrimitiveSize(int type) {
      if ((type & (ARG_FLOAT | ARG_INT)) != 0) {
         return 4;
      }
      if ((type & ARG_BYTE) != 0) {
         return 1;
      }
      if ((type & ARG_CHAR) != 0) {
         return 2;
      }
      if ((type & ARG_BOOLEAN) != 0) {
         return 1;
      }
      if ((type & ARG_SHORT) != 0) {
         return 2;
      }
      if ((type & (ARG_LONG | ARG_DOUBLE)) != 0) {
         return 8;
      }
      return 0;
   }

   /**
    * Configures the given {@link KernelArg} for a multi-dimensional (array of arrays) kernel field.
    * <p>
    * The argument is flagged as a read/write Aparapi buffer carrying its array length. The current value of
    * the field is read from the kernel, the number of dimensions is taken from the leading {@code '['}
    * characters of the type name, and the extent of each dimension is measured by following element 0 of
    * every level. The element-type flag and the total size in bytes are derived from the innermost array.
    *
    * @param kernel kernel instance the field value is read from
    * @param arg kernel argument to configure; its {@link Field} must already be set
    * @param type declared type of the field
    * @throws AparapiException if the array has more than three dimensions, or if the field value cannot be
    *            read reflectively
    */
   private void setMultiArrayType(Kernel kernel, KernelArg arg, Class<?> type) throws AparapiException {
      arg.setType(arg.getType() | (ARG_WRITE | ARG_READ | ARG_APARAPI_BUFFER));
      final String typeName = type.getName();
      if (typeName.startsWith("[[[[")) {
         throw new AparapiException("Aparapi only supports 2D and 3D arrays.");
      }
      arg.setType(arg.getType() | ARG_ARRAYLENGTH);

      // every leading '[' of the JVM type name is one array dimension
      int numDims = 0;
      while (typeName.charAt(numDims) == '[') {
         numDims++;
      }

      final Object buffer;
      try {
         buffer = arg.getField().get(kernel);
      } catch (final IllegalAccessException e) {
         // Report the real problem instead of continuing with a placeholder object,
         // which would only fail later inside Array.getLength with an unrelated error.
         final AparapiException failure = new AparapiException("unable to read array field " + arg.getName() + " from kernel");
         failure.initCause(e);
         throw failure;
      }
      arg.setJavaBuffer(buffer);
      arg.setNumDims(numDims);

      // walk down element 0 of each level to measure every dimension
      Object subBuffer = buffer;
      final int[] dims = new int[numDims];
      for (int i = 0; i < numDims - 1; i++) {
         dims[i] = Array.getLength(subBuffer);
         subBuffer = Array.get(subBuffer, 0);
      }
      dims[numDims - 1] = Array.getLength(subBuffer);
      arg.setDims(dims);

      // subBuffer is now the innermost array; its class decides the element-type flag
      final Class<?> innermost = subBuffer.getClass();
      if (innermost.isAssignableFrom(float[].class)) {
         arg.setType(arg.getType() | ARG_FLOAT);
      }
      if (innermost.isAssignableFrom(int[].class)) {
         arg.setType(arg.getType() | ARG_INT);
      }
      if (innermost.isAssignableFrom(boolean[].class)) {
         arg.setType(arg.getType() | ARG_BOOLEAN);
      }
      if (innermost.isAssignableFrom(byte[].class)) {
         arg.setType(arg.getType() | ARG_BYTE);
      }
      if (innermost.isAssignableFrom(char[].class)) {
         arg.setType(arg.getType() | ARG_CHAR);
      }
      if (innermost.isAssignableFrom(double[].class)) {
         arg.setType(arg.getType() | ARG_DOUBLE);
      }
      if (innermost.isAssignableFrom(long[].class)) {
         arg.setType(arg.getType() | ARG_LONG);
      }
      if (innermost.isAssignableFrom(short[].class)) {
         arg.setType(arg.getType() | ARG_SHORT);
      }

      final int primitiveSize = getPrimitiveSize(arg.getType());
      int totalElements = 1;
      for (int i = 0; i < numDims; i++) {
         totalElements *= dims[i];
      }
      arg.setSizeInBytes(totalElements * primitiveSize);
   }

   /**
    * Returns the profile information recorded for the class of the given kernel.
    *
    * @param kernel kernel whose runtime class is used for the lookup
    * @return the result of {@link #getProfileInfo(Class)} for {@code kernel.getClass()}
    */
   public List<ProfileInfo> getProfileInfo(Kernel kernel) {
      final Class<? extends Kernel> kernelClass = kernel.getClass();
      return getProfileInfo(kernelClass);
   }

   /**
    * Tells whether a kernel mapping is registered for the given kernel class.
    *
    * @param kernelClass kernel class to look up
    * @return true if the kernel mapping map contains an entry for {@code kernelClass}
    */
   public boolean hasProfileInfo(Class<? extends Kernel> kernelClass) {
      final boolean mapped = kernelMappingMap.containsKey(kernelClass);
      return mapped;
   }

   /**
    * Returns the profile information of the kernel mapping registered for the given kernel class.
    *
    * @param kernelClass kernel class to look up
    * @return the profile information reported by the native layer when the execution mode is GPU or CPU,
    *         otherwise {@code null}
    * @throws IllegalArgumentException if no kernel mapping is registered for {@code kernelClass}
    */
   public List<ProfileInfo> getProfileInfo(Class<? extends Kernel> kernelClass) {
      final KernelMapping mapping = kernelMappingMap.get(kernelClass);
      if (mapping == null) {
         throw new IllegalArgumentException("cannot find kernel for " + kernelClass.getName());
      }

      if ((getExecutionMode() != EXECUTION_MODE.GPU) && (getExecutionMode() != EXECUTION_MODE.CPU)) {
         return null;
      }
      // Only makes sense when we are using OpenCL
      return getProfileInfoJNI(mapping.kernelContextHandle);
   }

   // When true, array arguments are tagged ARG_EXPLICIT and put()/get() requests are honoured in GPU/CPU mode.
   private boolean explicit = false;

   /**
    * Switches explicit mode on or off.
    *
    * @param _explicit new value of the explicit flag
    */
   public void setExplicit(boolean _explicit) {
      this.explicit = _explicit;
   }

   /**
    * Reports whether explicit mode is enabled.
    *
    * @return the current value of the explicit flag
    */
   public boolean isExplicit() {
      return this.explicit;
   }

   /**
    * Returns the time taken to convert bytecode to OpenCL for the first Kernel.execute(range) call.
    *
    * @return the time spent preparing the kernel for execution, as a difference of
    *         {@code System.currentTimeMillis()} readings (ms)
    */
   public long getConversionTime() {
      return this.conversionTime;
   }

   /**
    * Returns the execution time of the previous Kernel.execute(range) call.
    *
    * @return the time spent executing the kernel (ms)
    */
   public long getExecutionTime() {
      return this.executionTime;
   }

   /**
    * Returns the sum of the execution times of all previous Kernel.execute(range) calls.
    *
    * @return the accumulated time spent executing this kernel (ms)
    */
   public long getAccumulatedExecutionTime() {
      return this.accumulatedExecutionTime;
   }

   /**
    * Returns the execution mode this runner currently uses.
    *
    * @return the current execution mode
    */
   public EXECUTION_MODE getExecutionMode() {
      return this.executionMode;
   }

   /**
    * Sets the execution mode this runner uses.
    *
    * @param executionMode the new execution mode
    */
   public void setExecutionMode(EXECUTION_MODE executionMode) {
      this.executionMode = executionMode;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code long[]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(long[] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code long[][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(long[][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code long[][][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(long[][][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code double[]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(double[] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code double[][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(double[][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code double[][][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(double[][][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code float[]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(float[] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code float[][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(float[][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code float[][][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(float[][][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code int[]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(int[] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code int[][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(int[][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code int[][][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(int[][][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code byte[]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(byte[] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code byte[][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(byte[][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code byte[][][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(byte[][][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code char[]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(char[] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code char[][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(char[][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code char[][][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(char[][][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code boolean[]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(boolean[] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code boolean[][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(boolean[][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed.
    * Delegates to {@code putRaw}, which ignores the request unless explicit mode is enabled
    * and the execution mode is GPU or CPU.
    *
    * @param array the {@code boolean[][][]} to put
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner put(boolean[][][] array) {
      putRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code long[]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(long[] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code long[][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(long[][] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code long[][][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(long[][][] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code double[]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(double[] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code double[][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(double[][] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code double[][][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(double[][][] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code float[]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(float[] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code float[][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(float[][] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code float[][][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(float[][][] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code int[]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(int[] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code int[][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(int[][] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code int[][][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(int[][][] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code byte[]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(byte[] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code byte[][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(byte[][] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code byte[][][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(byte[][][] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code char[]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(char[] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code char[][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(char[][] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code char[][][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(char[][][] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code boolean[]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(boolean[] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code boolean[][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(boolean[][] array) {
      getRaw(array);
      return this;
   }

   /**
    * Enqueues a request to return the given buffer from the device; the request is documented as blocking
    * until the array is available. Delegates to {@code getRaw}, which ignores the request unless explicit
    * mode is enabled and the execution mode is GPU or CPU.
    *
    * @param array the {@code boolean[][][]} to get
    * @return this runner, so that calls can be chained in the 'fluent' style
    */
   public KernelRunner get(boolean[][][] array) {
      getRaw(array);
      return this;
   }


   /**
    * Enqueues a request to return the given array from the device by forwarding it to the native
    * {@code getJNI} call. The request is documented as blocking until the array is available.
    * <p>
    * All typed {@code get(type[])} overloads of this class delegate to this method. The request is only
    * forwarded when explicit mode is enabled and the execution mode is GPU or CPU; otherwise the call
    * does nothing.
    *
    * @param array
    *          It is assumed that this parameter is indeed an array (of int, float, short etc); no check is made here.
    *
    * @see #get(int[])
    * @see #get(int[][])
    * @see #get(int[][][])
    * @see #get(float[])
    * @see #get(float[][])
    * @see #get(float[][][])
    * @see #get(double[])
    * @see #get(double[][])
    * @see #get(double[][][])
    * @see #get(long[])
    * @see #get(long[][])
    * @see #get(long[][][])
    * @see #get(byte[])
    * @see #get(byte[][])
    * @see #get(byte[][][])
    * @see #get(char[])
    * @see #get(char[][])
    * @see #get(char[][][])
    * @see #get(boolean[])
    * @see #get(boolean[][])
    * @see #get(boolean[][][])
    */
   private void getRaw(Object array) {
      if (!explicit) {
         return;
      }
      if ((getExecutionMode() != EXECUTION_MODE.GPU) && (getExecutionMode() != EXECUTION_MODE.CPU)) {
         // Only makes sense when we are using OpenCL
         return;
      }
      getJNI(kernelRunnerContextHandle, array);
   }

   /**
    * Tags the given array so that it is explicitly enqueued before the kernel is executed, by adding it
    * to the {@code puts} collection of this runner.
    * <p>
    * All typed {@code put(type[])} overloads of this class delegate to this method. The array is only
    * recorded when explicit mode is enabled and the execution mode is GPU or CPU; otherwise the call
    * does nothing.
    *
    * @param array
    *          It is assumed that this parameter is indeed an array (of int, float, short etc); no check is made here.
    * @see #put(int[])
    * @see #put(int[][])
    * @see #put(int[][][])
    * @see #put(float[])
    * @see #put(float[][])
    * @see #put(float[][][])
    * @see #put(double[])
    * @see #put(double[][])
    * @see #put(double[][][])
    * @see #put(long[])
    * @see #put(long[][])
    * @see #put(long[][][])
    * @see #put(byte[])
    * @see #put(byte[][])
    * @see #put(byte[][][])
    * @see #put(char[])
    * @see #put(char[][])
    * @see #put(char[][][])
    * @see #put(boolean[])
    * @see #put(boolean[][])
    * @see #put(boolean[][][])
    */
   private void putRaw(Object array) {
      if (!explicit) {
         return;
      }
      if ((getExecutionMode() != EXECUTION_MODE.GPU) && (getExecutionMode() != EXECUTION_MODE.CPU)) {
         // Only makes sense when we are using OpenCL
         return;
      }
      puts.add(array);
   }

   /**
    * Adds possible fallback execution modes and restarts the iteration over them.
    * <p>
    * For example {@code addExecutionModes(GPU, CPU, JTP)} will try to use the GPU; if that fails it will
    * fall back to OpenCL CPU and finally it will try JTP.
    * <p>
    * After the modes are added, iteration restarts from the first registered mode, which becomes the
    * current execution mode. If no mode is registered at all (the method was called without arguments on
    * an empty collection), the current execution mode is left unchanged rather than failing with a
    * {@link java.util.NoSuchElementException}.
    *
    * @param platforms execution modes to add, in order of preference
    */
   public void addExecutionModes(EXECUTION_MODE... platforms) {
      executionModes.addAll(Arrays.asList(platforms));
      currentMode = executionModes.iterator();
      if (currentMode.hasNext()) {
         executionMode = currentMode.next();
      }
   }

   /**
    * Reports whether another execution path is left to try.
    *
    * @return true if the execution-mode iterator has a further element
    */
   public boolean hasNextExecutionMode() {
      final boolean another = currentMode.hasNext();
      return another;
   }

   /**
    * Advances to the next execution path in the list; if there are no more, the current execution
    * mode is kept unchanged.
    */
   public void tryNextExecutionMode() {
      if (!currentMode.hasNext()) {
         return;
      }
      executionMode = currentMode.next();
   }

   /**
    * Replaces the current execution mode with the mode returned by
    * {@code EXECUTION_MODE.getFallbackExecutionMode()}.
    */
   public void setFallbackExecutionMode() {
      final EXECUTION_MODE fallback = EXECUTION_MODE.getFallbackExecutionMode();
      executionMode = fallback;
   }
}