/*
* This file is part of the X10 project (http://x10-lang.org).
*
* This file is licensed to You under the Eclipse Public License (EPL);
* You may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.opensource.org/licenses/eclipse-1.0.php
*
* (C) Copyright IBM Corporation 2006-2010.
*/
package x10cuda.visit;
import static x10cpp.visit.Emitter.mangled_non_method_name;
import static x10cpp.visit.SharedVarsMethods.CUDA_NATIVE_STRING;
import static x10cpp.visit.SharedVarsMethods.CPP_NATIVE_STRING;
import static x10cpp.visit.SharedVarsMethods.DESERIALIZATION_BUFFER;
import static x10cpp.visit.SharedVarsMethods.DESERIALIZE_METHOD;
import static x10cpp.visit.SharedVarsMethods.SERIALIZATION_BUFFER;
import static x10cpp.visit.SharedVarsMethods.SERIALIZATION_ID_FIELD;
import static x10cpp.visit.SharedVarsMethods.SERIALIZATION_MARKER;
import static x10cpp.visit.SharedVarsMethods.SERIALIZE_BODY_METHOD;
import static x10cpp.visit.SharedVarsMethods.THIS;
import static x10cpp.visit.SharedVarsMethods.SAVED_THIS;
import static x10cpp.visit.SharedVarsMethods.chevrons;
import static x10cpp.visit.SharedVarsMethods.make_ref;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import polyglot.ast.ArrayInit_c;
import polyglot.ast.Assert_c;
import polyglot.ast.Assign_c;
import polyglot.ast.Binary;
import polyglot.ast.Binary_c;
import polyglot.ast.Block;
import polyglot.ast.Block_c;
import polyglot.ast.BooleanLit_c;
import polyglot.ast.Branch_c;
import polyglot.ast.Call;
import polyglot.ast.CanonicalTypeNode;
import polyglot.ast.Case_c;
import polyglot.ast.Catch_c;
import polyglot.ast.CharLit_c;
import polyglot.ast.ClassBody_c;
import polyglot.ast.Conditional_c;
import polyglot.ast.ConstructorDecl_c;
import polyglot.ast.Do_c;
import polyglot.ast.Empty_c;
import polyglot.ast.Eval;
import polyglot.ast.Eval_c;
import polyglot.ast.Expr;
import polyglot.ast.FieldDecl_c;
import polyglot.ast.Field_c;
import polyglot.ast.FloatLit_c;
import polyglot.ast.For;
import polyglot.ast.ForInit;
import polyglot.ast.For_c;
import polyglot.ast.Formal;
import polyglot.ast.Formal_c;
import polyglot.ast.Id_c;
import polyglot.ast.If_c;
import polyglot.ast.Import_c;
import polyglot.ast.Initializer_c;
import polyglot.ast.IntLit;
import polyglot.ast.IntLit_c;
import polyglot.ast.Labeled_c;
import polyglot.ast.Local;
import polyglot.ast.LocalClassDecl_c;
import polyglot.ast.LocalDecl;
import polyglot.ast.LocalDecl_c;
import polyglot.ast.Local_c;
import polyglot.ast.MethodDecl_c;
import polyglot.ast.New_c;
import polyglot.ast.Node;
import polyglot.ast.NullLit_c;
import polyglot.ast.PackageNode_c;
import polyglot.ast.Receiver;
import polyglot.ast.Return_c;
import polyglot.ast.Stmt;
import polyglot.ast.StringLit_c;
import polyglot.ast.SwitchBlock_c;
import polyglot.ast.Switch_c;
import polyglot.ast.Throw_c;
import polyglot.ast.Try;
import polyglot.ast.Try_c;
import polyglot.ast.TypeNode;
import polyglot.ast.Unary_c;
import polyglot.ast.While_c;
import polyglot.frontend.Compiler;
import x10.ast.AssignPropertyCall_c;
import x10.ast.Closure;
import x10.ast.ClosureCall_c;
import x10.ast.Closure_c;
import x10.ast.ForLoop;
import x10.ast.ForLoop_c;
import x10.ast.ParExpr_c;
import x10.ast.PropertyDecl_c;
import x10.ast.StmtSeq;
import x10.ast.SubtypeTest_c;
import x10.ast.Tuple_c;
import x10.ast.TypeDecl_c;
import x10.ast.X10Binary_c;
import x10.ast.X10Call;
import x10.ast.X10Call_c;
import x10.ast.X10CanonicalTypeNode_c;
import x10.ast.X10Cast_c;
import x10.ast.X10ClassDecl;
import x10.ast.X10ClassDecl_c;
import x10.ast.X10Formal;
import x10.ast.X10Instanceof_c;
import x10.ast.X10Loop;
import x10.ast.X10Loop_c;
import x10.ast.X10MethodDecl_c;
import x10.ast.X10New_c;
import x10.ast.X10Special_c;
import x10.ast.X10Unary_c;
import x10.constraint.XEQV;
import x10.constraint.XEquals;
import x10.constraint.XFailure;
import x10.constraint.XFormula;
import x10.constraint.XLit;
import x10.constraint.XLocal;
import x10.constraint.XTerm;
import x10.constraint.XVar;
import x10.extension.X10Ext;
import x10.types.ConstrainedType;
import x10.types.MethodInstance;
import x10.types.X10ClassDef;
import x10.types.X10ClassType;
import x10.types.X10MethodDef;
import polyglot.types.TypeSystem;
import x10.types.constraints.CConstraint;
import x10cpp.X10CPPCompilerOptions;
import x10cpp.postcompiler.CXXCommandBuilder;
import x10cpp.postcompiler.PostCompileProperties;
import x10cpp.postcompiler.PrecompiledLibrary;
import x10cpp.types.X10CPPContext_c;
import x10cpp.visit.Emitter;
import x10cpp.visit.MessagePassingCodeGenerator;
import x10cpp.visit.SharedVarsMethods;
import x10cpp.visit.X10CPPTranslator;
import x10cuda.ast.CUDAKernel;
import x10cuda.types.CUDAData;
import x10cuda.types.SharedMem;
import x10cuda.types.X10CUDAContext_c;
import polyglot.main.Options;
import polyglot.main.Report;
import polyglot.types.Context;
import polyglot.types.Flags;
import polyglot.types.Name;
import polyglot.types.QName;
import polyglot.types.SemanticException;
import polyglot.types.Type;
import polyglot.types.Types;
import polyglot.types.VarInstance;
import polyglot.util.ErrorInfo;
import polyglot.util.ErrorQueue;
import polyglot.util.SimpleCodeWriter;
import polyglot.visit.NodeVisitor;
import polyglot.visit.PrettyPrinter;
import polyglot.visit.Translator;
import x10.util.ClassifiedStream;
import x10.util.StreamWrapper;
/**
* Visitor that prettyprints an X10 AST to the CUDA subset of c++.
*
* @author Dave Cunningham
*/
public class CUDACodeGenerator extends MessagePassingCodeGenerator {
// Fully-qualified names of the annotations that drive CUDA split-compilation.
private static final String ANN_KERNEL = "x10.compiler.CUDA";
private static final String ANN_DIRECT_PARAMS = "x10.compiler.CUDADirectParams";

/**
 * Creates a code generator that produces the normal C++ output and, for
 * @CUDA-annotated blocks, additionally emits CUDA code into a parallel
 * .cu output stream.
 */
public CUDACodeGenerator(StreamWrapper sw, Translator tr) {
    super(sw, tr);
}
/**
 * Returns which @Native string variants to honour at the current point of
 * translation.  Inside CUDA output the CUDA-specific strings take priority,
 * with the C++ strings as a fallback; otherwise only C++ strings apply.
 */
protected String[] getCurrentNativeStrings() {
    if (generatingCUDACode()) {
        return new String[] { CUDA_NATIVE_STRING, CPP_NATIVE_STRING };
    }
    return new String[] { CPP_NATIVE_STRING };
}
/** The translation context, narrowed to the CUDA-aware subclass. */
private X10CUDAContext_c context() {
    return (X10CUDAContext_c) tr.context();
}

/** Shorthand for the translator's type system. */
private TypeSystem xts() {
    return tr.typeSystem();
}

// defer to CUDAContext.cudaStream()
/** Stream that receives generated __global__ kernels (the .cu file). */
private ClassifiedStream cudaKernelStream() {
    return context().cudaKernelStream(sw, tr.job());
}

/** Stream that receives generated __device__ functions in the .cu file. */
private ClassifiedStream cudaClassBodyStream() {
    return context().cudaClassBodyStream(sw, tr.job());
}

/** Whether we are currently emitting into the CUDA output rather than C++. */
private boolean generatingCUDACode() {
    return context().generatingCUDACode();
}

/**
 * Toggles CUDA-output mode.  Callers in this class set it and restore it
 * in a finally block (see visit(Block_c)).
 */
private void generatingCUDACode(boolean v) {
    context().generatingCUDACode(v);
}
// does the block have the annotation that denotes that it should be
// split-compiled to cuda?
private boolean blockIsKernel(Node n) {
    // By code-generation time, kernel blocks are represented by the
    // CUDAKernel AST node.
    return n instanceof CUDAKernel;
}

// type from name
/** Resolves a fully-qualified type name through the system resolver. */
private Type getType(String name) throws SemanticException {
    return xts().systemResolver().findOne(QName.make(name));
}

// does the block have the given annotation
private boolean nodeHasAnnotation(Node n, String ann) {
    X10Ext ext = (X10Ext) n.ext();
    try {
        return !ext.annotationMatching(getType(ann)).isEmpty();
    } catch (SemanticException e) {
        // Resolution of the compiler's own annotation types is not expected
        // to fail; treat it as an internal error.
        assert false : e;
        return false; // in case asserts are off
    }
}

// does the block have the annotation that denotes that it should be
// split-compiled to cuda?
private boolean nodeHasCUDAAnnotation(Node n) {
    return nodeHasAnnotation(n, ANN_KERNEL);
}
private String env = "__env";
/** Internal control-flow exception used to abandon a malformed kernel. */
@SuppressWarnings("serial")
private static class Complaint extends RuntimeException {
}

/** Like complainIfNot2, but prefixes the message with "@CUDA Expected:". */
private void complainIfNot(boolean cond, String exp, Node n, boolean except) {
    complainIfNot2(cond, "@CUDA Expected: " + exp, n, except);
}
/**
 * Reports {@code exp} as a semantic error at {@code n} when {@code cond}
 * does not hold.  If {@code except} is true, additionally aborts translation
 * of the enclosing kernel by throwing {@link Complaint}.
 */
private void complainIfNot2(boolean cond, String exp, Node n, boolean except) {
    if (cond) {
        return; // condition holds; nothing to report
    }
    ErrorQueue eq = tr.job().compiler().errorQueue();
    eq.enqueue(ErrorInfo.SEMANTIC_ERROR, exp, n.position());
    if (except) {
        throw new Complaint();
    }
}
/** Variant of complainIfNot that always aborts the kernel on failure. */
private void complainIfNot(boolean cond, String exp, Node n) {
    complainIfNot(cond, exp, n, true);
}

/** Variant of complainIfNot2 that always aborts the kernel on failure. */
private void complainIfNot2(boolean cond, String exp, Node n) {
    complainIfNot2(cond, exp, n, true);
}
/**
 * Returns the element type T when {@code typ} is Array[T] or
 * RemoteRef[Array[T]]; returns null for any other type.
 */
private Type arrayCargo(Type typ) {
    if (xts().isArray(typ)) {
        X10ClassType ctyp = (X10ClassType) typ.toClass();
        assert ctyp.typeArguments() != null && ctyp.typeArguments().size() == 1; // Array[T]
        return ctyp.typeArguments().get(0);
    }
    if (xts().isRemoteArray(typ)) {
        X10ClassType ctyp = (X10ClassType) typ.toClass();
        assert ctyp.typeArguments() != null && ctyp.typeArguments().size() == 1; // RemoteRef[Array[T]]
        // Unwrap one level: the RemoteRef's type argument is Array[T].
        // (Bug fix: previously the outer type was re-cast here instead of
        // its type argument, so remote arrays reported Array[T] rather than
        // T as their cargo, defeating the isIntArray/isFloatArray checks.)
        Type arrayType = ctyp.typeArguments().get(0);
        X10ClassType ctyp2 = (X10ClassType) arrayType;
        assert ctyp2.typeArguments() != null && ctyp2.typeArguments().size() == 1; // Array[T]
        return ctyp2.typeArguments().get(0);
    }
    return null; // not a recognised array type
}
/** True iff {@code typ} is an array (local or remote) of Float elements. */
private boolean isFloatArray(Type typ) {
    final Type elem = arrayCargo(typ);
    if (elem == null) {
        return false;
    }
    return elem.isFloat();
}

/** True iff {@code typ} is an array (local or remote) of Int elements. */
private boolean isIntArray(Type typ) {
    final Type elem = arrayCargo(typ);
    if (elem == null) {
        return false;
    }
    return elem.isInt();
}
/**
 * Prepends the CUDA spelling of type {@code t} to the declarator
 * {@code rest}, e.g. "x10_int x" or "x10aux::cuda_array&lt;x10_float&gt; a".
 * Int/float arrays map to the cuda_array wrapper; everything else uses the
 * regular C++ translation of the type.
 */
String prependCUDAType(Type t, String rest) {
    final String type;
    if (isIntArray(t)) {
        type = "x10aux::cuda_array<x10_int> ";
    } else if (isFloatArray(t)) {
        type = "x10aux::cuda_array<x10_float> ";
    } else {
        // Only consult the generic C++ translation when no CUDA-specific
        // spelling applies (the original computed it unconditionally and
        // discarded it in the array branches).
        type = Emitter.translateType(t, true) + " ";
    }
    return type + rest;
}
/**
 * Emits the CUDA translation of a kernel block into the .cu stream: the
 * environment struct, the extern "C" __global__ entry point, staging of
 * parameters into shared memory (pointer-env mode), constant/shared memory
 * initialisation, and finally the translated kernel body.  A marker comment
 * is left in the regular C++ output.
 */
void handleKernel(Stmt b) {
    CUDAKernel cuda_kernel = context().cudaKernel();
    //System.out.println("Here is the kernel: "+cuda_kernel);
    String kernel_name = context().wrappingClosure();
    // Marker in the C++ output so readers can find the matching .cu code.
    sw.write("/* block split-compiled to cuda as " + kernel_name + " */ ");
    ClassifiedStream out = cudaKernelStream();
    // environment (passed into kernel via pointer)
    generateStruct(kernel_name, out, context().kernelParams());
    out.forceNewline();
    // ptr == true unless @CUDADirectParams: env arrives as a pointer.
    boolean ptr = !cuda_kernel.directParams;
    // kernel (extern "C" to disable name-mangling which seems to be
    // inconsistent across cuda versions)
    out.write("extern \"C\" __global__ void " + kernel_name + "("
            + kernel_name + "_env " + (ptr ? "*" : "") + env + ") {");
    out.newline(4);
    out.begin(0);
    if (ptr) {
        // Pointer mode: declare one __shared__ local per captured variable,
        // have thread 0 copy the env into them, then synchronise.
        for (VarInstance<?> var : context().kernelParams()) {
            String name = var.name().toString();
            if (name.equals(THIS)) {
                name = SAVED_THIS;
            } else {
                name = Emitter.mangled_non_method_name(name);
            }
            out.write("__shared__ " + prependCUDAType(var.type(), name) + ";");
            out.newline();
        }
        out.write("if (threadIdx.x==0) {");
        out.newline(4);
        out.begin(0);
        for (VarInstance<?> var : context().kernelParams()) {
            String name = var.name().toString();
            if (name.equals(THIS)) {
                name = SAVED_THIS;
            } else {
                name = Emitter.mangled_non_method_name(name);
            }
            out.write(name + " = " + env + "->" + name + ";");
            out.newline();
        }
        out.end();
        out.newline();
        out.write("}");
        out.newline();
        out.write("__syncthreads(); // kernel parameters");
        out.newline();
        out.forceNewline();
    }
    // Constant-memory initialisation code.
    sw.pushCurrentStream(out);
    try {
        cuda_kernel.cmem.generateCodeConstantMemory(sw, tr);
    } finally {
        sw.popCurrentStream();
    }
    // Shared-memory initialisation code.
    sw.pushCurrentStream(out);
    try {
        cuda_kernel.shm.generateCodeSharedMem(sw, tr);
    } finally {
        sw.popCurrentStream();
    }
    out.write("__syncthreads(); // initialised shm"); out.newline();
    out.forceNewline();
    // body
    sw.pushCurrentStream(out);
    try {
        // Bring the kernel's implicit variables into scope before
        // translating the body.
        Context context = tr.context();
        cuda_kernel.blocksVar.addDecls(context);
        cuda_kernel.threadsVar.addDecls(context);
        if (cuda_kernel.autoBlocks != null) cuda_kernel.autoBlocks.addDecls(context);
        if (cuda_kernel.autoThreads != null) cuda_kernel.autoThreads.addDecls(context);
        cuda_kernel.cmem.addDecls(context);
        cuda_kernel.shm.addDecls(context);
        super.visitAppropriate(b);
    } finally {
        sw.popCurrentStream();
    }
    // end
    out.end();
    out.newline();
    out.write("} // " + kernel_name);
    out.newline();
    out.forceNewline();
}
/**
 * Emits "struct &lt;kernel_name&gt;_env { ... };" with one field per captured
 * variable, using the CUDA spelling of each variable's type.
 */
private void generateStruct(String kernel_name, SimpleCodeWriter out, ArrayList<VarInstance<?>> vars) {
    out.write("struct " + kernel_name + "_env {");
    out.newline(4);
    out.begin(0);
    // emitter.printDeclarationList(out, context(),
    // context().kernelParams());
    for (VarInstance<?> var : vars) {
        String name = var.name().toString();
        if (name.equals(THIS)) {
            // Captured 'this' is stored under the saved-this name.
            name = SAVED_THIS;
        } else {
            name = Emitter.mangled_non_method_name(name);
        }
        out.write(prependCUDAType(var.type(), name) + ";");
        out.newline();
    }
    out.end();
    out.newline();
    out.write("};");
    out.newline();
}
/**
 * Translates a block.  After the normal C++ translation, kernel blocks are
 * additionally split-compiled to CUDA; blocks inside a @CUDA function are
 * redirected into the CUDA class-body stream.
 */
public void visit(Block_c b) {
    super.visit(b);
    try {
        if (blockIsKernel(b)) {
            final CUDAKernel cuda_kernel = (CUDAKernel)b;
            complainIfNot2(!generatingCUDACode(), "@CUDA kernels may not be nested.", b);
            // Record the kernel in both the current and established contexts
            // and reset the captured-parameter lists for this kernel.
            context().cudaKernel(cuda_kernel);
            context().initKernelParams();
            context().established().cudaKernel(cuda_kernel);
            context().established().initKernelParams();
            generatingCUDACode(true);
            try {
                handleKernel(cuda_kernel.body());
            } finally {
                generatingCUDACode(false);
            }
        } else if (context().inCUDAFunction() && !generatingCUDACode()){
            // Body of a @CUDA-annotated method: emit it into the .cu file.
            generatingCUDACode(true);
            sw.pushCurrentStream(cudaClassBodyStream());
            try {
                super.visitAppropriate(b);
            } finally {
                sw.popCurrentStream();
                generatingCUDACode(false);
            }
        }
    } catch (Complaint e) {
        // don't bother doing anything more with this kernel,
        // just try and continue with the code after
        // (note that we've already done the regular CPU code)
    }
}
/**
 * Tracks the mangled name of the closure currently being translated so
 * kernels can be named after their wrapping closure, then defers to the
 * superclass for the actual translation.
 */
public void visit(Closure_c n) {
    context().establishClosure();
    String last = context().wrappingClosure();
    X10ClassType hostClassType = (X10ClassType) n.closureDef().typeContainer().get();
    String nextHostClassName = Emitter.translate_mangled_FQN(hostClassType.fullName().toString(), "_");
    String next = getClosureName(nextHostClassName, context().closureId() + 1);
    context().wrappingClosure(next);
    try {
        super.visit(n);
    } finally {
        // Restore the enclosing closure's name even on failure.
        context().wrappingClosure(last);
    }
}
/**
 * Emits the serialization-id definition for a closure.  For kernel closures
 * the deserializer registration additionally passes the CUDA deserialize and
 * post-kernel hooks; all other closures use the default superclass output.
 */
protected void generateClosureDeserializationIdDef(ClassifiedStream defn_s, String cnamet, List<Type> freeTypeParams, String hostClassName, Block block, int kind) {
    if (blockIsKernel(block)) {
        assert kind==1;
        boolean in_template_closure = freeTypeParams.size() > 0;
        if (in_template_closure)
            emitter.printTemplateSignature(freeTypeParams, defn_s);
        defn_s.write("const x10aux::serialization_id_t " + cnamet + "::"
                + SharedVarsMethods.SERIALIZATION_ID_FIELD + " = ");
        defn_s.newline(4);
        // Inside a template, dependent member references need "template ".
        String template = in_template_closure ? "template " : "";
        defn_s.write("x10aux::DeserializationDispatcher::addDeserializer("
                + cnamet + "::" + template
                + SharedVarsMethods.DESERIALIZE_METHOD
                + chevrons("x10::lang::Reference") + ", "+closure_kind_strs[kind]+", " + cnamet
                + "::" + template
                + SharedVarsMethods.DESERIALIZE_CUDA_METHOD + ", " + cnamet
                + "::" + template + SharedVarsMethods.POST_CUDA_METHOD
                + ", " + "\"" + hostClassName + "\", \"" + cnamet
                + "\");");
        defn_s.newline();
        defn_s.forceNewline();
    } else {
        super.generateClosureDeserializationIdDef(defn_s, cnamet,
                freeTypeParams, hostClassName, block, kind);
    }
}
/**
 * Emits the closure serialization functions.  For kernel closures this adds
 * two CUDA-specific hooks on top of the superclass output:
 * _post_cuda (frees GPU-side buffers after the kernel has run) and
 * _deserialize_cuda (unpacks the closure, uploads captured arrays to the
 * GPU, and computes the launch configuration).
 */
protected void generateClosureSerializationFunctions(X10CPPContext_c c, String cnamet, StreamWrapper inc, Block block, List<VarInstance<?>> env2, List<VarInstance<?>> refs) {
    super.generateClosureSerializationFunctions(c, cnamet, inc, block, env2, refs);
    if (blockIsKernel(block)) {
        CUDAKernel cuda_kernel = ((CUDAKernel)block);
        ArrayList<VarInstance<?>> env = context().kernelParams();
        if (env == null)
            return;
        // Environment struct shared by both hooks below.
        generateStruct("__cuda", inc, env);
        // ---- _post_cuda: free GPU-side resources after kernel completion.
        inc.write("static void "
                + SharedVarsMethods.POST_CUDA_METHOD
                + "("
                + DESERIALIZATION_BUFFER
                + " &__buf, x10aux::place __gpu, size_t __blocks, size_t __threads, size_t __shm, size_t argc, char *argv, size_t cmemc, char *cmemv) {");
        inc.newline(4);
        inc.begin(0);
        inc.write("__cuda_env __env;");
        inc.newline();
        if (!cuda_kernel.directParams) {
            // argv holds a pointer to a GPU-side copy of the env; free it.
            inc.write("x10_ulong __remote_env;");
            inc.newline();
            inc.write("::memcpy(&__remote_env, argv, sizeof (void*));");
            inc.newline();
            inc.write("x10aux::remote_free(__gpu, __remote_env);");
            inc.newline();
            // FIXME: any arrays referenced from the env are being leaked
            // here.
            // we need some way to record a copy of the contents of the
            // __env on the host
            // so that we do not have to fetch __remote_env back onto the
            // host
            // then we can free those arrays like in the else branch below
        } else {
            // argv holds the env by value; free the GPU copies of any
            // locally-sourced arrays (remote arrays already lived there).
            inc.write("::memcpy(&__env, argv, argc);");
            inc.newline();
            for (VarInstance<?> var : env) {
                Type t = var.type();
                String name = var.name().toString();
                if (isIntArray(t) || isFloatArray(t)) {
                    if (!xts().isRemoteArray(t)) {
                        inc.write("x10aux::remote_free(__gpu, (x10_ulong)(size_t)__env." + name + ".raw);");
                    }
                }
                inc.newline();
            }
        }
        inc.end();
        inc.newline();
        inc.write("}");
        inc.newline();
        inc.forceNewline();
        // ---- _deserialize_cuda: unpack closure and prepare kernel launch.
        inc.write("static void "+SharedVarsMethods.DESERIALIZE_CUDA_METHOD+"("+DESERIALIZATION_BUFFER+" &__buf, x10aux::place __gpu, size_t &__blocks, size_t &__threads, size_t &__shm, size_t &__argc, char *&__argv, size_t &__cmemc, char *&__cmemv) {");
        inc.newline(4);
        inc.begin(0);
        inc.write(make_ref(cnamet) + " __this = " + cnamet + "::" + DESERIALIZE_METHOD + "<" + cnamet + ">(__buf);");
        inc.newline();
        // One local per captured variable; autoBlocks/autoThreads are left
        // uninitialised here and filled in by blocks_threads() below.
        for (VarInstance<?> var : env) {
            Type t = var.type();
            String name = var.name().toString();
            inc.write(Emitter.translateType(t, true) + " " + name);
            if (cuda_kernel.autoBlocks != null
                    && var == cuda_kernel.autoBlocks.localDef().asInstance()) {
                inc.write(";");
            } else if (cuda_kernel.autoThreads != null
                    && var == cuda_kernel.autoThreads.localDef().asInstance()) {
                inc.write(";");
            } else {
                inc.write(" = __this->" + name + ";");
            }
            inc.newline();
        }
        // Shared-memory and constant-memory sizes for this launch.
        inc.write("__shm = ");
        inc.begin(0);
        cuda_kernel.shm.generateSize(inc, tr);
        inc.write(";");
        inc.end();
        inc.newline();
        inc.write("x10aux::check_shm_size(__shm);");
        inc.write("__cmemc = ");
        inc.begin(0);
        cuda_kernel.cmem.generateSize(inc, tr);
        inc.write(";");
        inc.end();
        inc.newline();
        inc.write("x10aux::check_cmem_size(__cmemc);");
        cuda_kernel.cmem.generateHostCodeConstantMemory(inc, tr);
        // this is probably broken when only one is given.
        if (cuda_kernel.autoBlocks != null
                && cuda_kernel.autoThreads != null) {
            String bname = cuda_kernel.autoBlocks.name().id().toString();
            String tname = cuda_kernel.autoThreads.name().id().toString();
            inc.write("x10aux::blocks_threads(__gpu, x10aux::DeserializationDispatcher::getMsgType(_serialization_id), __shm, "
                    + bname + ", " + tname + ");");
            inc.newline();
        }
        inc.write("__blocks = (");
        inc.begin(0);
        tr.print(null, cuda_kernel.blocks, inc);
        inc.write(")+1;");
        inc.end();
        inc.newline();
        inc.write("__threads = (");
        inc.begin(0);
        tr.print(null, cuda_kernel.threads, inc);
        inc.write(")+1;");
        inc.end();
        inc.newline();
        // Populate the env struct, uploading local arrays to the GPU.
        inc.write("__cuda_env __env;");
        inc.newline();
        for (VarInstance<?> var : env) {
            Type t = var.type();
            String name = var.name().toString();
            // String addr = "&(*"+name+")[0]"; // old way for rails
            String addr = "&" + name + "->FMGL(raw).raw()[0]";
            // String rr =
            // "x10aux::get_remote_ref_maybe_null("+name+".operator->())";
            // // old object model
            String rr = "&" + name + "->FMGL(rawData).raw()[0]";
            String ts = null;
            if (isIntArray(t)) {
                ts = "x10_int";
            } else if (isFloatArray(t)) {
                ts = "x10_float";
            }
            if (isIntArray(t) || isFloatArray(t)) {
                if (xts().isRemoteArray(t)) {
                    // Remote arrays already reside on the GPU; record the
                    // raw pointer and size directly.
                    inc.write("__env." + name + ".raw = (" + ts + "*)(size_t)" + rr + ";");
                    inc.newline();
                    inc.write("__env." + name + ".FMGL(size) = " + name + "->FMGL(size);");
                    inc.newline();
                } else {
                    // Local arrays: allocate GPU memory and copy contents up.
                    String len = name + "->FMGL(size)";
                    String sz = "sizeof(" + ts + ")*" + len;
                    inc.write("__env." + name + ".raw = (" + ts + "*)(size_t)x10aux::remote_alloc(__gpu, " + sz + ");");
                    inc.newline();
                    inc.write("__env." + name + ".FMGL(size) = " + len + ";");
                    inc.newline();
                    inc.write("x10aux::cuda_put(__gpu, (x10_ulong) __env." + name + ".raw, " + addr + ", " + sz + ");");
                }
            } else {
                inc.write("__env." + name + " = " + name + ";");
            }
            inc.newline();
        }
        // Marshal the env into argv (by value, or via a GPU-side pointer).
        if (env.isEmpty()) {
            inc.write("__argc = 0;");
            inc.end();
            inc.newline();
        } else {
            if (cuda_kernel.directParams) {
                inc.write("memcpy(__argv, &__env, sizeof(__env));");
                inc.newline();
                inc.write("__argc = sizeof(__env);");
                inc.end();
                inc.newline();
            } else {
                inc.write("x10_ulong __remote_env = x10aux::remote_alloc(__gpu, sizeof(__env));");
                inc.newline();
                inc.write("x10aux::cuda_put(__gpu, __remote_env, &__env, sizeof(__env));");
                inc.newline();
                inc.write("::memcpy(__argv, &__remote_env, sizeof (void*));");
                inc.newline();
                inc.write("__argc = sizeof(void*);");
                inc.end();
                inc.newline();
            }
        }
        inc.write("}");
        inc.newline();
        inc.forceNewline();
    }
}
/** Rejects 'new' inside kernel code (error reported; translation continues). */
public void visit(New_c n) {
    complainIfNot2(!generatingCUDACode(), "New not allowed in @CUDA code.", n, false);
    super.visit(n);
}

/** Rejects 'assert' inside kernel code (it can throw). */
@Override
public void visit(Assert_c n) {
    complainIfNot2(!generatingCUDACode(),
            "Throwing exceptions not allowed in @CUDA code.", n, false);
    super.visit(n);
}

/** Rejects catch clauses inside kernel code. */
@Override
public void visit(Catch_c n) {
    complainIfNot2(!generatingCUDACode(),
            "Catching exceptions not allowed in @CUDA code.", n, false);
    super.visit(n);
}

/** Rejects closure calls inside kernel code. */
@Override
public void visit(ClosureCall_c n) {
    complainIfNot2(!generatingCUDACode(),
            "Closure calls not allowed in @CUDA code.", n, false);
    super.visit(n);
}
/**
 * Translates a local variable reference.  Inside a kernel body, the special
 * kernel variables map to CUDA builtins (blockIdx.x / threadIdx.x), the shm
 * iteration variable maps to __i, and kernel parameters are either inlined
 * as literals (when their constrained type pins their value) or read from
 * the kernel environment.  Outside kernel bodies, the autoBlocks/autoThreads
 * locals keep their names so _deserialize_cuda can reference them.
 */
@Override
public void visit(Local_c n) {
    CUDAKernel cuda_kernel = context().cudaKernel();
    if (generatingCUDACode() && !inCUDAFunction()) {
        ClassifiedStream out = cudaKernelStream();
        Name ln = n.name().id();
        // HACK: Use localDef().name(), rather than name().id(), because the vars will have been renamed
        if (ln == cuda_kernel.blocksVar.localDef().name()) {
            out.write("blockIdx.x");
        } else if (ln == cuda_kernel.threadsVar.localDef().name()) {
            out.write("threadIdx.x");
        } else if (context().shmIterationVar()!=null && ln == context().shmIterationVar().localDef().name()) {
            out.write("__i");
        } else if (cuda_kernel.shm.has(ln)) {
            out.write(ln.toString());
        } else if (context().isKernelParam(ln)) {
            // it seems the post-compiler is not good at hoisting these
            // accesses so we do it ourselves
            String literal = constrainedToLiteral(n);
            if (literal!=null) {
                //System.out.println("Optimised kernel param: "+n+" --> "+literal);
                out.write(literal);
            } else {
                if (cuda_kernel.directParams) {
                    // Direct params are read from the __env struct parameter.
                    out.write(env + "." + ln);
                } else {
                    // Pointer params were staged into __shared__ locals.
                    out.write(ln.toString());
                }
            }
        } else {
            String literal = constrainedToLiteral(n);
            if (literal!=null) {
                //System.out.println("Optimised local: "+n+" --> "+literal);
                out.write(literal);
            } else {
                super.visit(n);
            }
        }
    } else {
        // we end up here in the _deserialize_cuda function because
        // generatingKernel() is false
        Name ln = n.name().id();
        if (cuda_kernel == null) {
            // not even in _deserialize_cuda, just arbitrary host code
            super.visit(n);
        } else if (cuda_kernel.autoBlocks != null && ln == cuda_kernel.autoBlocks.name().id()) {
            sw.write(cuda_kernel.autoBlocks.name().id().toString());
        } else if (cuda_kernel.autoThreads != null && ln == cuda_kernel.autoThreads.name().id()) {
            sw.write(cuda_kernel.autoThreads.name().id().toString());
        } else {
            super.visit(n);
        }
    }
}
/** Whether we are inside the body of a @CUDA-annotated method. */
private boolean inCUDAFunction() {
    return context().inCUDAFunction();
}
/**
 * If the local is final and its constrained type pins its value to a
 * literal, returns that literal's spelling (prefixed with an explanatory
 * C++ comment); otherwise returns null.  Used to hoist/inline kernel
 * parameter accesses the post-compiler would not optimise itself.
 */
private String constrainedToLiteral(Local_c n) {
    if (!n.localInstance().def().flags().isFinal()) return null;
    if (!(n.type() instanceof ConstrainedType)) return null;
    ConstrainedType ct = (ConstrainedType) n.type();
    CConstraint cc = ct.getRealXClause();
    XVar local_self = Types.selfVarBinding(cc);
    if (local_self==null) return null;
    if (local_self instanceof XLit) return "/*"+n+":"+n.type()+"*/"+local_self.toString();
    // resolve to another variable, keep going
    CConstraint projected= context().constraintProjection(cc);
    if (! projected.consistent())
        return null;
    XVar closed_self = projected.bindingForVar(local_self);
    if (closed_self==null) return null;
    if (closed_self instanceof XLit) return "/*"+n+":"+n.type()+"*/"+closed_self.toString();
    return null;
}
/** Rejects 'throw' inside kernel code (error reported; translation continues). */
@Override
public void visit(Throw_c n) {
    complainIfNot2(!generatingCUDACode(),
            "Throwing exceptions not allowed in @CUDA code.", n, false);
    super.visit(n);
}

/** Rejects try/catch inside kernel code (error reported; translation continues). */
@Override
public void visit(Try_c n) {
    complainIfNot2(!generatingCUDACode(),
            "Catching exceptions not allowed in @CUDA code.", n, false);
    super.visit(n);
}
/**
 * Tracks the class currently being translated (consulted by the
 * generic-class check in visit(X10MethodDecl_c)).
 */
@Override
public void visit(X10ClassDecl_c n) {
    // NOTE(review): 'v' saves the previous firstKernel flag but is never
    // restored afterwards (unlike wrappingClass below) -- confirm whether
    // context().firstKernel(v) was intended in the finally block.
    boolean v = context().firstKernel();
    context().firstKernel(true);
    X10ClassDef lastHostClass = context().wrappingClass();
    context().wrappingClass(n.classDef());
    try {
        super.visit(n);
    } finally {
        context().wrappingClass(lastHostClass);
    }
}
/**
 * Handles @CUDA-annotated methods: emits a "__device__" function header into
 * the CUDA class-body stream before the normal translation.  Only static
 * methods of non-generic classes are supported.
 */
@Override
public void visit(X10MethodDecl_c n) {
    if (nodeHasCUDAAnnotation(n)) {
        ClassifiedStream b = cudaClassBodyStream();
        Flags flags = n.flags().flags();
        X10MethodDef def = n.methodDef();
        MethodInstance mi = def.asInstance();
        complainIfNot2(flags.isStatic(), "Currently we only support static @CUDA functions.", n);
        complainIfNot2(context().wrappingClass().typeParameters().size() == 0, "Currently @CUDA functions cannot be in generic classes.", n);
        b.writeln("// "+n.toString());
        b.write("__device__ ");
        sw.pushCurrentStream(b);
        try {
            emitter.printHeader(n, sw, tr, mi.name().toString(), mi.returnType(), false, false);
        } finally {
            sw.popCurrentStream();
        }
        // NOTE(review): inCUDAFunction is set here but not visibly reset to
        // false in this file -- presumably cleared by context scoping;
        // confirm against X10CUDAContext_c.
        context().inCUDAFunction(true);
    }
    super.visit(n);
}
/** Rejects 'instanceof' inside kernel code (no runtime type info on GPU output). */
@Override
public void visit(X10Instanceof_c n) {
    complainIfNot2(!generatingCUDACode(),
            "Runtime types not available in @CUDA code.", n, false);
    super.visit(n);
}
/**
 * Post-compilation entry point: runs nvcc over any generated .cu files,
 * once per configured GPU architecture.  Skipped when no post-compiler is
 * configured or output goes to stdout.
 */
public static boolean postCompile(X10CPPCompilerOptions options, Compiler compiler, ErrorQueue eq) {
    if (options.post_compiler != null && !options.output_stdout) {
        CXXCommandBuilder ccb = CXXCommandBuilder.getCXXCommandBuilder(
                options,
                X10CPPTranslator.loadX10RTProperties(options),
                X10CPPTranslator.loadSharedLibProperties(),
                eq);
        for (String arch : ccb.getCUDAArchitectures()) {
            if (!postCompile(options, compiler, eq, arch, ccb)) return false;
        }
    }
    return true;
}
/**
 * Compiles every generated .cu compilation unit to &lt;file&gt;_&lt;arch&gt;.cubin
 * with nvcc.  A failing/missing nvcc only yields a warning (and overall
 * success): the program still runs, just without GPU acceleration.
 */
private static boolean postCompile(X10CPPCompilerOptions options, Compiler compiler, ErrorQueue eq, String arch, CXXCommandBuilder ccb) {
    Collection<String> compilationUnits = options.compilationUnits();
    for (String f : compilationUnits) {
        if (f.endsWith(".cu")) {
            ArrayList<String> nvccCmd = new ArrayList<String>();
            nvccCmd.add(ccb.getCUDAPostCompiler());
            for (String s : ccb.getCUDAPreFileArgs()) {
                nvccCmd.add(s);
            }
            nvccCmd.add("-arch="+arch);
            nvccCmd.add(f);
            nvccCmd.add("-o");
            // Strip the trailing ".cu" (3 chars) before appending _<arch>.cubin.
            nvccCmd.add(f.substring(0,f.length() - 3) + "_" + arch + ".cubin");
            if (!X10CPPTranslator.doPostCompile(options, eq, compilationUnits, nvccCmd.toArray(new String[nvccCmd.size()]), true)) {
                eq.enqueue(ErrorInfo.WARNING, "Found @CUDA annotation, but not compiling for GPU because nvcc could not be run (check your $PATH).");
                return true;
            }
        }
    }
    return true;
}
} // end of CUDACodeGenerator
// vim:tabstop=4:shiftwidth=4:expandtab