/*
* This file is part of the X10 project (http://x10-lang.org).
*
* This file is licensed to You under the Eclipse Public License (EPL);
* You may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.opensource.org/licenses/eclipse-1.0.php
*
* (C) Copyright IBM Corporation 2006-2010.
*/
package x10cuda.visit;
import static x10cpp.visit.Emitter.mangled_non_method_name;
import static x10cpp.visit.SharedVarsMethods.CUDA_NATIVE_STRING;
import static x10cpp.visit.SharedVarsMethods.CPP_NATIVE_STRING;
import static x10cpp.visit.SharedVarsMethods.DESERIALIZATION_BUFFER;
import static x10cpp.visit.SharedVarsMethods.DESERIALIZE_METHOD;
import static x10cpp.visit.SharedVarsMethods.SERIALIZATION_BUFFER;
import static x10cpp.visit.SharedVarsMethods.SERIALIZATION_ID_FIELD;
import static x10cpp.visit.SharedVarsMethods.SERIALIZATION_MARKER;
import static x10cpp.visit.SharedVarsMethods.SERIALIZE_BODY_METHOD;
import static x10cpp.visit.SharedVarsMethods.THIS;
import static x10cpp.visit.SharedVarsMethods.SAVED_THIS;
import static x10cpp.visit.SharedVarsMethods.chevrons;
import static x10cpp.visit.SharedVarsMethods.make_ref;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import polyglot.ast.ArrayInit_c;
import polyglot.ast.Assert_c;
import polyglot.ast.Assign_c;
import polyglot.ast.Binary;
import polyglot.ast.Binary_c;
import polyglot.ast.Block;
import polyglot.ast.Block_c;
import polyglot.ast.BooleanLit_c;
import polyglot.ast.Branch_c;
import polyglot.ast.Call;
import polyglot.ast.CanonicalTypeNode;
import polyglot.ast.Case_c;
import polyglot.ast.Catch_c;
import polyglot.ast.CharLit_c;
import polyglot.ast.ClassBody_c;
import polyglot.ast.Conditional_c;
import polyglot.ast.ConstructorDecl_c;
import polyglot.ast.Do_c;
import polyglot.ast.Empty_c;
import polyglot.ast.Eval;
import polyglot.ast.Eval_c;
import polyglot.ast.Expr;
import polyglot.ast.FieldDecl_c;
import polyglot.ast.Field_c;
import polyglot.ast.FloatLit_c;
import polyglot.ast.For;
import polyglot.ast.ForInit;
import polyglot.ast.For_c;
import polyglot.ast.Formal;
import polyglot.ast.Formal_c;
import polyglot.ast.Id_c;
import polyglot.ast.If_c;
import polyglot.ast.Import_c;
import polyglot.ast.Initializer_c;
import polyglot.ast.IntLit;
import polyglot.ast.IntLit_c;
import polyglot.ast.Labeled_c;
import polyglot.ast.Local;
import polyglot.ast.LocalClassDecl_c;
import polyglot.ast.LocalDecl;
import polyglot.ast.LocalDecl_c;
import polyglot.ast.Local_c;
import polyglot.ast.MethodDecl_c;
import polyglot.ast.New_c;
import polyglot.ast.Node;
import polyglot.ast.NullLit_c;
import polyglot.ast.PackageNode_c;
import polyglot.ast.Receiver;
import polyglot.ast.Return_c;
import polyglot.ast.Stmt;
import polyglot.ast.StringLit_c;
import polyglot.ast.SwitchBlock_c;
import polyglot.ast.Switch_c;
import polyglot.ast.Throw_c;
import polyglot.ast.Try;
import polyglot.ast.Try_c;
import polyglot.ast.TypeNode;
import polyglot.ast.Unary_c;
import polyglot.ast.While_c;
import polyglot.frontend.Compiler;
import x10.ast.AssignPropertyCall_c;
import x10.ast.Closure;
import x10.ast.ClosureCall_c;
import x10.ast.Closure_c;
import x10.ast.ForLoop;
import x10.ast.ForLoop_c;
import x10.ast.ParExpr_c;
import x10.ast.PropertyDecl_c;
import x10.ast.StmtSeq;
import x10.ast.SubtypeTest_c;
import x10.ast.Tuple_c;
import x10.ast.TypeDecl_c;
import x10.ast.X10Binary_c;
import x10.ast.X10Call;
import x10.ast.X10Call_c;
import x10.ast.X10CanonicalTypeNode_c;
import x10.ast.X10Cast_c;
import x10.ast.X10ClassDecl;
import x10.ast.X10ClassDecl_c;
import x10.ast.X10Formal;
import x10.ast.X10Instanceof_c;
import x10.ast.X10Loop;
import x10.ast.X10Loop_c;
import x10.ast.X10MethodDecl_c;
import x10.ast.X10New_c;
import x10.ast.X10Special_c;
import x10.ast.X10Unary_c;
import x10.constraint.XEQV;
import x10.constraint.XEquals;
import x10.constraint.XFailure;
import x10.constraint.XFormula;
import x10.constraint.XLit;
import x10.constraint.XLocal;
import x10.constraint.XTerm;
import x10.constraint.XVar;
import x10.extension.X10Ext;
import x10.types.ConstrainedType;
import x10.types.MethodInstance;
import x10.types.X10ClassDef;
import x10.types.X10ClassType;
import x10.types.X10MethodDef;
import polyglot.types.TypeSystem;
import x10.types.constraints.CConstraint;
import x10cpp.X10CPPCompilerOptions;
import x10cpp.postcompiler.CXXCommandBuilder;
import x10cpp.postcompiler.PostCompileProperties;
import x10cpp.postcompiler.PrecompiledLibrary;
import x10cpp.types.X10CPPContext_c;
import x10cpp.visit.Emitter;
import x10cpp.visit.MessagePassingCodeGenerator;
import x10cpp.visit.SharedVarsMethods;
import x10cpp.visit.X10CPPTranslator;
import x10cuda.ast.CUDAKernel;
import x10cuda.types.CUDAData;
import x10cuda.types.SharedMem;
import x10cuda.types.X10CUDAContext_c;
import polyglot.main.Options;
import polyglot.main.Report;
import polyglot.types.Context;
import polyglot.types.Flags;
import polyglot.types.Name;
import polyglot.types.QName;
import polyglot.types.SemanticException;
import polyglot.types.Type;
import polyglot.types.Types;
import polyglot.types.VarInstance;
import polyglot.util.ErrorInfo;
import polyglot.util.ErrorQueue;
import polyglot.util.SimpleCodeWriter;
import polyglot.visit.NodeVisitor;
import polyglot.visit.PrettyPrinter;
import polyglot.visit.Translator;
import x10.util.ClassifiedStream;
import x10.util.StreamWrapper;
/**
* Visitor that prettyprints an X10 AST to the CUDA subset of c++.
*
* @author Dave Cunningham
*/
public class CUDACodeGenerator extends MessagePassingCodeGenerator {
// Fully-qualified names of the annotations that drive CUDA split-compilation.
private static final String ANN_KERNEL = "x10.compiler.CUDA";
private static final String ANN_DIRECT_PARAMS = "x10.compiler.CUDADirectParams";

/**
 * Creates a code generator that produces the normal C++ output and, for
 * @CUDA-annotated blocks, additionally emits CUDA code into a parallel
 * .cu output stream.
 */
public CUDACodeGenerator(StreamWrapper sw, Translator tr) {
    super(sw, tr);
}
/**
 * Returns which @Native string variants to honour at the current point of
 * translation.  Inside CUDA output the CUDA-specific strings take priority,
 * with the C++ strings as a fallback; otherwise only C++ strings apply.
 */
protected String[] getCurrentNativeStrings() {
    if (generatingCUDACode()) {
        return new String[] { CUDA_NATIVE_STRING, CPP_NATIVE_STRING };
    }
    return new String[] { CPP_NATIVE_STRING };
}
/** The translation context, narrowed to the CUDA-aware subclass. */
private X10CUDAContext_c context() {
    return (X10CUDAContext_c) tr.context();
}

/** Shorthand for the translator's type system. */
private TypeSystem xts() {
    return tr.typeSystem();
}

// defer to CUDAContext.cudaStream()
/** Stream that receives generated __global__ kernels (the .cu file). */
private ClassifiedStream cudaKernelStream() {
    return context().cudaKernelStream(sw, tr.job());
}

/** Stream that receives generated __device__ functions in the .cu file. */
private ClassifiedStream cudaClassBodyStream() {
    return context().cudaClassBodyStream(sw, tr.job());
}

/** Whether we are currently emitting into the CUDA output rather than C++. */
private boolean generatingCUDACode() {
    return context().generatingCUDACode();
}

/**
 * Toggles CUDA-output mode.  Callers in this class set it and restore it
 * in a finally block (see visit(Block_c)).
 */
private void generatingCUDACode(boolean v) {
    context().generatingCUDACode(v);
}
// does the block have the annotation that denotes that it should be
// split-compiled to cuda?
private boolean blockIsKernel(Node n) {
    // By code-generation time, kernel blocks are represented by the
    // CUDAKernel AST node.
    return n instanceof CUDAKernel;
}

// type from name
/** Resolves a fully-qualified type name through the system resolver. */
private Type getType(String name) throws SemanticException {
    return xts().systemResolver().findOne(QName.make(name));
}

// does the block have the given annotation
private boolean nodeHasAnnotation(Node n, String ann) {
    X10Ext ext = (X10Ext) n.ext();
    try {
        return !ext.annotationMatching(getType(ann)).isEmpty();
    } catch (SemanticException e) {
        // Resolution of the compiler's own annotation types is not expected
        // to fail; treat it as an internal error.
        assert false : e;
        return false; // in case asserts are off
    }
}

// does the block have the annotation that denotes that it should be
// split-compiled to cuda?
private boolean nodeHasCUDAAnnotation(Node n) {
    return nodeHasAnnotation(n, ANN_KERNEL);
}
private String env = "__env";
/** Internal control-flow exception used to abandon a malformed kernel. */
@SuppressWarnings("serial")
private static class Complaint extends RuntimeException {
}

/** Like complainIfNot2, but prefixes the message with "@CUDA Expected:". */
private void complainIfNot(boolean cond, String exp, Node n, boolean except) {
    complainIfNot2(cond, "@CUDA Expected: " + exp, n, except);
}
/**
 * Reports {@code exp} as a semantic error at {@code n} when {@code cond}
 * does not hold.  If {@code except} is true, additionally aborts translation
 * of the enclosing kernel by throwing {@link Complaint}.
 */
private void complainIfNot2(boolean cond, String exp, Node n, boolean except) {
    if (cond) {
        return; // condition holds; nothing to report
    }
    ErrorQueue eq = tr.job().compiler().errorQueue();
    eq.enqueue(ErrorInfo.SEMANTIC_ERROR, exp, n.position());
    if (except) {
        throw new Complaint();
    }
}
/** Variant of complainIfNot that always aborts the kernel on failure. */
private void complainIfNot(boolean cond, String exp, Node n) {
    complainIfNot(cond, exp, n, true);
}

/** Variant of complainIfNot2 that always aborts the kernel on failure. */
private void complainIfNot2(boolean cond, String exp, Node n) {
    complainIfNot2(cond, exp, n, true);
}
/**
 * Returns the element type T when {@code typ} is Array[T] or
 * RemoteRef[Array[T]]; returns null for any other type.
 */
private Type arrayCargo(Type typ) {
    if (xts().isArray(typ)) {
        X10ClassType ctyp = (X10ClassType) typ.toClass();
        assert ctyp.typeArguments() != null && ctyp.typeArguments().size() == 1; // Array[T]
        return ctyp.typeArguments().get(0);
    }
    if (xts().isRemoteArray(typ)) {
        X10ClassType ctyp = (X10ClassType) typ.toClass();
        assert ctyp.typeArguments() != null && ctyp.typeArguments().size() == 1; // RemoteRef[Array[T]]
        // Unwrap one level: the RemoteRef's type argument is Array[T].
        // (Bug fix: previously the outer type was re-cast here instead of
        // its type argument, so remote arrays reported Array[T] rather than
        // T as their cargo, defeating the isIntArray/isFloatArray checks.)
        Type arrayType = ctyp.typeArguments().get(0);
        X10ClassType ctyp2 = (X10ClassType) arrayType;
        assert ctyp2.typeArguments() != null && ctyp2.typeArguments().size() == 1; // Array[T]
        return ctyp2.typeArguments().get(0);
    }
    return null; // not a recognised array type
}
/** True iff {@code typ} is an array (local or remote) of Float elements. */
private boolean isFloatArray(Type typ) {
    final Type elem = arrayCargo(typ);
    if (elem == null) {
        return false;
    }
    return elem.isFloat();
}

/** True iff {@code typ} is an array (local or remote) of Int elements. */
private boolean isIntArray(Type typ) {
    final Type elem = arrayCargo(typ);
    if (elem == null) {
        return false;
    }
    return elem.isInt();
}
/**
 * Prepends the CUDA spelling of type {@code t} to the declarator
 * {@code rest}, e.g. "x10_int x" or "x10aux::cuda_array&lt;x10_float&gt; a".
 * Int/float arrays map to the cuda_array wrapper; everything else uses the
 * regular C++ translation of the type.
 */
String prependCUDAType(Type t, String rest) {
    final String type;
    if (isIntArray(t)) {
        type = "x10aux::cuda_array<x10_int> ";
    } else if (isFloatArray(t)) {
        type = "x10aux::cuda_array<x10_float> ";
    } else {
        // Only consult the generic C++ translation when no CUDA-specific
        // spelling applies (the original computed it unconditionally and
        // discarded it in the array branches).
        type = Emitter.translateType(t, true) + " ";
    }
    return type + rest;
}
/**
 * Emits the CUDA translation of a kernel block into the .cu stream: the
 * environment struct, the extern "C" __global__ entry point, staging of
 * parameters into shared memory (pointer-env mode), constant/shared memory
 * initialisation, and finally the translated kernel body.  A marker comment
 * is left in the regular C++ output.
 */
void handleKernel(Stmt b) {
    CUDAKernel cuda_kernel = context().cudaKernel();
    //System.out.println("Here is the kernel: "+cuda_kernel);
    String kernel_name = context().wrappingClosure();
    // Marker in the C++ output so readers can find the matching .cu code.
    sw.write("/* block split-compiled to cuda as " + kernel_name + " */ ");
    ClassifiedStream out = cudaKernelStream();
    // environment (passed into kernel via pointer)
    generateStruct(kernel_name, out, context().kernelParams());
    out.forceNewline();
    // ptr == true unless @CUDADirectParams: env arrives as a pointer.
    boolean ptr = !cuda_kernel.directParams;
    // kernel (extern "C" to disable name-mangling which seems to be
    // inconsistent across cuda versions)
    out.write("extern \"C\" __global__ void " + kernel_name + "("
            + kernel_name + "_env " + (ptr ? "*" : "") + env + ") {");
    out.newline(4);
    out.begin(0);
    if (ptr) {
        // Pointer mode: declare one __shared__ local per captured variable,
        // have thread 0 copy the env into them, then synchronise.
        for (VarInstance<?> var : context().kernelParams()) {
            String name = var.name().toString();
            if (name.equals(THIS)) {
                name = SAVED_THIS;
            } else {
                name = Emitter.mangled_non_method_name(name);
            }
            out.write("__shared__ " + prependCUDAType(var.type(), name) + ";");
            out.newline();
        }
        out.write("if (threadIdx.x==0) {");
        out.newline(4);
        out.begin(0);
        for (VarInstance<?> var : context().kernelParams()) {
            String name = var.name().toString();
            if (name.equals(THIS)) {
                name = SAVED_THIS;
            } else {
                name = Emitter.mangled_non_method_name(name);
            }
            out.write(name + " = " + env + "->" + name + ";");
            out.newline();
        }
        out.end();
        out.newline();
        out.write("}");
        out.newline();
        out.write("__syncthreads(); // kernel parameters");
        out.newline();
        out.forceNewline();
    }
    // Constant-memory initialisation code.
    sw.pushCurrentStream(out);
    try {
        cuda_kernel.cmem.generateCodeConstantMemory(sw, tr);
    } finally {
        sw.popCurrentStream();
    }
    // Shared-memory initialisation code.
    sw.pushCurrentStream(out);
    try {
        cuda_kernel.shm.generateCodeSharedMem(sw, tr);
    } finally {
        sw.popCurrentStream();
    }
    out.write("__syncthreads(); // initialised shm"); out.newline();
    out.forceNewline();
    // body
    sw.pushCurrentStream(out);
    try {
        // Bring the kernel's implicit variables into scope before
        // translating the body.
        Context context = tr.context();
        cuda_kernel.blocksVar.addDecls(context);
        cuda_kernel.threadsVar.addDecls(context);
        if (cuda_kernel.autoBlocks != null) cuda_kernel.autoBlocks.addDecls(context);
        if (cuda_kernel.autoThreads != null) cuda_kernel.autoThreads.addDecls(context);
        cuda_kernel.cmem.addDecls(context);
        cuda_kernel.shm.addDecls(context);
        super.visitAppropriate(b);
    } finally {
        sw.popCurrentStream();
    }
    // end
    out.end();
    out.newline();
    out.write("} // " + kernel_name);
    out.newline();
    out.forceNewline();
}
/**
 * Emits "struct &lt;kernel_name&gt;_env { ... };" with one field per captured
 * variable, using the CUDA spelling of each variable's type.
 */
private void generateStruct(String kernel_name, SimpleCodeWriter out, ArrayList<VarInstance<?>> vars) {
    out.write("struct " + kernel_name + "_env {");
    out.newline(4);
    out.begin(0);
    // emitter.printDeclarationList(out, context(),
    // context().kernelParams());
    for (VarInstance<?> var : vars) {
        String name = var.name().toString();
        if (name.equals(THIS)) {
            // Captured 'this' is stored under the saved-this name.
            name = SAVED_THIS;
        } else {
            name = Emitter.mangled_non_method_name(name);
        }
        out.write(prependCUDAType(var.type(), name) + ";");
        out.newline();
    }
    out.end();
    out.newline();
    out.write("};");
    out.newline();
}
/**
 * Translates a block.  After the normal C++ translation, kernel blocks are
 * additionally split-compiled to CUDA; blocks inside a @CUDA function are
 * redirected into the CUDA class-body stream.
 */
public void visit(Block_c b) {
    super.visit(b);
    try {
        if (blockIsKernel(b)) {
            final CUDAKernel cuda_kernel = (CUDAKernel)b;
            complainIfNot2(!generatingCUDACode(), "@CUDA kernels may not be nested.", b);
            // Record the kernel in both the current and established contexts
            // and reset the captured-parameter lists for this kernel.
            context().cudaKernel(cuda_kernel);
            context().initKernelParams();
            context().established().cudaKernel(cuda_kernel);
            context().established().initKernelParams();
            generatingCUDACode(true);
            try {
                handleKernel(cuda_kernel.body());
            } finally {
                generatingCUDACode(false);
            }
        } else if (context().inCUDAFunction() && !generatingCUDACode()){
            // Body of a @CUDA-annotated method: emit it into the .cu file.
            generatingCUDACode(true);
            sw.pushCurrentStream(cudaClassBodyStream());
            try {
                super.visitAppropriate(b);
            } finally {
                sw.popCurrentStream();
                generatingCUDACode(false);
            }
        }
    } catch (Complaint e) {
        // don't bother doing anything more with this kernel,
        // just try and continue with the code after
        // (note that we've already done the regular CPU code)
    }
}
/**
 * Tracks the mangled name of the closure currently being translated so
 * kernels can be named after their wrapping closure, then defers to the
 * superclass for the actual translation.
 */
public void visit(Closure_c n) {
    context().establishClosure();
    String last = context().wrappingClosure();
    X10ClassType hostClassType = (X10ClassType) n.closureDef().typeContainer().get();
    String nextHostClassName = Emitter.translate_mangled_FQN(hostClassType.fullName().toString(), "_");
    String next = getClosureName(nextHostClassName, context().closureId() + 1);
    context().wrappingClosure(next);
    try {
        super.visit(n);
    } finally {
        // Restore the enclosing closure's name even on failure.
        context().wrappingClosure(last);
    }
}
/**
 * Emits the serialization-id definition for a closure.  For kernel closures
 * the deserializer registration additionally passes the CUDA deserialize and
 * post-kernel hooks; all other closures use the default superclass output.
 */
protected void generateClosureDeserializationIdDef(ClassifiedStream defn_s, String cnamet, List<Type> freeTypeParams, String hostClassName, Block block, int kind) {
    if (blockIsKernel(block)) {
        assert kind==1;
        boolean in_template_closure = freeTypeParams.size() > 0;
        if (in_template_closure)
            emitter.printTemplateSignature(freeTypeParams, defn_s);
        defn_s.write("const x10aux::serialization_id_t " + cnamet + "::"
                + SharedVarsMethods.SERIALIZATION_ID_FIELD + " = ");
        defn_s.newline(4);
        // Inside a template, dependent member references need "template ".
        String template = in_template_closure ? "template " : "";
        defn_s.write("x10aux::DeserializationDispatcher::addDeserializer("
                + cnamet + "::" + template
                + SharedVarsMethods.DESERIALIZE_METHOD
                + chevrons("x10::lang::Reference") + ", "+closure_kind_strs[kind]+", " + cnamet
                + "::" + template
                + SharedVarsMethods.DESERIALIZE_CUDA_METHOD + ", " + cnamet
                + "::" + template + SharedVarsMethods.POST_CUDA_METHOD
                + ", " + "\"" + hostClassName + "\", \"" + cnamet
                + "\");");
        defn_s.newline();
        defn_s.forceNewline();
    } else {
        super.generateClosureDeserializationIdDef(defn_s, cnamet,
                freeTypeParams, hostClassName, block, kind);
    }
}
/**
 * Emits the closure serialization functions.  For kernel closures this adds
 * two CUDA-specific hooks on top of the superclass output:
 * _post_cuda (frees GPU-side buffers after the kernel has run) and
 * _deserialize_cuda (unpacks the closure, uploads captured arrays to the
 * GPU, and computes the launch configuration).
 */
protected void generateClosureSerializationFunctions(X10CPPContext_c c, String cnamet, StreamWrapper inc, Block block, List<VarInstance<?>> env2, List<VarInstance<?>> refs) {
    super.generateClosureSerializationFunctions(c, cnamet, inc, block, env2, refs);
    if (blockIsKernel(block)) {
        CUDAKernel cuda_kernel = ((CUDAKernel)block);
        ArrayList<VarInstance<?>> env = context().kernelParams();
        if (env == null)
            return;
        // Environment struct shared by both hooks below.
        generateStruct("__cuda", inc, env);
        // ---- _post_cuda: free GPU-side resources after kernel completion.
        inc.write("static void "
                + SharedVarsMethods.POST_CUDA_METHOD
                + "("
                + DESERIALIZATION_BUFFER
                + " &__buf, x10aux::place __gpu, size_t __blocks, size_t __threads, size_t __shm, size_t argc, char *argv, size_t cmemc, char *cmemv) {");
        inc.newline(4);
        inc.begin(0);
        inc.write("__cuda_env __env;");
        inc.newline();
        if (!cuda_kernel.directParams) {
            // argv holds a pointer to a GPU-side copy of the env; free it.
            inc.write("x10_ulong __remote_env;");
            inc.newline();
            inc.write("::memcpy(&__remote_env, argv, sizeof (void*));");
            inc.newline();
            inc.write("x10aux::remote_free(__gpu, __remote_env);");
            inc.newline();
            // FIXME: any arrays referenced from the env are being leaked
            // here.
            // we need some way to record a copy of the contents of the
            // __env on the host
            // so that we do not have to fetch __remote_env back onto the
            // host
            // then we can free those arrays like in the else branch below
        } else {
            // argv holds the env by value; free the GPU copies of any
            // locally-sourced arrays (remote arrays already lived there).
            inc.write("::memcpy(&__env, argv, argc);");
            inc.newline();
            for (VarInstance<?> var : env) {
                Type t = var.type();
                String name = var.name().toString();
                if (isIntArray(t) || isFloatArray(t)) {
                    if (!xts().isRemoteArray(t)) {
                        inc.write("x10aux::remote_free(__gpu, (x10_ulong)(size_t)__env." + name + ".raw);");
                    }
                }
                inc.newline();
            }
        }
        inc.end();
        inc.newline();
        inc.write("}");
        inc.newline();
        inc.forceNewline();
        // ---- _deserialize_cuda: unpack closure and prepare kernel launch.
        inc.write("static void "+SharedVarsMethods.DESERIALIZE_CUDA_METHOD+"("+DESERIALIZATION_BUFFER+" &__buf, x10aux::place __gpu, size_t &__blocks, size_t &__threads, size_t &__shm, size_t &__argc, char *&__argv, size_t &__cmemc, char *&__cmemv) {");
        inc.newline(4);
        inc.begin(0);
        inc.write(make_ref(cnamet) + " __this = " + cnamet + "::" + DESERIALIZE_METHOD + "<" + cnamet + ">(__buf);");
        inc.newline();
        // One local per captured variable; autoBlocks/autoThreads are left
        // uninitialised here and filled in by blocks_threads() below.
        for (VarInstance<?> var : env) {
            Type t = var.type();
            String name = var.name().toString();
            inc.write(Emitter.translateType(t, true) + " " + name);
            if (cuda_kernel.autoBlocks != null
                    && var == cuda_kernel.autoBlocks.localDef().asInstance()) {
                inc.write(";");
            } else if (cuda_kernel.autoThreads != null
                    && var == cuda_kernel.autoThreads.localDef().asInstance()) {
                inc.write(";");
            } else {
                inc.write(" = __this->" + name + ";");
            }
            inc.newline();
        }
        // Shared-memory and constant-memory sizes for this launch.
        inc.write("__shm = ");
        inc.begin(0);
        cuda_kernel.shm.generateSize(inc, tr);
        inc.write(";");
        inc.end();
        inc.newline();
        inc.write("x10aux::check_shm_size(__shm);");
        inc.write("__cmemc = ");
        inc.begin(0);
        cuda_kernel.cmem.generateSize(inc, tr);
        inc.write(";");
        inc.end();
        inc.newline();
        inc.write("x10aux::check_cmem_size(__cmemc);");
        cuda_kernel.cmem.generateHostCodeConstantMemory(inc, tr);
        // this is probably broken when only one is given.
        if (cuda_kernel.autoBlocks != null
                && cuda_kernel.autoThreads != null) {
            String bname = cuda_kernel.autoBlocks.name().id().toString();
            String tname = cuda_kernel.autoThreads.name().id().toString();
            inc.write("x10aux::blocks_threads(__gpu, x10aux::DeserializationDispatcher::getMsgType(_serialization_id), __shm, "
                    + bname + ", " + tname + ");");
            inc.newline();
        }
        inc.write("__blocks = (");
        inc.begin(0);
        tr.print(null, cuda_kernel.blocks, inc);
        inc.write(")+1;");
        inc.end();
        inc.newline();
        inc.write("__threads = (");
        inc.begin(0);
        tr.print(null, cuda_kernel.threads, inc);
        inc.write(")+1;");
        inc.end();
        inc.newline();
        // Populate the env struct, uploading local arrays to the GPU.
        inc.write("__cuda_env __env;");
        inc.newline();
        for (VarInstance<?> var : env) {
            Type t = var.type();
            String name = var.name().toString();
            // String addr = "&(*"+name+")[0]"; // old way for rails
            String addr = "&" + name + "->FMGL(raw).raw()[0]";
            // String rr =
            // "x10aux::get_remote_ref_maybe_null("+name+".operator->())";
            // // old object model
            String rr = "&" + name + "->FMGL(rawData).raw()[0]";
            String ts = null;
            if (isIntArray(t)) {
                ts = "x10_int";
            } else if (isFloatArray(t)) {
                ts = "x10_float";
            }
            if (isIntArray(t) || isFloatArray(t)) {
                if (xts().isRemoteArray(t)) {
                    // Remote arrays already reside on the GPU; record the
                    // raw pointer and size directly.
                    inc.write("__env." + name + ".raw = (" + ts + "*)(size_t)" + rr + ";");
                    inc.newline();
                    inc.write("__env." + name + ".FMGL(size) = " + name + "->FMGL(size);");
                    inc.newline();
                } else {
                    // Local arrays: allocate GPU memory and copy contents up.
                    String len = name + "->FMGL(size)";
                    String sz = "sizeof(" + ts + ")*" + len;
                    inc.write("__env." + name + ".raw = (" + ts + "*)(size_t)x10aux::remote_alloc(__gpu, " + sz + ");");
                    inc.newline();
                    inc.write("__env." + name + ".FMGL(size) = " + len + ";");
                    inc.newline();
                    inc.write("x10aux::cuda_put(__gpu, (x10_ulong) __env." + name + ".raw, " + addr + ", " + sz + ");");
                }
            } else {
                inc.write("__env." + name + " = " + name + ";");
            }
            inc.newline();
        }
        // Marshal the env into argv (by value, or via a GPU-side pointer).
        if (env.isEmpty()) {
            inc.write("__argc = 0;");
            inc.end();
            inc.newline();
        } else {
            if (cuda_kernel.directParams) {
                inc.write("memcpy(__argv, &__env, sizeof(__env));");
                inc.newline();
                inc.write("__argc = sizeof(__env);");
                inc.end();
                inc.newline();
            } else {
                inc.write("x10_ulong __remote_env = x10aux::remote_alloc(__gpu, sizeof(__env));");
                inc.newline();
                inc.write("x10aux::cuda_put(__gpu, __remote_env, &__env, sizeof(__env));");
                inc.newline();
                inc.write("::memcpy(__argv, &__remote_env, sizeof (void*));");
                inc.newline();
                inc.write("__argc = sizeof(void*);");
                inc.end();
                inc.newline();
            }
        }
        inc.write("}");
        inc.newline();
        inc.forceNewline();
    }
}
/** Rejects 'new' inside kernel code (error reported; translation continues). */
public void visit(New_c n) {
    complainIfNot2(!generatingCUDACode(), "New not allowed in @CUDA code.", n, false);
    super.visit(n);
}

/** Rejects 'assert' inside kernel code (it can throw). */
@Override
public void visit(Assert_c n) {
    complainIfNot2(!generatingCUDACode(),
            "Throwing exceptions not allowed in @CUDA code.", n, false);
    super.visit(n);
}

/** Rejects catch clauses inside kernel code. */
@Override
public void visit(Catch_c n) {
    complainIfNot2(!generatingCUDACode(),
            "Catching exceptions not allowed in @CUDA code.", n, false);
    super.visit(n);
}

/** Rejects closure calls inside kernel code. */
@Override
public void visit(ClosureCall_c n) {
    complainIfNot2(!generatingCUDACode(),
            "Closure calls not allowed in @CUDA code.", n, false);
    super.visit(n);
}
/**
 * Translates a local variable reference.  Inside a kernel body, the special
 * kernel variables map to CUDA builtins (blockIdx.x / threadIdx.x), the shm
 * iteration variable maps to __i, and kernel parameters are either inlined
 * as literals (when their constrained type pins their value) or read from
 * the kernel environment.  Outside kernel bodies, the autoBlocks/autoThreads
 * locals keep their names so _deserialize_cuda can reference them.
 */
@Override
public void visit(Local_c n) {
    CUDAKernel cuda_kernel = context().cudaKernel();
    if (generatingCUDACode() && !inCUDAFunction()) {
        ClassifiedStream out = cudaKernelStream();
        Name ln = n.name().id();
        // HACK: Use localDef().name(), rather than name().id(), because the vars will have been renamed
        if (ln == cuda_kernel.blocksVar.localDef().name()) {
            out.write("blockIdx.x");
        } else if (ln == cuda_kernel.threadsVar.localDef().name()) {
            out.write("threadIdx.x");
        } else if (context().shmIterationVar()!=null && ln == context().shmIterationVar().localDef().name()) {
            out.write("__i");
        } else if (cuda_kernel.shm.has(ln)) {
            out.write(ln.toString());
        } else if (context().isKernelParam(ln)) {
            // it seems the post-compiler is not good at hoisting these
            // accesses so we do it ourselves
            String literal = constrainedToLiteral(n);
            if (literal!=null) {
                //System.out.println("Optimised kernel param: "+n+" --> "+literal);
                out.write(literal);
            } else {
                if (cuda_kernel.directParams) {
                    // Direct params are read from the __env struct parameter.
                    out.write(env + "." + ln);
                } else {
                    // Pointer params were staged into __shared__ locals.
                    out.write(ln.toString());
                }
            }
        } else {
            String literal = constrainedToLiteral(n);
            if (literal!=null) {
                //System.out.println("Optimised local: "+n+" --> "+literal);
                out.write(literal);
            } else {
                super.visit(n);
            }
        }
    } else {
        // we end up here in the _deserialize_cuda function because
        // generatingKernel() is false
        Name ln = n.name().id();
        if (cuda_kernel == null) {
            // not even in _deserialize_cuda, just arbitrary host code
            super.visit(n);
        } else if (cuda_kernel.autoBlocks != null && ln == cuda_kernel.autoBlocks.name().id()) {
            sw.write(cuda_kernel.autoBlocks.name().id().toString());
        } else if (cuda_kernel.autoThreads != null && ln == cuda_kernel.autoThreads.name().id()) {
            sw.write(cuda_kernel.autoThreads.name().id().toString());
        } else {
            super.visit(n);
        }
    }
}
/** Whether we are inside the body of a @CUDA-annotated method. */
private boolean inCUDAFunction() {
    return context().inCUDAFunction();
}
/**
 * If the local is final and its constrained type pins its value to a
 * literal, returns that literal's spelling (prefixed with an explanatory
 * C++ comment); otherwise returns null.  Used to hoist/inline kernel
 * parameter accesses the post-compiler would not optimise itself.
 */
private String constrainedToLiteral(Local_c n) {
    if (!n.localInstance().def().flags().isFinal()) return null;
    if (!(n.type() instanceof ConstrainedType)) return null;
    ConstrainedType ct = (ConstrainedType) n.type();
    CConstraint cc = ct.getRealXClause();
    XVar local_self = Types.selfVarBinding(cc);
    if (local_self==null) return null;
    if (local_self instanceof XLit) return "/*"+n+":"+n.type()+"*/"+local_self.toString();
    // resolve to another variable, keep going
    CConstraint projected= context().constraintProjection(cc);
    if (! projected.consistent())
        return null;
    XVar closed_self = projected.bindingForVar(local_self);
    if (closed_self==null) return null;
    if (closed_self instanceof XLit) return "/*"+n+":"+n.type()+"*/"+closed_self.toString();
    return null;
}
/** Rejects 'throw' inside kernel code (error reported; translation continues). */
@Override
public void visit(Throw_c n) {
    complainIfNot2(!generatingCUDACode(),
            "Throwing exceptions not allowed in @CUDA code.", n, false);
    super.visit(n);
}

/** Rejects try/catch inside kernel code (error reported; translation continues). */
@Override
public void visit(Try_c n) {
    complainIfNot2(!generatingCUDACode(),
            "Catching exceptions not allowed in @CUDA code.", n, false);
    super.visit(n);
}
/**
 * Tracks the class currently being translated (consulted by the
 * generic-class check in visit(X10MethodDecl_c)).
 */
@Override
public void visit(X10ClassDecl_c n) {
    // NOTE(review): 'v' saves the previous firstKernel flag but is never
    // restored afterwards (unlike wrappingClass below) -- confirm whether
    // context().firstKernel(v) was intended in the finally block.
    boolean v = context().firstKernel();
    context().firstKernel(true);
    X10ClassDef lastHostClass = context().wrappingClass();
    context().wrappingClass(n.classDef());
    try {
        super.visit(n);
    } finally {
        context().wrappingClass(lastHostClass);
    }
}
/**
 * Handles @CUDA-annotated methods: emits a "__device__" function header into
 * the CUDA class-body stream before the normal translation.  Only static
 * methods of non-generic classes are supported.
 */
@Override
public void visit(X10MethodDecl_c n) {
    if (nodeHasCUDAAnnotation(n)) {
        ClassifiedStream b = cudaClassBodyStream();
        Flags flags = n.flags().flags();
        X10MethodDef def = n.methodDef();
        MethodInstance mi = def.asInstance();
        complainIfNot2(flags.isStatic(), "Currently we only support static @CUDA functions.", n);
        complainIfNot2(context().wrappingClass().typeParameters().size() == 0, "Currently @CUDA functions cannot be in generic classes.", n);
        b.writeln("// "+n.toString());
        b.write("__device__ ");
        sw.pushCurrentStream(b);
        try {
            emitter.printHeader(n, sw, tr, mi.name().toString(), mi.returnType(), false, false);
        } finally {
            sw.popCurrentStream();
        }
        // NOTE(review): inCUDAFunction is set here but not visibly reset to
        // false in this file -- presumably cleared by context scoping;
        // confirm against X10CUDAContext_c.
        context().inCUDAFunction(true);
    }
    super.visit(n);
}
/** Rejects 'instanceof' inside kernel code (no runtime type info on GPU output). */
@Override
public void visit(X10Instanceof_c n) {
    complainIfNot2(!generatingCUDACode(),
            "Runtime types not available in @CUDA code.", n, false);
    super.visit(n);
}
/**
 * Post-compilation entry point: runs nvcc over any generated .cu files,
 * once per configured GPU architecture.  Skipped when no post-compiler is
 * configured or output goes to stdout.
 */
public static boolean postCompile(X10CPPCompilerOptions options, Compiler compiler, ErrorQueue eq) {
    if (options.post_compiler != null && !options.output_stdout) {
        CXXCommandBuilder ccb = CXXCommandBuilder.getCXXCommandBuilder(
                options,
                X10CPPTranslator.loadX10RTProperties(options),
                X10CPPTranslator.loadSharedLibProperties(),
                eq);
        for (String arch : ccb.getCUDAArchitectures()) {
            if (!postCompile(options, compiler, eq, arch, ccb)) return false;
        }
    }
    return true;
}
/**
 * Compiles every generated .cu compilation unit to &lt;file&gt;_&lt;arch&gt;.cubin
 * with nvcc.  A failing/missing nvcc only yields a warning (and overall
 * success): the program still runs, just without GPU acceleration.
 */
private static boolean postCompile(X10CPPCompilerOptions options, Compiler compiler, ErrorQueue eq, String arch, CXXCommandBuilder ccb) {
    Collection<String> compilationUnits = options.compilationUnits();
    for (String f : compilationUnits) {
        if (f.endsWith(".cu")) {
            ArrayList<String> nvccCmd = new ArrayList<String>();
            nvccCmd.add(ccb.getCUDAPostCompiler());
            for (String s : ccb.getCUDAPreFileArgs()) {
                nvccCmd.add(s);
            }
            nvccCmd.add("-arch="+arch);
            nvccCmd.add(f);
            nvccCmd.add("-o");
            // Strip the trailing ".cu" (3 chars) before appending _<arch>.cubin.
            nvccCmd.add(f.substring(0,f.length() - 3) + "_" + arch + ".cubin");
            if (!X10CPPTranslator.doPostCompile(options, eq, compilationUnits, nvccCmd.toArray(new String[nvccCmd.size()]), true)) {
                eq.enqueue(ErrorInfo.WARNING, "Found @CUDA annotation, but not compiling for GPU because nvcc could not be run (check your $PATH).");
                return true;
            }
        }
    }
    return true;
}
} // end of CUDACodeGenerator
// vim:tabstop=4:shiftwidth=4:expandtab