3 "// Generated by LLVM NVPTX Back-End\n"
10 " // .globl Overlay_Cuda\n"
12 ".visible .entry Overlay_Cuda(\n"
13 " .param .u32 Overlay_Cuda_param_0,\n"
14 " .param .u32 Overlay_Cuda_param_1,\n"
15 " .param .u64 Overlay_Cuda_param_2,\n"
16 " .param .u32 Overlay_Cuda_param_3,\n"
17 " .param .u64 Overlay_Cuda_param_4,\n"
18 " .param .u32 Overlay_Cuda_param_5,\n"
19 " .param .u32 Overlay_Cuda_param_6,\n"
20 " .param .u32 Overlay_Cuda_param_7,\n"
21 " .param .u64 Overlay_Cuda_param_8,\n"
22 " .param .u32 Overlay_Cuda_param_9,\n"
23 " .param .u32 Overlay_Cuda_param_10,\n"
24 " .param .u32 Overlay_Cuda_param_11\n"
27 " .reg .pred %p<9>;\n"
28 " .reg .b16 %rs<5>;\n"
29 " .reg .f32 %f<12>;\n"
30 " .reg .b32 %r<27>;\n"
31 " .reg .b64 %rd<13>;\n"
33 " ld.param.u32 %r6, [Overlay_Cuda_param_1];\n"
34 " ld.param.u32 %r5, [Overlay_Cuda_param_0];\n"
35 " // begin inline asm\n"
36 " mov.u32 %r12, %ctaid.x;\n"
37 " // end inline asm\n"
38 " // begin inline asm\n"
39 " mov.u32 %r13, %ctaid.y;\n"
40 " // end inline asm\n"
41 " ld.param.u32 %r18, [Overlay_Cuda_param_6];\n"
42 " // begin inline asm\n"
43 " mov.u32 %r14, %ntid.x;\n"
44 " // end inline asm\n"
45 " ld.param.u32 %r19, [Overlay_Cuda_param_7];\n"
46 " // begin inline asm\n"
47 " mov.u32 %r15, %ntid.y;\n"
48 " // end inline asm\n"
49 " // begin inline asm\n"
50 " mov.u32 %r16, %tid.x;\n"
51 " // end inline asm\n"
52 " // begin inline asm\n"
53 " mov.u32 %r17, %tid.y;\n"
54 " // end inline asm\n"
55 " mad.lo.s32 %r1, %r14, %r12, %r16;\n"
56 " mad.lo.s32 %r2, %r15, %r13, %r17;\n"
57 " add.s32 %r20, %r18, %r5;\n"
58 " setp.ge.s32 %p1, %r1, %r20;\n"
59 " add.s32 %r21, %r19, %r6;\n"
60 " setp.ge.s32 %p2, %r2, %r21;\n"
61 " or.pred %p3, %p1, %p2;\n"
62 " setp.lt.s32 %p4, %r1, %r5;\n"
63 " or.pred %p5, %p4, %p3;\n"
64 " setp.lt.s32 %p6, %r2, %r6;\n"
65 " or.pred %p7, %p6, %p5;\n"
67 " ld.param.u32 %r9, [Overlay_Cuda_param_9];\n"
68 " ld.param.u32 %r8, [Overlay_Cuda_param_5];\n"
69 " ld.param.u32 %r7, [Overlay_Cuda_param_3];\n"
70 " ld.param.u64 %rd6, [Overlay_Cuda_param_4];\n"
71 " cvta.to.global.u64 %rd2, %rd6;\n"
72 " ld.param.u64 %rd7, [Overlay_Cuda_param_2];\n"
73 " cvta.to.global.u64 %rd3, %rd7;\n"
74 " sub.s32 %r3, %r1, %r5;\n"
75 " sub.s32 %r4, %r2, %r6;\n"
76 " setp.eq.s32 %p8, %r9, 0;\n"
77 " mov.f32 %f3, 0f3F800000;\n"
78 " mov.f32 %f11, %f3;\n"
80 " ld.param.u32 %r11, [Overlay_Cuda_param_11];\n"
81 " ld.param.u32 %r10, [Overlay_Cuda_param_10];\n"
82 " ld.param.u64 %rd5, [Overlay_Cuda_param_8];\n"
83 " cvta.to.global.u64 %rd1, %rd5;\n"
84 " mul.lo.s32 %r22, %r3, %r10;\n"
85 " mul.lo.s32 %r23, %r11, %r9;\n"
86 " mad.lo.s32 %r24, %r23, %r4, %r22;\n"
87 " cvt.s64.s32 %rd8, %r24;\n"
88 " add.s64 %rd4, %rd1, %rd8;\n"
89 " ld.global.u8 %rs1, [%rd4];\n"
90 " cvt.rn.f32.u16 %f4, %rs1;\n"
91 " div.rn.f32 %f11, %f4, 0f437F0000;\n"
93 " mad.lo.s32 %r25, %r4, %r8, %r3;\n"
94 " cvt.s64.s32 %rd9, %r25;\n"
95 " add.s64 %rd10, %rd2, %rd9;\n"
96 " ld.global.u8 %rs2, [%rd10];\n"
97 " cvt.rn.f32.u16 %f5, %rs2;\n"
98 " sub.f32 %f7, %f3, %f11;\n"
99 " mad.lo.s32 %r26, %r2, %r7, %r1;\n"
100 " cvt.s64.s32 %rd11, %r26;\n"
101 " add.s64 %rd12, %rd3, %rd11;\n"
102 " ld.global.u8 %rs3, [%rd12];\n"
103 " cvt.rn.f32.u16 %f8, %rs3;\n"
104 " mul.f32 %f9, %f7, %f8;\n"
105 " fma.rn.f32 %f10, %f11, %f5, %f9;\n"
106 " cvt.rzi.u16.f32 %rs4, %f10;\n"
107 " st.global.u8 [%rd12], %rs4;\n"
const char vf_overlay_cuda_ptx[]