99 static __device__ __forceinline__
void transform(InIt beg, InIt end, OutIt out, UnOp op)
103 OutIt o = out + (t - beg);
105 for(; t < end; t += STRIDE, o += STRIDE)
110 static __device__ __forceinline__
void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
115 OutIt o = out + (t1 - beg1);
117 for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
122 static __device__ __forceinline__
void reduce(
volatile T* buffer, BinOp op)
127 if (CTA_SIZE >= 1024) {
if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
128 if (CTA_SIZE >= 512) {
if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
129 if (CTA_SIZE >= 256) {
if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
130 if (CTA_SIZE >= 128) {
if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
134 if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
135 if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
136 if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
137 if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
138 if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
139 if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
144 static __device__ __forceinline__ T
reduce(
volatile T* buffer, T init, BinOp op)
147 T val = buffer[tid] = init;
150 if (CTA_SIZE >= 1024) {
if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
151 if (CTA_SIZE >= 512) {
if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
152 if (CTA_SIZE >= 256) {
if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
153 if (CTA_SIZE >= 128) {
if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
157 if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
158 if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
159 if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
160 if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
161 if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
162 if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
169 static __device__ __forceinline__
void reduce_n(T* data,
unsigned int n, BinOp op)
176 for (
unsigned int i = sft + ftid; i < n; i += sft)
177 data[ftid] = op(data[ftid], data[i]);
186 unsigned int half = n/2;
189 data[ftid] = op(data[ftid], data[n - ftid - 1]);