Welcome to Hexo! This is your very first post. Check documentation for more info. If you get any problems when using Hexo, you can find the answer in troubleshooting or you can ask me on GitHub.
voidevent_callback(struct bufferevent *bev, short events, void *ctx) { structinfo *inf = ctx; structevbuffer *input = bufferevent_get_input(bev); int finished = 0;
if (events & BEV_EVENT_EOF) { size_t len = evbuffer_get_length(input); printf("Got a close from %s. We drained %lu bytes from it, " "and have %lu left.\n", inf->name, (unsignedlong)inf->total_drained, (unsignedlong)len); finished = 1; } if (events & BEV_EVENT_ERROR) { printf("Got an error from %s: %s\n", inf->name, evutil_socket_error_to_string(EVUTIL_SOCKET_ERROR())); finished = 1; } if (finished) { free(ctx); bufferevent_free(bev); } }
# Common CUDA compile options (enable as needed).
# Fix: in the collapsed one-line form, everything after `#` was a comment,
# so the target_compile_options() call was never seen by CMake.
target_compile_options(main PUBLIC
    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
    $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda>
)
// AXPY in place: x[i] = a * x[i] + y[i] for every i in [0, n).
// Grid-stride form, so any <<<blocks, threads>>> configuration (including a
// single block) covers the whole range.
__global__ void axpy(float* x, const float* y, float a, int n) {
    const int stride = blockDim.x * gridDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
        x[idx] = a * x[idx] + y[idx];
    }
}
// Demo driver: build host data with thrust, copy to device, run axpy, sync.
// NOTE(review): main's closing brace/return is outside this excerpt, and
// there is no cudaGetLastError() after the launch — confirm in the full file.
int main() { int n = 1 << 20; float a = 3.14f;
// Host data: hx is a 0.001f ramp, hy is all ones.
thrust::host_vector<float> hx(n), hy(n); for (int i = 0; i < n; ++i) { hx[i] = i * 0.001f; hy[i] = 1.0f; }
// Copy-constructing a device_vector from a host_vector performs the H2D copy.
thrust::device_vector<float> dx = hx; thrust::device_vector<float> dy = hy;
// Fixed 256x256 launch; axpy's grid-stride loop still covers all n elements.
axpy<<<256, 256>>>(thrust::raw_pointer_cast(dx.data()), thrust::raw_pointer_cast(dy.data()), a, n); cudaDeviceSynchronize();
// Emulates atomicAdd(int*, int) with an atomicCAS retry loop; returns the
// value held at *dst before this thread's addition (same contract as
// atomicAdd). Pedagogical only — hardware atomicAdd is preferable for int.
// Invariant: `old` is the most recently observed value of *dst; the CAS
// succeeds (and the loop exits) exactly when *dst still equals `assumed`.
__device__ __forceinline__ int my_atomic_add(int* dst, int val) { int old = *dst; int assumed; do { assumed = old; old = atomicCAS(dst, assumed, assumed + val); } while (assumed != old); return old; }
8.3 朴素并行求和(全局原子累加)
// Naive parallel sum: every element is folded into *sum with its own global
// atomicAdd. Correct but heavily contended on one address — kept deliberately
// simple as the baseline for the per-block reduction variant below.
__global__ void parallel_sum(int* sum, const int* arr, int n) {
    const int step = gridDim.x * blockDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    while (idx < n) {
        atomicAdd(sum, arr[idx]);
        idx += step;
    }
}
9. 线程块与共享内存(Shared Memory)
9.1 核心概念
__shared__:块内共享内存(一个 block 内所有线程可见)
__syncthreads():块内同步屏障(必须保证同一 block 的线程都能到达)
共享内存常用于:
块内复用数据(减少 global memory 访问)
块内归约(reduce)
tile-based 计算(矩阵乘、卷积、图像算子)
9.2 块内归约:每块只做一次全局原子
#include <cuda_runtime.h>
// Per-block tree reduction in shared memory, then one global atomic per block.
//
// Launch: reduce_sum<<<blocks, threads, threads * sizeof(int)>>>(arr, n, out)
//   - dynamic shared memory: blockDim.x * sizeof(int)
//   - precondition: blockDim.x is a power of two (the halving loop assumes it)
//   - *out must be zero-initialized by the caller before the launch
//
// Fix: in the collapsed one-line paste, the `// 动态共享内存` comment
// swallowed the rest of the statement list; multi-line form restored
// (logic unchanged).
__global__ void reduce_sum(const int* arr, int n, int* out) {
    extern __shared__ int sdata[];  // dynamic shared memory, one int per thread
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + tid;

    // Stage one element per thread (0 past the end of arr), then barrier
    // before any cross-thread reads of sdata.
    sdata[tid] = (i < n) ? arr[i] : 0;
    __syncthreads();

    // Pairwise halving: active thread count drops by 2x each pass.
    // __syncthreads() is outside the `if` so every thread reaches it.
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();
    }

    // One atomic per block instead of one per element.
    if (tid == 0) atomicAdd(out, sdata[0]);
}
启动方式(动态共享内存大小):
// Ceil-divide so the tail partial block is covered; the third launch argument
// is the dynamic shared-memory size reduce_sum expects (one int per thread).
int threads = 256; int blocks = (n + threads - 1) / threads; reduce_sum<<<blocks, threads, threads * sizeof(int)>>>(arr, n, out);
// 3x3 box filter (mean) over an HxW image using a shared-memory tile with a
// 1-pixel halo. Out-of-image neighbors are treated as zero.
//
// Launch: 2D grid/block; dynamic shared memory must be
//   (blockDim.x + 2) * (blockDim.y + 2) * sizeof(float).
// Assumes the grid overshoots the image by less than one full block
// (same assumption the original edge loads made).
__global__ void mean3x3(const float* in, float* out, int H, int W) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    const int Bx = blockDim.x;
    const int By = blockDim.y;
    extern __shared__ float tile[];

    // Row-major index into the (By + 2) x (Bx + 2) tile.
    auto t = [&](int ty, int tx) -> float& { return tile[ty * (Bx + 2) + tx]; };

    // +1 shifts into the tile interior, leaving room for the halo ring.
    const int tx = threadIdx.x + 1;
    const int ty = threadIdx.y + 1;

    // Main cell (zero outside the image).
    float v = 0.f;
    if (x < W && y < H) v = in[y * W + x];
    t(ty, tx) = v;

    // Halo edges, loaded by the border threads of the block (zero padding).
    if (threadIdx.x == 0) {
        t(ty, 0) = (x > 0 && y < H) ? in[y * W + (x - 1)] : 0.f;
    }
    if (threadIdx.x == Bx - 1) {
        t(ty, Bx + 1) = (x + 1 < W && y < H) ? in[y * W + (x + 1)] : 0.f;
    }
    if (threadIdx.y == 0) {
        t(0, tx) = (y > 0 && x < W) ? in[(y - 1) * W + x] : 0.f;
    }
    if (threadIdx.y == By - 1) {
        t(By + 1, tx) = (y + 1 < H && x < W) ? in[(y + 1) * W + x] : 0.f;
    }

    // BUG FIX: the four halo corner cells were never written, so the 3x3 sum
    // read uninitialized shared memory at the block corners. Load them here.
    if (threadIdx.x == 0 && threadIdx.y == 0) {
        t(0, 0) = (x > 0 && y > 0) ? in[(y - 1) * W + (x - 1)] : 0.f;
    }
    if (threadIdx.x == Bx - 1 && threadIdx.y == 0) {
        t(0, Bx + 1) = (x + 1 < W && y > 0) ? in[(y - 1) * W + (x + 1)] : 0.f;
    }
    if (threadIdx.x == 0 && threadIdx.y == By - 1) {
        t(By + 1, 0) = (x > 0 && y + 1 < H) ? in[(y + 1) * W + (x - 1)] : 0.f;
    }
    if (threadIdx.x == Bx - 1 && threadIdx.y == By - 1) {
        t(By + 1, Bx + 1) = (x + 1 < W && y + 1 < H) ? in[(y + 1) * W + (x + 1)] : 0.f;
    }

    // BUG FIX: this barrier was missing — without it, a thread could read
    // tile cells that neighboring threads had not written yet (shared-memory
    // race, nondeterministic output).
    __syncthreads();

    if (x < W && y < H) {
        float sum = 0.f;
        sum += t(ty - 1, tx - 1); sum += t(ty - 1, tx); sum += t(ty - 1, tx + 1);
        sum += t(ty,     tx - 1); sum += t(ty,     tx); sum += t(ty,     tx + 1);
        sum += t(ty + 1, tx - 1); sum += t(ty + 1, tx); sum += t(ty + 1, tx + 1);
        out[y * W + x] = sum / 9.f;
    }
}
// In-place scaling: multiplies every element of x[0..n) by a.
// Grid-stride loop, so the kernel is correct for any launch configuration.
__global__ void scale(float* x, int n, float a) {
    const int step = blockDim.x * gridDim.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    for (; i < n; i += step) {
        x[i] *= a;
    }
}
// Pinned-memory demo driver (excerpt — the function continues past this
// chunk; CUDA_CHECK is defined elsewhere in the post).
int main() { const int n = 1 << 20; const size_t bytes = n * sizeof(float);
// Page-locked (pinned) host buffer: required for truly asynchronous
// cudaMemcpyAsync and gives higher H2D/D2H bandwidth than pageable memory.
float* h = nullptr; CUDA_CHECK(cudaMallocHost(&h, bytes));
// Device buffer of the same size.
float* d = nullptr; CUDA_CHECK(cudaMalloc(&d, bytes));
// Fill the host buffer with ones.
for (int i = 0; i < n; ++i) h[i] = 1.0f;