并行算法与GPU编程备忘录

发布时间：2023-08-08 09:30

文章目录

- Pthread多线程
- - 图像翻转
  - 生产者消费者
  - - 信号量
    - 条件变量
  - 计算圆周率
  - - 计算框架
    - 3种计算方法
  - 计算e值
- CUDA编程
- - 矢量求和
  - - 基本框架
    - 核函数
  - 矩阵转置
  - - 基本框架
    - 核函数
  - 规约计算
  - - 基本框架
    - 核函数
  - 矩阵相乘
  - - 基本框架
    - 核函数

Pthread多线程

图像翻转

因为代码太多了，因此只放线程函数核心代码

int n;  // 水平翻转时 n为ip.Vpixels 垂直翻转时为ip.Hpixels  
int rank = 1;  // 当前为第1个线程
int cur_n = n / thread_count;  //当前线程所需要处理的行/列数
int start = rank * cur_n;  // 从哪一行/列开始处理
int end = start + cur_n;  // 到哪一行/列处理完毕
// 垂直翻转(上下)
for(int col = start;col<end;col+=3){
    for(int row = 0;row<ip.Vpixels/2;row++){
        swap( img[row][col], img[ip.Vpixels - row -1][col] );
        swap( img[row][col+1], img[ip.Vpixels - row -1][col+1] );
        swap( img[row][col+2], img[ip.Vpixels - row -1][col+2] );
    }
}

// 水平翻转(左右)
for(row = start;row<end;row++){
    for(col = 0;col<ip.Hpixels*3/2;col+=3){
        swap( img[row][col], img[row][ip.Hpixel*3 - col -3] );
        swap( img[row][col+1], img[row][ip.Hpixel*3 - col -2] );
        swap( img[row][col+2], img[row][ip.Hpixel*3 - col -1] );

    }
}

生产者消费者

信号量

int t = 0;  // 消费者减t，生产者加t
sem_t empty,full;  // 信号量
pthread_mutex_t mutex;  // 互斥量

int main(){
    pthread_t producer;
    pthread_t consumer;
    // 初始化互斥量
    pthread_mutex_init(&mutex,NULL);  
    // 初始化信号量
    // 第2个参数代表几个进程共享该互斥量，0代表多线程共享
    sem_init(empty,0,10);  // empty信号量为10
    sem_init(full,0,0);  // full信号量为0
    // 生产者消费者函数内循环之间的时间间隔
    int producer_time = 2;
    int consumer_time = 3;
    // 创建线程
    pthread_create(
        &producer,  // 指明哪个线程
        NULL,
        producer_function,  // 线程执行什么函数
        &producer_time  // 传给函数的参数
    );
    pthread_create(
        &consumer,
        NULL,
        consumer_function,
        &consumer_time
    );
    // 等待线程结束
    pthread_join(producer,NULL);
    pthread_join(consumer,NULL);
  	// 销毁互斥量
    pthread_mutex_destroy(&mutex);
    return 0;
}

void* producer_function(void *time){
    int time = *(int*) time;  // 时间间隔
    while(true){
        sem_wait(&empty);  // empty信号量减1
        pthread_mutex_lock(&mutex);  // 加锁
        t++;  // 生产
        printf(\"生产\");  // 随便写个printf意思意思
        pthread_mutex_unlock(&mutex);  // 解锁
        sem_post(&full);  //full信号量加1
        sleep(time);
    }
}

void* consumer_function(void *time){
    int time = *(int*) time;
    while(true){
        sem_wait(&full);
        pthread_mutex_lock(&mutex);
        t--;
        printf(\"消费\");
        pthread_mutex_unlock(&mutex);
        sem_post(&empty);
        sleep(time);
    }
}

条件变量

int t = 0;  // 消费者减t，生产者加t
pthread_mutex_t mutex;  // 互斥量
pthread_cond_t cond_full;  // 非满条件变量
pthread_cond_t cond_empty;  // 非空条件变量

int main(){
    pthread_t producer;
    pthread_t consumer;
    // 初始化互斥量
    pthread_mutex_init(&mutex,NULL);  
    // 初始化条件变量
    pthread_cond_init(&cond_full,NULL);
    pthread_cond_init(&cond_empty,NULL);
    // 创建线程
    pthread_create(
        &producer,  // 指明哪个线程
        NULL,
        producer_function,  // 线程执行什么函数
        NULL  // 传给函数的参数
    );
    pthread_create(
        &consumer,
        NULL,
        consumer_function,
        NULL
    );
    // 等待线程结束
    pthread_join(producer,NULL);
    pthread_join(consumer,NULL);
  	// 销毁
    pthread_mutex_destroy(&mutex);
    pthread_cond_destroy(&cond_empty);
    pthread_cond_destroy(&cond_full);
    return 0;
}

void* producer_function(){
    while(true){
        pthread_mutex_lock(&mutex);  // 加锁
        // 当生产10个，认为满了
        while(t>10){
            // 将mutex解锁
            // 当收到pthread_cond_signal(&cond_full)时,mutex加锁并向下运行
            pthread_cond_wait(&cond_full,&mutex);
        }
        t++;  // 生产
        printf(\"生产\");  // 随便写个printf意思意思
        pthread_mutex_unlock(&mutex);  // 解锁
        pthread_cond_signal(&cond_empty);
        sleep(5);
    }
}

void* consumer_function(){
    while(true){
        pthread_mutex_lock(&mutex);
        while(t==0){
            pthread_cond_wait(&cond_empty,&mutex);
        }
        t--;
        printf(\"消费\");
        pthread_mutex_unlock(&mutex);
        pthread_cond_signal(&cond_full);
        sleep(5);
    }
}

计算圆周率

计算框架

// 全局变量
long long n = 100;  // 该数越大，计算精度越高
int thread_count = 32;  // 线程数目
double sum = 0;  // 结果
pthread_mutex_t mutex;  // 互斥量(为了锁住sum)

//每个线程要执行的函数
void* thread_function(void* rank){
    int rank = *(int *) rank;  // 第rank个线程
    double my_sum = 0;  // 该线程的计算结果
    long long cur_n = n / thread_count;  // 处理多少数据
    long long first = cur_n * rank;  // 第一个数据的索引
    long long last = first + cur_n;  //最后一个数据的索引
  
  
  	/***************************
  	此处为计算圆周率的核心代码
  	圆周率的计算有多种方法，见下方“3种计算方法”
  	将其中任何一种方法套用在此处即可
  	****************************/

    // 加锁累加，解锁
    pthread_mutex_lock(&mutex);
    sum += my_sum;
    pthread_mutex_unlock(&mutex);

    return NULL;
}

int main(){
    // 创建线程ID
    pthread_t thread_ID[thread_count];  
  	// 初始化互斥量
    pthread_mutex_init(&mutex,NULL);
    // 每个线程函数thread_function的参数
    int value[thread_count];  
    for(int i = 0; i<thread_count; i++){
        value[i] = i;
    }
    
    // 创建线程 
    for(int i = 0; i<thread_count; i++){
        pthread_create(
            &thread_ID[i],  // 第i个线程
            NULL,
            thread_funtion,  // 线程函数 
            &value[i]  // 线程函数参数
            );
    }

    // 等待线程结束
    for(int i = 0; i<thread_count; i++){
        pthread_join(thread_ID[i], NULL);
    }
    // 销毁互斥量
    pthread_mutex_destroy(&mutex);
    //结果
    printf(sum);
}

3种计算方法

积分法

    for(long long i = first; i<last; i++){
        my_sum += 4 / (1 + ((i + 0.5) / n )*((i + 0.5) / n )) / n;
    }

概率法

    for(long long i = first; i<last; i++){
        double x = rand()%n/n;
        double y = rand()%n/n;
        if(x*x + y*y <= 1){
            my_sum++; // 投中的数目
        }
    }

最终的结果是投中的次数 / 投的总数，此处略去了两者相除的操作。

幂级数法

    double factor;
    if(first%2 == 0){
        factor = 1.0;
    }else{
        factor = -1.0;
    }
    for(long long i = first; i<last; i++, factor=-factor){
        my_sum += factor/(2*n-1);
    }

计算e值

与计算圆周率基本相同，只是计算公式不同而已。

//全局变量
int thread_count = 100;  // 线程总数
pthread_mutex_t mutex;  // 互斥量(为了锁住e变量)
int n = 100;  // 这个数越大，计算精度越高
double e = 0;  // 存放最终结果

int main(){
    // 创建线程
    pthread_t thread_ID[thread_count];
    //初始化mutex
	pthread_mutex_init(&mutex,NULL);
    // 每个线程函数thread_function的参数
    int value[thread_count];
    for(int i = 0;i<thread_count;i++){
        value[i] = i;
    }
    // 创建线程 
    for(int i = 0;i<thread_count;i++){
        pthread_create(
            &thread_ID[i],
            NULL,
            thread_function,
            &value[i]
        );
    }
    // 等待线程结束
    for(int i = 0;i<thread_count;i++){
        pthread_join(thread_ID[i],NULL);
    }
    // 销毁互斥量
    pthread_mutex_destroy(&mutex);
    //结果
    printf(e);
}
//每个线程要执行的函数
void* thread_function(void* rank){
    double my_result = 0;  // 该线程的计算结果
    int rank = (int*) rank;  // 第rank个线程
    int my_n = n / thread_count;  // 处理多少数据
    int first = my_n * rank;  // 第一个数据的索引
    int last = first + my_n;  //最后一个数据的索引
    // 计算公式
    for(int i = first;i<last;i++){
        my_result += 1/(double*)f(i);
    }
    // 加锁累加，解锁
    thread_mutex_lock(&mutex);
    e += my_result;
    thread_mutex_unlock(&mutex);
    return NULL;
}
// 计算x的阶乘
int f(int x){
    if(x<1) return 1;
    int result = 1;
    for(int i = 1;i<x;i++){
        result *= i;
    }
    return result;
}

CUDA编程

矢量求和

基本框架

const int n = 2048*2048;
const int block = 256;
const int grid = 512;
const int size = n * sizeof(int);
int main(){
    // 定义变量
    int *a,*b,*c;
    int *d_a,*d_b,*d_c;

    // 分配内存
    a = (int*)malloc(size);
    b = (int*)malloc(size);
    c = (int*)malloc(size);
    cudaMalloc((void*)&d_a,size);
    cudaMalloc((void*)&d_b,size);
    cudaMalloc((void*)&d_c,size);

    // 赋值
    for(int i = 0; i<N; i++){
        a[i] = 1;
        b[i] = 2;
    }
    cudaMemcpy(d_a,a,size,cudaMemcpyHostToDevice);
    cudaMemcpy(d_b,b,size,cudaMemcpyHostToDevice);

    //计算
    cuda_add<<<grid,block>>>(d_a,d_b,d_c);

    //取出结果
    cudaMemcpy(c,d_c,size,cudaMemcpyDeviceToHost);

    //打印结果
    printf(c);
    
    //释放内存
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(b); cudaFree(c);
    return 0;
}

核函数

普通方法

__global__ cuda_function(int *a, int *b, int *c){
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if(index < n){
        c[index] = a[index] + b[index];
    }
}

跨网格的方法

//核函数
__global__ cuda_add(int *a,int *b,int *c){
    int index = blockDim.x * blockIdx.x + threadIdx.x;  // 索引
    int thread_sum = blockDim.x * gridDim.x;  // 该网格中的线程总数
    for(int i = index; i<n; i+=thread_sum){
        c[i] = a[i] + b[i];
    }
}

矩阵转置

基本框架

const int width = 32;
const int n = width*width;
const int size = n*sizeof(int);
const int block = 16;
const int grid = 2;
int main(){
    // 定义变量
    int *a,*b;
    int *d_a,*d_b;

    // 分配内存
    a = (int*)malloc(size);
    b = (int*)malloc(size);
    cudaMemcpy((void**)&d_a,size);
    cudaMemcpy((void**)&d_b,size);

    // 赋值
    for(int i = 0; i<width; i++){
        for(int j = 0; j<width;j++){
            a[i][j] = i + j;
        }
    }
    cudaMemcpy(d_a,a,size,cudaMemcpyHostToDevice);

    // 计算
    cuda_function<<<grid,block>>>(d_a,d_b);
    
    // 取值
    cudaMemcpy(b,d_b,size,cudaMemcpyDeviceToHost);
    print(*b);

    // 清除内存
    free(a); free(b);
    cudaFree(d_a); cudaFree(d_b);
}

核函数

普通做法

__global__ void cuda_function(int *a,int *b){
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    if(row < width && col < width){
        b[row * width + col] = a[col * width + row];
    }
}

共享内存做法

__global__ void cuda_function(int *a, int *b){
    __shared__ int s_a[width][width];
    int s_row = threadIdx.y;
    int s_col = threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    //将a中属于该块的数据，存放在共享内存中
    if(row < width && col < width){
        s_a[s_row][s_col] = a[row * width + col];
    }
    // 等待该block中所有的线程完成上述处理
    _syncThreads();  
    // 转置操作
    b[row * width + col] = s_a[s_col][s_row];
}

规约计算

基本框架

const int n = 1<<29;
const int size = n*sizeof(int);
const int tile = 1024;  // 一个线程块处理多少个数据
const int grid = (n+tile-1)/tile;
const int block = tile;
long long total_sum = 0;
int main(void){
    //host变量
    int *h_data = nullptr; // 存放所有数据
    int *h_local_sum = nullptr;  // h_local_sum[0]代表第一个线程块的结果
    
    //device变量
    int *d_data = nullptr;
    int *d_local_sum = nullptr;

    //分配host内存
    h_data = (int*)malloc(size);
    h_local_sum = (int*)malloc((int)grid.x * sizeof(int));

    //分配device内存
    cudaMalloc((void**)&d_data,size);
    cudaMalloc((void**)&d_local_sum,(int)grid.x * sizeof(int));

    //初始化host变量
    for(int i = 0;i<n;i++){
        h_data[i] = int(10*sin(0.02*3.14*i));
    }

    //数据拷贝到device
    cudaMemcpy(d_data,h_data,size,cudaMemcpyHostToDevice);

    //核函数
    reduceNeighbored<<<grid,block>>>(d_data,d_local_sum,n);

    //数据拷贝回Host
    cudaMemcpy(h_local_sum,d_local_sum,(int)grid.x*sizeof(int),cudaMemcpyDeviceToHost);
    
    //同步
    cudaDeviceSynchronize();

    //线性求和
    for(int i = 0;i < int(grid.x);i++){
        total_sum += h_local_sum[i];
    }
    
    return total_sum;
}

核函数

__global__ void reduceNeighbored(int *d_data,int *d_local_sum,int N){
    int tid = threadIdx.x;
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    int *data = d_data + blockIdx.x * blockDim.x;  // 该核函数负责的第一个数
    if(index>N) return;
  
  	/*****************
  	此处为两种数据配对方式
  	见下方代码块，将其套入此处都能成立
    ******************/

    if(tid == 0){
        d_local_sum[blockIdx.x] = data[0];
    }
}

相邻配对

// 第一种：相邻配对,会导致内存访问的不连续
for(int stride = 1; stride<blockDim.x; stride*=2){
    //这个判断条件不好理解，最好手算分析
    if(tid % (stride*2)==0){
        data[threadIdx.x] += data[threadIdx.x + stride];
    }
    // 当所有线程都执行到这一步时，再往下执行
    _syncthreads(); 
}

交错配对

// 第二种：交错配对
for(int stride = blockDim.x/2; stride>0; stride/=2){
    if(tid<stride){
        data[tid] = data[tid + stride];
    }
    _syncThreads();
}

矩阵相乘

基本框架

// 矩阵相乘
const int tile = 32  // 块边长
const int width = 256;  // 矩阵的边长
const int n = width * width;  // 矩阵的大小
const int size = n * sizeof(float);  // 矩阵的内存大小
void main(){
    //定义变量
    float *a, *b, *c;
    float *d_a, *d_b, *d_c;
    // 分配内存
    a = (float*)Malloc(size);
    b = (float*)Malloc(size);
    c = (float*)Malloc(size);
    cudaMalloc(&d_a,size);
    cudaMalloc(&d_b,size);
    cudaMalloc(&d_c,size);
    // 初始化 a b （略）
    ……
    // 复制到device中
    cudaMemcpy(d_a,a,size,cudaMemcpyHostToDevice);
    cudaMemcpy(d_b,b,size,cudaMemcpyHostToDevice);
    // 执行核函数
    dim3 block(tile,tile);
    dim3 grid(width/tile,width/tile);
    cuda_function<<<grid,block>>>(d_a,d_b,d_c,width);
    // 返回结果
    cudaMemcpy(c,d_c,size,cudaMemcpyDeviceToHost);
    printf(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    free(a); free(b); free(c);
}

核函数

无共享内存版

__global__ void cuda_function(float *a, float *b, float *c){
    int row = blockDim.y * blockIdx.y + threadIdx.y;  // 结果存放在c中的row行
    int col = blockDim.x * blockIdx.x + threadIdx.x;  // 结果存放在c中的col列
    for(int i = 0;i<width;i++){
        c[row * width + col] += a[row * width + i] * b[i * width + col];
    }
}

共享内存版

// 共享内存做法
__global__ void cuda_function(float *a,float *b,float *c){
  	// 共享内存用于存储该block中a、b的数据
  	// 方便该block中所有的thread能够访问到
    __shared__ float s_a[tile][tile];
    __shared__ float s_b[tile][tile];
  	// s_a、s_b的索引
    int s_row = threadIdx.y;  
    int s_col = threadIdx.x;
    // a的第row行，b的第col列，各自相乘的和，存放在
    // c的第row行col列，即c[row * width + col]
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
  	// 遍历grid中的某一行或一列block，k代表第k个block
    for(int k = 0; k<width/tile; k++){  
        // 将a、b的数据存入共享内存s_a、s_b
        int k_col = k * tile + threadIdx.x;  // 第k个block中的第k_col列
        int k_row = k * tile + threadIdx.y;  // 第k个block中的第k_row行
        s_a[s_row][s_col] = a[row * width + k_col];
        s_b[s_row][s_col] = b[k_row * width + col]
        // 等待block内所有线程都成功执行完上述操作
        _syncThreads();
        // 计算
        float sum = 0;
        for(int i = 0; i<tile; i++){
            sum += s_a[s_row][i] * s_b[i][s_col];
        }
        c[row * width + col] = sum;
        // 等待block内的所有线程计算完毕
        _syncThreads();
    }
}

并行算法与GPU编程备忘录

文章目录

Pthread多线程

图像翻转

生产者消费者

信号量

条件变量

计算圆周率

计算框架

3种计算方法

计算e值

CUDA编程

矢量求和

基本框架

核函数

矩阵转置

基本框架

核函数

规约计算

基本框架

核函数

矩阵相乘

基本框架

核函数

相关推荐