Standard C Code

	#include <stdio.h>
	#include <stdlib.h>
	// Number of int elements in each vector.
	#define SIZE 1024

	// Element-wise vector addition on the CPU: c[i] = a[i] + b[i] for 0 <= i < n.
	//
	// a, b : input arrays holding at least n ints (read only)
	// c    : output array with room for at least n ints
	// n    : number of elements to process; nothing is written when n <= 0
	void VectorAdd(const int *a, const int *b, int *c, int n){
		for (int i = 0; i < n; ++i)
			c[i] = a[i] + b[i];
	}
	// Fills two SIZE-element vectors with 0..SIZE-1, adds them with VectorAdd,
	// and prints the first few results. Returns 0 on success, 1 if a host
	// allocation fails.
	int main(){
		int *a = (int *)malloc(SIZE * sizeof(int));
		int *b = (int *)malloc(SIZE * sizeof(int));
		int *c = (int *)malloc(SIZE * sizeof(int));
		if (a == NULL || b == NULL || c == NULL) {
			fprintf(stderr, "malloc failed\n");
			// free(NULL) is a no-op, so releasing all three is safe here.
			free(a);
			free(b);
			free(c);
			return 1;
		}

		for (int i = 0; i < SIZE; ++i) {
			a[i] = i;
			b[i] = i;
			c[i] = 0;
		}

		VectorAdd(a, b, c, SIZE);

		// Print at most the first 10 sums, never reading past the end of c.
		for (int i = 0; i < 10 && i < SIZE; ++i)
			printf("c[%d] = %d\n", i, c[i]);

		free(a);
		free(b);
		free(c);
		return 0;
	}

view raw cpuExample1.cpp hosted with ❤ by GitHub

C with CUDA extensions

호스트, 디바이스 메모리 할당 및 초기화

호스트에서 디바이스로 메모리 카피

커널 실행

디바이스에서 호스트로 메모리 카피(결과값 전달)

호스트, 디바이스 메모리 할당 예제

	#include "device_launch_parameters.h"
	#include <cuda_runtime.h>
	#include <stdlib.h>
	#include <stdio.h>
	// Number of int elements in each host and device vector.
	#define SIZE 1024

	// Element-wise vector addition kernel: c[i] = a[i] + b[i] for 0 <= i < n.
	// __global__ marks this function as a kernel: it runs on the device and is
	// launched from the host.
	//
	// Launch layout: 1D grid of 1D blocks. Each thread handles exactly one
	// element, so the launch must supply at least n threads in total; surplus
	// threads are masked off by the bounds check. No shared memory is used.
	//
	// a, b : device pointers to at least n ints (read only)
	// c    : device pointer with room for at least n ints
	// n    : number of elements to add
	__global__ void VectorAdd(const int *a, const int *b, int *c, int n){
		// Many threads run this body concurrently; the block and thread
		// indices together identify which element this thread owns.
		int tid = threadIdx.x;
		int i = blockIdx.x * blockDim.x + tid;

		// Device-side printf is serialized and slow; kept for demonstration only.
		printf("threadIdx.x : %d, n : %d\n", tid, n);

		// Guard the tail: the grid may contain more threads than elements.
		if (i < n) {
			c[i] = a[i] + b[i];
			printf("%d = %d + %d\n", c[i], a[i], b[i]);
		}
	}

	// Aborts the program with a readable message when a CUDA runtime call fails.
	// Ignoring these return codes lets an early failure surface later as an
	// unrelated-looking error.
	static void checkCuda(cudaError_t status, const char *what){
		if (status != cudaSuccess) {
			fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
			exit(EXIT_FAILURE);
		}
	}

	// Host driver for the vector-add example:
	//   1. allocate and initialize host and device buffers,
	//   2. copy the inputs from host to device,
	//   3. launch the kernel,
	//   4. copy the result from device back to host and print it.
	// Returns 0 on success; exits with a failure status on any allocation or
	// CUDA error.
	int main(){
		const size_t bytes = SIZE * sizeof(int);
		int *a, *b, *c;        // host buffers
		int *d_a, *d_b, *d_c;  // device buffers

		// Allocate host memory.
		a = (int *)malloc(bytes);
		b = (int *)malloc(bytes);
		c = (int *)malloc(bytes);
		if (a == NULL || b == NULL || c == NULL) {
			fprintf(stderr, "malloc failed\n");
			// free(NULL) is a no-op, so releasing all three is safe here.
			free(a);
			free(b);
			free(c);
			return EXIT_FAILURE;
		}

		// cudaMalloc(&pointer, number of bytes) allocates device memory.
		checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc d_a");
		checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc d_b");
		checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc d_c");

		// Initialize the inputs.
		for (int i = 0; i < SIZE; ++i)
		{
			a[i] = i;
			b[i] = i;
			c[i] = 0;
		}

		// cudaMemcpy(destination, source, number of bytes, cudaMemcpyHostToDevice)
		// copies from host to device. Only the inputs are uploaded: the kernel
		// overwrites every element of d_c, so sending the zeroed c is unnecessary.
		checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy a -> d_a");
		checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy b -> d_b");

		// A kernel launch uses the <<<blocks, threadsPerBlock>>> syntax.
		// The block count is rounded up so the grid has at least SIZE threads.
		const int threadsPerBlock = 256;
		const int blocks = (SIZE + threadsPerBlock - 1) / threadsPerBlock;
		VectorAdd<<<blocks, threadsPerBlock>>>(d_a, d_b, d_c, SIZE);
		// A launch returns no status itself; bad configurations show up here.
		checkCuda(cudaGetLastError(), "VectorAdd launch");

		// cudaMemcpy(destination, source, number of bytes, cudaMemcpyDeviceToHost)
		// copies the result back to the host. This blocking copy also waits for
		// the kernel to finish and reports any error raised while it ran.
		// a and b are not modified on the device, so they are not copied back.
		checkCuda(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy d_c -> c");
		for (int i = 0; i < SIZE; ++i)
			printf("c[%d] = %d\n", i, c[i]);

		// Release host memory.
		free(a);
		free(b);
		free(c);

		// cudaFree releases device memory.
		checkCuda(cudaFree(d_a), "cudaFree d_a");
		checkCuda(cudaFree(d_b), "cudaFree d_b");
		checkCuda(cudaFree(d_c), "cudaFree d_c");
		return 0;
	}

view raw cudaExample1.cu hosted with ❤ by GitHub

저작자표시

'Programming > 병렬처리(CUDA)' 카테고리의 다른 글

CUDA driver version is insufficient for CUDA run time version (0)	2017.04.05
[CUDA] nVidia GPU의 CUDA 관련상세 Specification 정보 알아보기 (0)	2016.10.26
[CUDA] 용어 정리 (0)	2016.10.26
[CUDA] Visual Studio 2013에서 CUDA 개발 환경 구축 (0)	2016.10.26
CUDA C 확장 키워드(CUDA C Extension) (0)	2016.07.05

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

One Day One Line

[CUDA] CUDA C 프로그래밍 예제

Standard C Code

호스트, 디바이스 메모리 할당 예제

'Programming > 병렬처리(CUDA)' 카테고리의 다른 글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

[CUDA] CUDA C 프로그래밍 예제

Standard C Code

호스트, 디바이스 메모리 할당 예제

'Programming > 병렬처리(CUDA)' 카테고리의 다른 글

'Programming/병렬처리(CUDA)' Related Articles

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역