unified-runtime编译与验证
- 一.创建容器
- 二.编译unified-runtime
- 三.生成一个cuda ptx kernel
- 四.API测试
unified-runtime编译与验证
一.创建容器
docker run --gpus all --shm-size=32g -ti \
-e NVIDIA_VISIBLE_DEVICES=all --privileged --net=host \
--rm -it \
-v $PWD:/home \
-w /home ghcr.io/intel/llvm/ubuntu2204_build /bin/bash
二.编译unified-runtime
git clone https://github.com/oneapi-src/unified-runtime
cd unified-runtime
mkdir build
cd build
cmake -DUR_BUILD_ADAPTER_CUDA=ON -DUR_BUILD_ADAPTER_NATIVE_CPU=ON -DUMF_DISABLE_HWLOC=ON ..
make
三.生成一个cuda ptx kernel
tee cuda_copy.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
__global__ void kernel_copy(float *input,float *output)
{
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
output[tid]=input[tid];
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -dc -lineinfo -arch=sm_86 -ptx cuda_copy.cu -o cuda_copy.ptx
四.API测试
tee um_query_device.cpp<<-'EOF'
#include <iostream>
#include <memory>
#include <stdlib.h>
#include <vector>
#include <iostream>
#include <fstream>
#include <sstream>
#include "ur_api.h"
#define ur_check(call) \
do { \
ur_result_t error = call; \
if (error != UR_RESULT_SUCCESS) { \
fprintf(stderr, " error in file '%s' in line %i: %d.\n", __FILE__, __LINE__,error); \
exit(EXIT_FAILURE); \
} \
} while (0)
constexpr unsigned PAGE_SIZE = 4096;
template <typename T, size_t N> struct alignas(PAGE_SIZE) AlignedArray {
T data[N];
};
int main(int, char *[]) {
ur_result_t status;
ur_check(urLoaderInit(0, nullptr));
std::cout << "Platform initialized.\n";
uint32_t adapterCount = 0;
std::vector<ur_adapter_handle_t> adapters;
uint32_t platformCount = 0;
std::vector<ur_platform_handle_t> platforms;
ur_check(urAdapterGet(0, nullptr, &adapterCount));
adapters.resize(adapterCount);
ur_check(urAdapterGet(adapterCount, adapters.data(), nullptr));
ur_check(urPlatformGet(adapters.data(), adapterCount, 1, nullptr,&platformCount));
platforms.resize(platformCount);
ur_check(urPlatformGet(adapters.data(), adapterCount, platformCount,
platforms.data(), nullptr));
for (auto p : platforms) {
ur_api_version_t api_version = {};
ur_check(urPlatformGetApiVersion(p, &api_version));
std::cout << "API version: " << UR_MAJOR_VERSION(api_version) << "."
<< UR_MINOR_VERSION(api_version) << std::endl;
uint32_t deviceCount = 0;
ur_check(urDeviceGet(p, UR_DEVICE_TYPE_GPU, 0, nullptr, &deviceCount));
std::vector<ur_device_handle_t> devices(deviceCount);
ur_check(urDeviceGet(p, UR_DEVICE_TYPE_GPU, deviceCount, devices.data(),
nullptr));
for (auto d : devices) {
ur_device_type_t device_type = UR_DEVICE_TYPE_ALL;
ur_check(urDeviceGetInfo(
d, UR_DEVICE_INFO_TYPE, sizeof(ur_device_type_t),
static_cast<void *>(&device_type), nullptr));
static const size_t DEVICE_NAME_MAX_LEN = 1024;
char device_name[DEVICE_NAME_MAX_LEN] = {0};
ur_check(urDeviceGetInfo(d, UR_DEVICE_INFO_NAME, DEVICE_NAME_MAX_LEN - 1,
static_cast<void *>(&device_name), nullptr));
if (device_type == UR_DEVICE_TYPE_GPU) {
std::cout << "Found a " << device_name << " gpu.\n";
}
ur_context_handle_t hContext;
ur_check(urContextCreate(1, &d, nullptr, &hContext));
std::ifstream inputFile("cuda_copy.ptx");
std::ostringstream buffer;
buffer << inputFile.rdbuf();
std::string fileContent = buffer.str();
inputFile.close();
ur_program_handle_t hProgram;
ur_check(urProgramCreateWithBinary(hContext, d, fileContent.length(), (const uint8_t *)fileContent.c_str(),nullptr,&hProgram));
constexpr int a_size = 32;
AlignedArray<float, a_size> a, b;
for (auto i = 0; i < a_size; ++i) {
a.data[i] = a_size - i;
b.data[i] = 0;
}
status=urProgramBuild(hContext, hProgram, nullptr);
ur_mem_handle_t dA, dB;
ur_check(urMemBufferCreate(hContext, UR_MEM_FLAG_READ_WRITE,
a_size * sizeof(int), nullptr, &dA));
ur_check(urMemBufferCreate(hContext, UR_MEM_FLAG_READ_WRITE,
a_size * sizeof(int), nullptr, &dB));
ur_kernel_handle_t hKernel;
ur_check(urKernelCreate(hProgram, "_Z11kernel_copyPfS_", &hKernel));
ur_check(urKernelSetArgMemObj(hKernel, 0, nullptr, dA));
ur_check(urKernelSetArgMemObj(hKernel, 1, nullptr, dB));
ur_queue_handle_t queue;
ur_check(urQueueCreate(hContext, d, nullptr, &queue));
ur_check(urEnqueueMemBufferWrite(queue, dA, true, 0, a_size * sizeof(float),
a.data, 0, nullptr, nullptr));
ur_check(urEnqueueMemBufferWrite(queue, dB, true, 0, a_size * sizeof(float),
b.data, 0, nullptr, nullptr));
const size_t gWorkOffset[] = {0, 0, 0};
const size_t gWorkSize[] = {a_size, 1, 1};
const size_t lWorkSize[] = {1, 1, 1};
ur_event_handle_t event;
ur_check(urEnqueueKernelLaunch(queue, hKernel, 3, gWorkOffset, gWorkSize,
lWorkSize, 0, nullptr, &event));
ur_check(urEnqueueMemBufferRead(queue, dB, true, 0, a_size * sizeof(int),
b.data, 1, &event, nullptr));
ur_check(urQueueFinish(queue));
ur_check(urContextRelease(hContext));
for (auto i = 0; i < a_size; ++i) {
printf("%.2f\n",b.data[i]);
}
}
}
out:
for (auto adapter : adapters) {
urAdapterRelease(adapter);
}
urLoaderTearDown();
return status == UR_RESULT_SUCCESS ? 0 : 1;
}
EOF
g++ -o um_query_device um_query_device.cpp -I../include -L lib -lur_loader -lpthread
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/lib ./um_query_device