【NVIDIA】获取GPU利用率-cpp.md

在深度学习推理中，为了更加高效地利用 GPU，在创建新的推理任务实例并将其分配到不同的 GPU 设备时，需要先了解每块 GPU 当前的利用率（还剩余多少可用资源），以便更合理地进行分配。

代码目录

.
├── CMakeLists.txt
├── src
│   └── main.cpp
├── ubuntu_build.sh
└── win10_vs2019_build.bat

windows

前提条件

确保已经安装了 Nvidia 驱动和 CUDA 安装包

nvidia-smi.exe

可以运行，截图如下：
在这里插入图片描述

nvcc --version

可以运行，截图如下：
在这里插入图片描述

cuda 安装目录

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8

修改 CMakeLists.txt 文件中的 CUDA_ROOT 为自己安装的目录

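NOTE: CMakeLists.txt 中的路径请使用正斜杠 /（例如 C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8），不要直接粘贴 Windows 风格的反斜杠 \ 路径，因为反斜杠在 CMake 字符串中是转义字符。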

构建项目

打开控制台，执行 win10_vs2019_build.bat , 或者直接双击 win10_vs2019_build.bat

NOTE: 前提是已安装 VS2019（对应的 CMake 生成器名称为 "Visual Studio 16 2019"）；如果使用其他 VS 版本，需要同步修改 win10_vs2019_build.bat 中 -G 参数后的生成器名称。

编译

进入 win10_build 目录，双击 get_gpu_info.sln，修改编译类型为 Release，编译后生成 get_gpu_info.exe，双击运行即可。

linux

在这里插入图片描述

构建，编译，执行

bash ubuntu_build.sh

在这里插入图片描述

代码附录

src/main.cpp

/***************************************************************************\
|*                                                                           *|
|*      Copyright 2010-2016 NVIDIA Corporation.  All rights reserved.        *|
|*                                                                           *|
|*   NOTICE TO USER:                                                         *|
|*                                                                           *|
|*   This source code is subject to NVIDIA ownership rights under U.S.       *|
|*   and international Copyright laws.  Users and possessors of this         *|
|*   source code are hereby granted a nonexclusive, royalty-free             *|
|*   license to use this code in individual and commercial software.         *|
|*                                                                           *|
|*   NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE     *|
|*   CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR         *|
|*   IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH      *|
|*   REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF         *|
|*   MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR          *|
|*   PURPOSE. IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL,            *|
|*   INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES          *|
|*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN      *|
|*   AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING     *|
|*   OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE      *|
|*   CODE.                                                                   *|
|*                                                                           *|
|*   U.S. Government End Users. This source code is a "commercial item"      *|
|*   as that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting       *|
|*   of "commercial computer  software" and "commercial computer software    *|
|*   documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)   *|
|*   and is provided to the U.S. Government only as a commercial end item.   *|
|*   Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through        *|
|*   227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the       *|
|*   source code with only those rights set forth herein.                    *|
|*                                                                           *|
|*   Any use of this source code in individual and commercial software must  *|
|*   include, in the user documentation and internal comments to the code,   *|
|*   the above Disclaimer and U.S. Government End Users Notice.              *|
|*                                                                           *|
|*                                                                           *|
\***************************************************************************/

#include <stdio.h>
#include <nvml.h>

/**
 * Translate an NVML compute mode into a short printable label.
 *
 * @param mode  Compute mode value to describe.
 * @return A string literal naming the mode; "Unknown" for any value that is
 *         not one of the four modes handled below.
 */
static const char *convertToComputeModeString(nvmlComputeMode_t mode)
{
    const char *label = "Unknown";

    if (mode == NVML_COMPUTEMODE_DEFAULT)
    {
        label = "Default";
    }
    else if (mode == NVML_COMPUTEMODE_EXCLUSIVE_THREAD)
    {
        label = "Exclusive_Thread";
    }
    else if (mode == NVML_COMPUTEMODE_PROHIBITED)
    {
        label = "Prohibited";
    }
    else if (mode == NVML_COMPUTEMODE_EXCLUSIVE_PROCESS)
    {
        label = "Exclusive Process";
    }

    return label;
}


int main(int argc, char* argv[])
{
    nvmlReturn_t result;
    unsigned int device_count;

    // 初始化 NVML
    result = nvmlInit();
    if (result != NVML_SUCCESS)
    {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
        printf("Press ENTER to continue...\n");
        getchar();
        return (int)(result);
    }

    // 获取设备数量
    result = nvmlDeviceGetCount(&device_count);
    if (result != NVML_SUCCESS)
    {
        printf("Failed to get device count: %s\n", nvmlErrorString(result));
        printf("Press ENTER to continue...\n");
        getchar();
        return (int)(result);
    }

    // 遍历设备数量
    printf("\nFound %u device%s, Listing devices:\n", device_count, 
        device_count != 1 ? "s" : "");
    printf("--------------------------------------------------------------\n");
    printf("| Device ID | Device Name\t\t|      pci.busId     | GPU Util | Mem Util |\n");
    for (int i = 0; i < device_count; ++i) 
    {
        nvmlDevice_t device;
        char device_name[NVML_DEVICE_NAME_BUFFER_SIZE];
        nvmlPciInfo_t pci;
        nvmlComputeMode_t compute_mode;

        // 获取设备, 也可以使用其他方式来获取设备
        // nvmlDeviceGetHandleBySerial
        // nvmlDeviceGetHandleByPciBusId
        result = nvmlDeviceGetHandleByIndex(i, &device);
        if (result != NVML_SUCCESS)
        {
            printf("Failed to get handle for device %d: %s\n", i, 
                nvmlErrorString(result));
            continue;
        }        

        // 获取 GPU 设备名称    
        result = nvmlDeviceGetName(device, device_name, NVML_DEVICE_NAME_BUFFER_SIZE);
        if (result != NVML_SUCCESS)
        {
            printf("Failed to get name for device %d: %s\n", i, 
                nvmlErrorString(result));
            continue;
        }

        // pci.busId is very useful to know which device physically you're talking to
        // Using PCI identifier you can also match nvmlDevice handle to CUDA device.
        result = nvmlDeviceGetPciInfo(device, &pci);
        if (result != NVML_SUCCESS)
        {
            printf("Failed to get pci info for device %u: %s\n", i, 
                nvmlErrorString(result));
            continue;
        }

        // 获取 GPU 设备的利用率
        nvmlUtilization_st device_utilization;
        result = nvmlDeviceGetUtilizationRates(device, &device_utilization);
        if (result != NVML_SUCCESS)
        {
            printf("Failed to get utilization for device %d: %s\n", i, 
                nvmlErrorString(result));
            continue;
        }

        printf("|     %d     | %s\t| [%s] |    %u %%  |    %u %%   | \n",
               i, device_name, pci.busId, device_utilization.gpu, device_utilization.memory);
        printf("--------------------------------------------------------------\n");

        // 改变 GPU 状态的简单示例
        result = nvmlDeviceGetComputeMode(device, &compute_mode);
        if (NVML_ERROR_NOT_SUPPORTED == result)
        {
            printf("\t This is not CUDA capable device\n");
        }       
        else if (NVML_SUCCESS != result)
        {
            printf("Failed to get compute mode for device %u: %s\n", i, nvmlErrorString(result));
            continue;
        }
        else
        {
            // try to change compute mode
            printf("\t Changing device's compute mode from '%s' to '%s'\n",
                convertToComputeModeString(compute_mode),
                convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED));

            result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED);
            if (NVML_ERROR_NO_PERMISSION == result)
            {
                printf("\t\t Need root privileges to do that: %s\n", nvmlErrorString(result));
            }        
            else if (NVML_ERROR_NOT_SUPPORTED == result)
            {
                printf("\t\t Compute mode prohibited not supported. You might be running on\n"
                    "\t\t windows in WDDM driver model or on non-CUDA capable GPU\n");
            }
            else if (NVML_SUCCESS != result)
            {
                printf("\t\t Failed to set compute mode for device %u: %s\n", i, nvmlErrorString(result));
                continue;
            }
            else
            {
                printf("\t Restoring device's compute mode back to '%s'\n",
                    convertToComputeModeString(compute_mode));
                result = nvmlDeviceSetComputeMode(device, compute_mode);
                if (NVML_SUCCESS != result)
                {
                    printf("\t\t Failed to restore compute mode for device %u: %s\n", i, nvmlErrorString(result));
                    continue;
                }
            }
        }
    }

    // 关闭 NVML 
    nvmlShutdown();
    if (NVML_SUCCESS != result)
    {
        printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));
        printf("Press ENTER to continue...\n");
        getchar();
        return (int)(result);
    }

    printf("All done.\n");
    printf("Press ENTER to continue...\n");
    getchar();
    return 0;
}

CMakeLists.txt

# Build script for get_gpu_info, a small NVML sample that prints GPU utilization.
cmake_minimum_required(VERSION 3.5...3.29)
project(get_gpu_info LANGUAGES CXX)

# Single-config generators (e.g. Unix Makefiles) have an empty build type by
# default, so fall back to Release. Multi-config generators (e.g. Visual
# Studio) choose the configuration at build time and are left untouched.
if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type for single-config generators" FORCE)
endif()

# CUDA toolkit root that provides nvml.h and the NVML link library.
# Edit the default below or pass -DCUDA_ROOT=<path> on the command line.
# Always use forward slashes, also on Windows.
if(NOT DEFINED CUDA_ROOT)
    if(WIN32)
        set(CUDA_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8")
    elseif(UNIX)
        set(CUDA_ROOT "/usr/local/cuda")
    endif()
endif()

add_executable(${PROJECT_NAME} src/main.cpp)
target_include_directories(${PROJECT_NAME} PRIVATE "${CUDA_ROOT}/include")

if(WIN32)
    target_link_libraries(${PROJECT_NAME} PRIVATE "${CUDA_ROOT}/lib/x64/nvml.lib")
elseif(UNIX)
    # Resolve the library to a full path instead of using link_directories():
    # link_directories() only affects targets created after it is called, so a
    # call placed after add_executable() never reaches this target.
    find_library(NVML_LIBRARY
        NAMES nvidia-ml
        HINTS "${CUDA_ROOT}/lib64/stubs" "${CUDA_ROOT}/lib64"
    )
    if(NOT NVML_LIBRARY)
        message(FATAL_ERROR
            "libnvidia-ml.so not found; set CUDA_ROOT to your CUDA installation "
            "(current value: ${CUDA_ROOT})")
    endif()
    target_link_libraries(${PROJECT_NAME} PRIVATE "${NVML_LIBRARY}")
endif()

win10_vs2019_build.bat

:: Generates the Visual Studio 2019 solution for get_gpu_info in win10_build.
:: The CUDA location is configured through CUDA_ROOT in the top-level CMakeLists.txt.
set build_dir=win10_build

:: Work from the directory that contains this script (and CMakeLists.txt),
:: regardless of where the script was started from
cd /d "%~dp0"

:: Remove the previous build directory, if any
:: (rmdir is used because "rm -rf" is not a cmd.exe command)
if exist "%build_dir%" rmdir /s /q "%build_dir%"

:: Recreate the build directory
mkdir "%build_dir%"

:: Enter the build directory
cd "%build_dir%"

:: Configure; additional options can be passed here with -D
cmake -G "Visual Studio 16 2019" -A x64 -DCMAKE_BUILD_TYPE=Release ..

:: Leave the build directory
cd ..

ubuntu_build.sh

#!/bin/bash
# Configures, builds and runs get_gpu_info from a fresh build directory.

# Stop at the first failing step, so that a failed cd/cmake/make can never
# lead to building in the wrong directory or running a stale binary.
set -euo pipefail

build_dir=ubuntu2204_build

# Remove the previous build directory
rm -rf "${build_dir}"

# Recreate the build directory
mkdir "${build_dir}"

# Enter the build directory
cd "${build_dir}"

# Configure the project
cmake -DCMAKE_BUILD_TYPE=Release ..

# Compile, using one job per available CPU
make -j"$(nproc)"

# Copy the binary next to this script
cp get_gpu_info ../
cd ..

# Run it
./get_gpu_info