在10年前,随着CUDA与OpenCL的纷纷出炉,GPGPU也着实热了一把。而现今,不少公司更是将GPGPU作为挖矿、搞机器学习的计算利器。于是乎,有许多言论声称GPU将很快取代CPU!那么现代化的GPGPU是否具有如此强大的威力甚至于能取代CPU呢?本文将会以GPGPU的任务级并行能力来看看这些尖端GPU的性能。
众所周知,当前家用计算机上的CPU都具有多个核心,每个核心能独立执行任务,因此如果有个四核CPU的话,那么它同时至少能执行4个完全独立的任务。而当前GPU同样也具有多个核心,而且往往比CPU更多一些,这些与CPU核心相对应的核心在OpenCL术语中称为计算单元(compute unit,简称CU)。而GPU的每个核心又由许多小的计算单元构成,这些小的计算单元在OpenCL术语中称为处理元素(processing element,简称PE)。比如,近期的NVIDIA GPU与AMD GPU上,每个CU可拥有64到128个PE。那么这些CU是否能像CPU的核心那样独立执行不同的任务呢?
下面,笔者就拿自己在2018年刚购买的13英寸MacBook Pro来做个实验。这台笔记本搭载了Intel Core i7-8559U处理器,以及其内置的Intel Iris Plus Graphics 655 GPU。尽管这只是一块集成在CPU中的核芯显卡,但其性能也超越了不少2010年左右的独立显卡😂
下面,笔者将用这块GPU一边做OpenGL图形渲染,一边用OpenCL做通用计算,看看通用计算是否会影响到图形渲染。
首先,我们建立一个macOS的Cocoa App。然后先实现模板给出的AppDelegate.m源文件。笔者在这个源文件中加了几行代码,使得我们点击❌按钮之后能将应用也一起关闭。
//
// AppDelegate.m
// GLwithCL
//
// Created by Zenny Chen on 2019/7/30.
// Copyright © 2019 Zenny Chen. All rights reserved.
//
#import "AppDelegate.h"
/// Private class extension: the application delegate also acts as the delegate of the
/// main window so that it is notified when that window is about to close.
@interface AppDelegate () <NSWindowDelegate>
@end

@implementation AppDelegate

/// Installs this object as the delegate of the application's first window.
- (void)applicationDidFinishLaunching:(NSNotification *)aNotification {
    // `firstObject` yields nil for an empty window list (where `windows[0]` would raise
    // an NSRangeException); sending the setter to nil is then a harmless no-op.
    NSApplication.sharedApplication.windows.firstObject.delegate = self;
}

- (void)applicationWillTerminate:(NSNotification *)aNotification {
    // Nothing to tear down.
}

#pragma mark - NSWindowDelegate

/// Terminates the whole application when the observed window is about to close.
- (void)windowWillClose:(NSNotification *)notification {
    [NSApplication.sharedApplication terminate:self];
}

@end
接着,我们再实现模板给出的ViewController.m源文件,该源文件给出了OpenCL的实现逻辑:
//
// ViewController.m
// GLwithCL
//
// Created by Zenny Chen on 2019/7/30.
// Copyright © 2019 Zenny Chen. All rights reserved.
//
#import "ViewController.h"
#import "MyGLView.h"
#include <stdalign.h>
@import OpenCL;
/// View controller of the demo window. It hosts four buttons (Show / Pause / Close / Compute),
/// an optional OpenGL view, and runs an OpenCL kernel on a background queue on request.
/// NOTE: this file uses manual retain/release, not ARC.
@implementation ViewController
{
@private
    /// The OpenGL view, or nil while none is shown. Not retained here: its superview owns it.
    MyGLView *mGLView;
    /// The y coordinate at which the OpenGL view is placed.
    CGFloat mGLViewY;
    /// The Pause/Resume button (owned by the view hierarchy); kept so its state can be reset.
    NSButton *mPauseButton;
    /// Indicates whether an OpenCL computation is currently being executed.
    volatile BOOL mExecutingCL;
}

- (void)viewDidLoad
{
    [super viewDidLoad];

    // All buttons share one row, 10 points below the top edge and 30 points high.
    const CGFloat buttonY = self.view.frame.size.height - 10.0 - 30.0;

    [self addButtonWithTitle:@"Show" action:@selector(showButtonClicked:) x:20.0 y:buttonY];

    mPauseButton = [self addButtonWithTitle:@"Pause" action:@selector(pauseButtonClicked:) x:130.0 y:buttonY];
    // tag 0 means "animating", tag 1 means "paused" (see -pauseButtonClicked:).
    mPauseButton.tag = 0;

    [self addButtonWithTitle:@"Close" action:@selector(closeButtonClicked:) x:240.0 y:buttonY];
    [self addButtonWithTitle:@"Compute" action:@selector(computeButtonClicked:) x:350.0 y:buttonY];

    // The 512-point-high OpenGL view sits 10 points below the button row.
    mGLViewY = buttonY - 10.0 - 512.0;
}

/// Creates a 90x30 push button targeting self, adds it to the controller's view and returns it.
- (NSButton *)addButtonWithTitle:(NSString *)title action:(SEL)action x:(CGFloat)x y:(CGFloat)y
{
    NSButton *button = [NSButton buttonWithTitle:title target:self action:action];
    button.frame = NSMakeRect(x, y, 90.0, 30.0);
    [self.view addSubview:button];
    return button;
}

// MARK: button event handlers

/// Creates the OpenGL view (if there is none yet) and starts its animation shortly afterwards.
- (void)showButtonClicked:(id)sender
{
    if(mGLView != nil)
        return;

    const CGSize viewSize = self.view.frame.size;
    const NSRect glFrame = NSMakeRect((viewSize.width - 512.0) * 0.5, mGLViewY, 512.0, 512.0);

    MyGLView *glView = [MyGLView.alloc initWithFrame:glFrame];
    if(glView == nil)
    {
        NSLog(@"Failed to create the OpenGL view!");
        return;
    }

    [self.view addSubview:glView];
    // The superview now retains the view, so the +1 from alloc can be dropped.
    [glView release];
    mGLView = glView;

    // The animation is started after a short delay rather than immediately.
    [mGLView performSelector:@selector(startAnimating) withObject:nil afterDelay:0.1];
}

/// Toggles between the animating and the paused state of the OpenGL view.
- (void)pauseButtonClicked:(NSButton*)sender
{
    if(mGLView == nil)
        return;

    const BOOL isAnimating = (sender.tag == 0);
    if(isAnimating)
    {
        // Currently animating -> pause
        sender.tag = 1;
        sender.title = @"Resume";
        [mGLView stopAnimating];
    }
    else
    {
        // Currently paused -> resume
        sender.tag = 0;
        sender.title = @"Pause";
        [mGLView startAnimating];
    }
}

/// Destroys and removes the OpenGL view, if one is shown.
- (void)closeButtonClicked:(id)sender
{
    if(mGLView == nil)
        return;

    // A delayed -startAnimating scheduled by -showButtonClicked: may still be pending;
    // cancel it so that it cannot fire on a view that has already been destroyed.
    [NSObject cancelPreviousPerformRequestsWithTarget:mGLView
                                             selector:@selector(startAnimating)
                                               object:nil];
    [mGLView destroy];
    [mGLView removeFromSuperview];
    mGLView = nil;

    // A freshly shown view always starts out animating, so put the button back
    // into its "animating" state as well.
    mPauseButton.tag = 0;
    mPauseButton.title = @"Pause";
}

/// Runs the OpenCL computation on a global queue, unless one is already in flight.
- (void)computeButtonClicked:(id)sender
{
    if(mExecutingCL)
        return;

    mExecutingCL = YES;
    dispatch_async(dispatch_get_global_queue(QOS_CLASS_USER_INTERACTIVE, 0), ^ {
        [self doCompute];
        mExecutingCL = NO;
    });
}

/// Reads the whole file at `path` into a newly malloc'ed, NUL-terminated buffer.
/// On success the number of bytes read is stored in `*outLength` and the caller owns the
/// returned buffer (release it with free()). Returns NULL, after printing a message, on failure.
static char *CreateKernelSourceFromFile(const char *path, size_t *outLength)
{
    FILE *fp = fopen(path, "r");
    if(fp == NULL)
    {
        puts("The kernel file open failed!");
        return NULL;
    }

    char *buffer = NULL;
    long fileLength = -1;
    if(fseek(fp, 0, SEEK_END) == 0)
        fileLength = ftell(fp);

    if(fileLength >= 0 && fseek(fp, 0, SEEK_SET) == 0)
    {
        buffer = (char *)malloc((size_t)fileLength + 1);
        if(buffer != NULL)
        {
            const size_t bytesRead = fread(buffer, 1, (size_t)fileLength, fp);
            buffer[bytesRead] = '\0';
            *outLength = bytesRead;
        }
    }
    fclose(fp);

    if(buffer == NULL)
        puts("The kernel file read failed!");
    return buffer;
}

/// Executes the OpenCL computation: builds the kernel from the bundled "test.ocl" resource,
/// runs it on the first GPU device as a single work-group of 256 work-items, then prints the
/// elapsed time and the result. Every failure is reported and leads to an orderly clean-up.
- (void)doCompute
{
    // Compile-time constants, so the arrays below are ordinary fixed-size arrays.
    enum { kMaxObjectCount = 16, kWorkItemCount = 256 };

    NSString *sourcePath = [NSBundle.mainBundle pathForResource:@"test" ofType:@"ocl"];
    if(sourcePath == nil)
    {
        NSLog(@"CL source file not found!");
        return;
    }

    // Every handle starts out NULL so that the shared clean-up code at the end
    // can tell which resources were actually created.
    cl_context context = NULL;              // the OpenCL context
    cl_command_queue commandQueue = NULL;   // the OpenCL command queue
    cl_program program = NULL;              // the OpenCL program object
    cl_mem inputMemObj = NULL;              // memory object for the input argument
    cl_mem inOutMemObj = NULL;              // memory object for the in/out argument
    cl_kernel kernel = NULL;                // the OpenCL kernel object
    char *kernelSource = NULL;              // kernel source text, owned by this method
    size_t kernelLength = 0;
    const char *sources[1] = { NULL };
    cl_int status = CL_SUCCESS;
    NSTimeInterval timestamp = 0.0;

    // One work-group consisting of 256 work-items. The local size is passed explicitly
    // because the kernel indexes a 256-element local array by local ID.
    size_t globalWorkSize[1] = { kWorkItemCount };
    size_t localWorkSize[1] = { kWorkItemCount };

    cl_platform_id platforms[kMaxObjectCount];
    cl_device_id devices[kMaxObjectCount];

    // Host-side buffers. They back the memory objects through CL_MEM_USE_HOST_PTR,
    // so they must stay alive until those objects are released at the end of this method.
    int alignas(64) inputBuffer[1024];      // 256 x int4
    int alignas(16) inOutObject = 4;
    memset(inputBuffer, 0, sizeof(inputBuffer));

    // Query the OpenCL platforms and pick the first one
    cl_uint numPlatforms = 0;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(status != CL_SUCCESS)
    {
        puts("Error: Getting platforms!");
        return;
    }
    if(numPlatforms == 0)
    {
        puts("Your system does not have any OpenCL platform!");
        return;
    }
    if(numPlatforms > (cl_uint)kMaxObjectCount)
        numPlatforms = (cl_uint)kMaxObjectCount;
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    if(status != CL_SUCCESS)
    {
        puts("Get platform failed!");
        return;
    }
    cl_platform_id platform = platforms[0];

    // Query the OpenCL devices; a GPU is required here
    cl_uint numDevices = 0;
    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
    if(status != CL_SUCCESS || numDevices == 0)
    {
        puts("No GPU device available.");
        return;
    }
    if(numDevices > (cl_uint)kMaxObjectCount)
        numDevices = (cl_uint)kMaxObjectCount;
    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
    if(status != CL_SUCCESS)
    {
        puts("Get GPU devices failed!");
        return;
    }

    // Create the OpenCL context and a command queue on the first GPU device
    context = clCreateContext(NULL, 1, devices, NULL, NULL, &status);
    if(context == NULL || status != CL_SUCCESS)
    {
        printf("Error: Failed to create the OpenCL context (%d)\n", status);
        goto RELEASE_RESOURCES;
    }
    commandQueue = clCreateCommandQueue(context, devices[0], 0, &status);
    if(commandQueue == NULL || status != CL_SUCCESS)
    {
        printf("Error: Failed to create the command queue (%d)\n", status);
        goto RELEASE_RESOURCES;
    }

    // Load the kernel source and create the program object
    kernelSource = CreateKernelSourceFromFile(sourcePath.UTF8String, &kernelLength);
    if(kernelSource == NULL)
        goto RELEASE_RESOURCES;     // the helper has already reported the problem

    sources[0] = kernelSource;
    program = clCreateProgramWithSource(context, 1, sources, &kernelLength, &status);
    if(program == NULL || status != CL_SUCCESS)
    {
        printf("Error: Failed to create the program object (%d)\n", status);
        goto RELEASE_RESOURCES;
    }

    // Build the program
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
    if(status != CL_SUCCESS)
    {
        printf("Error: Failed to build program executable: ");
        size_t logLength = 0;
        clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength);
        // calloc keeps the buffer NUL-terminated even if the log query itself fails.
        char *logBuffer = (char *)calloc(logLength + 1, 1);
        if(logBuffer != NULL)
        {
            clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, logLength, logBuffer, NULL);
            printf("%s\n", logBuffer);
            free(logBuffer);
        }
        else
            printf("\n");
        goto RELEASE_RESOURCES;
    }

    // Create the memory objects
    inputMemObj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                 sizeof(inputBuffer), inputBuffer, &status);
    if(inputMemObj == NULL || status != CL_SUCCESS)
    {
        printf("Error: Failed to create the input buffer (%d)\n", status);
        goto RELEASE_RESOURCES;
    }
    inOutMemObj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                 sizeof(inOutObject), &inOutObject, &status);
    if(inOutMemObj == NULL || status != CL_SUCCESS)
    {
        printf("Error: Failed to create the in/out buffer (%d)\n", status);
        goto RELEASE_RESOURCES;
    }

    // Create the kernel object
    kernel = clCreateKernel(program, "test", &status);
    if(kernel == NULL || status != CL_SUCCESS)
    {
        printf("Error: Failed to create the kernel (%d)\n", status);
        goto RELEASE_RESOURCES;
    }

    // Set the kernel arguments
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputMemObj);
    status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inOutMemObj);
    if(status != CL_SUCCESS)
    {
        puts("Kernel argument setting failed!");
        goto RELEASE_RESOURCES;
    }

    // Run the kernel and wait for it; the measured interval covers enqueue + execution.
    timestamp = NSProcessInfo.processInfo.systemUptime;
    status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
                                    globalWorkSize, localWorkSize, 0, NULL, NULL);
    if(status != CL_SUCCESS)
    {
        printf("Error: Failed to enqueue the kernel (%d)\n", status);
        goto RELEASE_RESOURCES;
    }
    // Force the CL program to run to completion
    status = clFinish(commandQueue);
    timestamp = NSProcessInfo.processInfo.systemUptime - timestamp;
    if(status != CL_SUCCESS)
    {
        printf("Error: Kernel execution failed (%d)\n", status);
        goto RELEASE_RESOURCES;
    }

    // Read back the result produced by the GPU (blocking read)
    status = clEnqueueReadBuffer(commandQueue, inOutMemObj, CL_TRUE, 0,
                                 sizeof(inOutObject), &inOutObject, 0, NULL, NULL);
    if(status != CL_SUCCESS)
    {
        printf("Error: Failed to read back the result (%d)\n", status);
        goto RELEASE_RESOURCES;
    }

    printf("Time spent: %.3fs, result is: %d\n", timestamp, inOutObject);

RELEASE_RESOURCES:
    // Clean up the OpenCL related resources
    if(kernel != NULL)
        clReleaseKernel(kernel);
    if(program != NULL)
        clReleaseProgram(program);
    if(inputMemObj != NULL)
        clReleaseMemObject(inputMemObj);
    if(inOutMemObj != NULL)
        clReleaseMemObject(inOutMemObj);
    if(commandQueue != NULL)
        clReleaseCommandQueue(commandQueue);
    if(context != NULL)
        clReleaseContext(context);
    // free(NULL) is a no-op, so no guard is needed here.
    free(kernelSource);
}

- (void)setRepresentedObject:(id)representedObject {
    [super setRepresentedObject:representedObject];
    // Update the view, if already loaded.
}

@end
为了避免阻塞主线程,笔者这里特地利用Grand Central Dispatch把整个OpenCL计算派发到一个后台线程上执行。此外,为了尽可能少占用GPU计算资源,这里仅分配了一个工作组,共256个工作项进行计算。
随后,我们新增一个类,命名为MyGLView,将它作为NSOpenGLView的子类。我们将对它做基于OpenGL的3D图形渲染。
先给出MyGLView.h的源代码内容:
//
// MyGLView.h
// GLwithCL
//
// Created by Zenny Chen on 2019/7/30.
// Copyright © 2019 Zenny Chen. All rights reserved.
//
@import Cocoa;
/// A view used for presenting OpenGL content.
@interface MyGLView : NSOpenGLView
/// Starts the animation.
- (void)startAnimating;
/// Stops the animation.
- (void)stopAnimating;
/// Destroys the OpenGL-related resources currently held by this view.
/// @attention This method must be called once before the view object is released or removed,
/// and once it has been called the object is no longer valid.
- (void)destroy;
@end
下面给出MyGLView.m的源代码:
//
// MyGLView.m
// GLwithCL
//
// Created by Zenny Chen on 2019/7/30.
// Copyright © 2019 Zenny Chen. All rights reserved.
//
#import "MyGLView.h"
#include <OpenGL/gl.h>
/// An NSOpenGLView that draws a rotating, vertex-coloured rectangle with the fixed-function
/// pipeline. Redraws are driven by a CVDisplayLink.
/// NOTE: this file uses manual retain/release, not ARC.
@implementation MyGLView
{
@private
    /// Display link used for refreshing the animation; NULL until -prepareOpenGL has
    /// created it and again after -destroy.
    CVDisplayLinkRef mDisplayLink;
    /// Current rotation angle of the shape, in degrees
    float mRotationDegree;
    /// Indicates whether the animation is currently stopped
    BOOL mAnimatingStopped;
}

- (instancetype)initWithFrame:(NSRect)frameRect
{
    // Pixel format attributes
    NSOpenGLPixelFormatAttribute attrs[] = {
        // Render with GPU hardware acceleration
        NSOpenGLPFAAccelerated,
        // Optionally, use double buffering
        NSOpenGLPFADoubleBuffer,
        // Only the fixed-function pipeline is used, so the legacy OpenGL profile suffices
        NSOpenGLPFAOpenGLProfile, NSOpenGLProfileVersionLegacy,
        // 32-bit colour (RGBA8888)
        NSOpenGLPFAColorSize, 32,
        // 16-bit depth buffer
        NSOpenGLPFADepthSize, 16,
        // 8-bit stencil buffer
        NSOpenGLPFAStencilSize, 8,
        // Enable multisample anti-aliasing
        NSOpenGLPFAMultisample,
        // One buffer for MSAA
        NSOpenGLPFASampleBuffers, 1,
        // Four samples per pixel for MSAA
        NSOpenGLPFASamples, 4,
        // End of the attribute list
        0
    };

    // Create the pixel format
    NSOpenGLPixelFormat *pixelFormat = [NSOpenGLPixelFormat.alloc initWithAttributes:attrs];
    self = [super initWithFrame:frameRect pixelFormat:pixelFormat];
    [pixelFormat release];

    // Instance variables must not be touched when the superclass initializer returned nil.
    if(self != nil)
        mAnimatingStopped = YES;

    return self;
}

/// The view covers its whole frame with opaque content.
- (BOOL)isOpaque
{
    return YES;
}

/// Rectangle vertex coordinates (x, y), laid out for a triangle strip
static const GLfloat sRectVertices[] = {
    -0.4f,  0.4f,   // top-left
    -0.4f, -0.4f,   // bottom-left
     0.4f,  0.4f,   // top-right
     0.4f, -0.4f    // bottom-right
};

/// Rectangle vertex colours (r, g, b, a), one per vertex above
static const GLfloat sRectColors[] = {
    1.0f, 0.0f, 0.0f, 1.0f,     // top-left: red
    0.0f, 1.0f, 0.0f, 1.0f,     // bottom-left: green
    0.0f, 0.0f, 1.0f, 1.0f,     // top-right: blue
    1.0f, 1.0f, 1.0f, 1.0f      // bottom-right: white
};

/// Display link output callback. It only forwards a -render request to the main thread,
/// without waiting for it to be handled.
static CVReturn renderCallback(CVDisplayLinkRef displayLink,
                               const CVTimeStamp *inNow,
                               const CVTimeStamp *inOutputTime,
                               CVOptionFlags flagsIn,
                               CVOptionFlags *flagsOut,
                               void *displayLinkContext)
{
    // The context pointer is the view that registered this callback in -prepareOpenGL.
    MyGLView *targetView = (MyGLView*)displayLinkContext;
    [targetView performSelectorOnMainThread:@selector(render) withObject:nil waitUntilDone:NO];
    return kCVReturnSuccess;
}

/// Sets up the display link and the fixed OpenGL state used by -drawRect:.
- (void)prepareOpenGL
{
    [super prepareOpenGL];

    const char *version = (const char *)glGetString(GL_VERSION);
    const char *vendor = (const char *)glGetString(GL_VENDOR);
    const char *renderer = (const char *)glGetString(GL_RENDERER);
    printf("Current OpenGL version: %s, vendor is: %s\n", version, vendor);
    printf("Current OpenGL renderer: %s\n", renderer);

    // Initialise the display link, once. On failure mDisplayLink is left NULL, which
    // -startAnimating and -destroy treat as "no display link".
    if(mDisplayLink == NULL)
    {
        CVReturn result = CVDisplayLinkCreateWithCGDisplay(CGMainDisplayID(), &mDisplayLink);
        if(result == kCVReturnSuccess)
            result = CVDisplayLinkSetOutputCallback(mDisplayLink, renderCallback, self);
        if(result != kCVReturnSuccess)
        {
            printf("Failed to set up the display link: %d\n", (int)result);
            if(mDisplayLink != NULL)
            {
                CVDisplayLinkRelease(mDisplayLink);
                mDisplayLink = NULL;
            }
        }
    }

    // Enable face culling
    glEnable(GL_CULL_FACE);
    // Counter-clockwise winding denotes a front face
    glFrontFace(GL_CCW);
    // Cull back faces
    glCullFace(GL_BACK);
    // Use smooth (gradient) shading
    glShadeModel(GL_SMOOTH);
    // Clear colour of the colour buffer
    glClearColor(0.4, 0.5, 0.4, 1.0);
    // Enable the client-side vertex array
    glEnableClientState(GL_VERTEX_ARRAY);
    // Enable the client-side colour array
    glEnableClientState(GL_COLOR_ARRAY);

    // Viewport size: the view size scaled by the layer's contents scale
    const CGSize viewPort = self.frame.size;
    const CGFloat scale = self.layer.contentsScale;
    glViewport(0, 0, (GLsizei)(viewPort.width * scale), (GLsizei)(viewPort.height * scale));

    // Projection transform
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    glOrtho(-1.0, 1.0, -1.0, 1.0, 1.0, 5.0);

    // From here on only the model-view matrix is modified
    glMatrixMode(GL_MODELVIEW);
}

/// Requests a redraw of the view.
- (void)render
{
    [self setNeedsDisplay:YES];
}

/// Draws one frame of the rectangle and advances the rotation by one degree.
- (void)drawRect:(NSRect)dirtyRect
{
    [super drawRect:dirtyRect];

    // Clear the colour buffer
    glClear(GL_COLOR_BUFFER_BIT);

    // Draw the rectangle, moved into the visible depth range and rotated about the z axis
    glLoadIdentity();
    glTranslatef(0.0f, 0.0f, -3.0f);
    glRotatef(mRotationDegree, 0.0f, 0.0f, 1.0f);
    glVertexPointer(2, GL_FLOAT, 0, sRectVertices);
    glColorPointer(4, GL_FLOAT, 0, sRectColors);
    glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);

    // Update the rotation angle, wrapping around after a full turn
    mRotationDegree += 1.0f;
    if(mRotationDegree >= 360.0f)
        mRotationDegree = 0.0f;

    glFlush();
    [NSOpenGLContext.currentContext flushBuffer];
}

/// Starts the animation. Does nothing if it is already running or if no display link exists.
- (void)startAnimating
{
    if(!mAnimatingStopped || mDisplayLink == NULL)
        return;

    // Only record the animating state when the display link really started.
    if(CVDisplayLinkStart(mDisplayLink) == kCVReturnSuccess)
        mAnimatingStopped = NO;
}

/// Stops the animation. Does nothing if it is already stopped.
- (void)stopAnimating
{
    if(mAnimatingStopped)
        return;

    CVDisplayLinkStop(mDisplayLink);
    mAnimatingStopped = YES;
}

/// Stops and releases the display link. Calling it again afterwards is a no-op.
- (void)destroy
{
    if(mDisplayLink == NULL)
        return;

    // The output callback holds an unretained pointer to this view, so the link must be
    // stopped before it is released and the view goes away. Stopping a link that is not
    // running merely returns an error code.
    CVDisplayLinkStop(mDisplayLink);
    mAnimatingStopped = YES;

    CVDisplayLinkRelease(mDisplayLink);
    mDisplayLink = NULL;
}

@end
这里的渲染很简单,就画一个旋转的正方形,这其实并不会动用太多GPU资源。
最后,我们给出OpenCL内核源代码。我们可以将它保存为test.ocl,然后以文件资源的形式添加到当前Xcode项目工程中。之所以使用ocl作为后缀名,是因为Xcode会把后缀名为cl的文件当作源文件进行编译,而不是当作资源文件打包;换用ocl后缀即可避免这一点。
/// Busy-loop test kernel.
///
/// Each work-item copies one int4 from pInBuf into local memory, increments it
/// (*pInOut * 10000000) times, and work-item 0 finally stores the sum of all components of
/// the 256 local elements back into *pInOut.
///
/// The kernel indexes a 256-element local array by local ID and sums all 256 elements, so it
/// assumes a launch with a single work-group of exactly 256 work-items.
///
/// @param pInBuf  input buffer; element [local id] is read by each work-item
/// @param pInOut  on entry the loop-count multiplier, on exit the computed sum
kernel void test(global int4 *pInBuf, global int *pInOut)
{
    local int4 localMem[256];

    const int index = get_local_id(0);

    // Every work-item reads the parameter before the first barrier; the only write to
    // *pInOut happens after the last barrier.
    const int param = *pInOut;

    // First stage the data in local memory
    localMem[index] = pInBuf[index];
    barrier(CLK_LOCAL_MEM_FENCE);

    const int nLoops = param * 10000000;
    for(int i = 0; i < nLoops; i++)
    {
        const int4 value = localMem[index];
        // OpenCL C vector literals are written as a parenthesised type followed by a
        // parenthesised value list.
        localMem[index] = value + (int4)(1);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if(index == 0)
    {
        // The first work-item performs the final summation
        int4 sum = (int4)(0);
        for(int i = 0; i < 256; i++)
            sum += localMem[i];

        *pInOut = sum.x + sum.y + sum.z + sum.w;
    }
}
这段CL内核源代码也十分简单,就是通过一个循环对GPU的本地存储器不断做递增操作。
全都完成之后,我们可以看到以下效果:
我们可以看到,当我们按下“Compute”按钮之后,不光是OpenGL的渲染,整个窗口的UI全都冻结住了!这说明OpenCL的内核程序的执行完全独占了整个GPU计算资源,而不是部分!所以我们对待GPU还是应该像对待一块加速硬件那样,当你在某个时刻用它的时候,其资源将会被独占,直到任务完成或失败而退出。而由此我们应该能清醒地认识到,GPU对于某些高性能计算领域确实有着十分不错的计算性能优势,但是想取代CPU,那是还差着十万八千里呢~