In the Linux kernel's AMDGPU driver, dma-fence plays a central role. AMDGPU render and decode operations often have several parties referencing the same buffer at once. Take rendering or video decoding as an example: after the application writes render/decode commands into a buffer shared with the GPU, it submits the task to the GPU's run queue to await execution. At that point the application must not only monitor how the task in the buffer progresses, it must also guarantee that the buffer's contents are not modified before the task completes, while AMDGPU in turn needs exclusive access to execute the commands in the buffer. Once the GPU finishes, the application wants to learn of the completion promptly so it can recycle and reuse the buffer; the notification is usually delivered by an interrupt bound to that buffer.
If all of this were protected and synchronized with the classic shared-buffer-plus-lock approach, it would not only be very inefficient, it would also tangle assorted kernel mechanisms together without a unified management framework, making the code hard to maintain. dma-fence instead provides a simple, convenient framework that embeds atomic operations, sleep/wakeup, and synchronous/asynchronous event notification into the various buffer management frameworks, ties these mechanisms together organically, reduces busy-waiting in user space, and makes buffer usage smarter and more efficient.
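To ground the discussion before getting into the AMDGPU specifics, here is a minimal sketch of the core dma-fence lifecycle: a producer initializes and publishes a fence, waiters block on it, and the consumer signals it when the work completes. The ops table, lock, and names are illustrative stand-ins, not AMDGPU's real ones.

#include <linux/dma-fence.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);

static const char *demo_get_name(struct dma_fence *f)
{
	return "demo"; /* driver name and timeline name, illustrative only */
}

/* no .release: the default dma_fence_free() kfree_rcu()s the fence */
static const struct dma_fence_ops demo_fence_ops = {
	.get_driver_name = demo_get_name,
	.get_timeline_name = demo_get_name,
};

/* producer: create and publish a fence for a submitted job */
static struct dma_fence *demo_emit(u64 context, u64 seqno)
{
	struct dma_fence *f = kzalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return NULL;
	dma_fence_init(f, &demo_fence_ops, &demo_lock, context, seqno);
	return f;
}

/* consumer (a completion interrupt in a real driver): mark the job done */
static void demo_complete(struct dma_fence *f)
{
	dma_fence_signal(f); /* wakes every dma_fence_wait() sleeper */
	dma_fence_put(f);    /* drop the producer's reference */
}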
Taking AMDGPU video decoding as an example, the dma-fence ring buffer decouples the application's decode requests from the decode tasks themselves; both task submission and decode-completion notification go through the dma-fence bound to the buffer:
To make the analysis easier, I extracted this part of the AMDGPU implementation from the 5.4 kernel into a standalone loadable kernel module demo, similar to a CMODEL. In the AMDGPU driver the task-submission context and the decode-completion-notification context are a kernel thread and an interrupt respectively; in the demo both are implemented as kernel threads.
The working model is shown in the figure below:
The fence array has 256 slots, each slot holding one fence, and forms a ring buffer. sync_seq is the write sequence number; it only ever increases, and sync_seq mod 256 gives the index into the array, i.e. the ring buffer's write pointer.
last_seq is the sequence number of what has been consumed so far (buffers whose bound fences have completed processing). Like sync_seq it is a monotonically increasing natural sequence, and last_seq mod 256 gives its index into the array, i.e. the ring buffer's read pointer.
fence_seq, finally, is a sample of the write position. The hardware ring in AMDGPU is shared by all processes, and any process may submit a new render task at any moment, so sync_seq is constantly changing and unsuitable to work against directly. fence_seq is a snapshot taken at a point in time: all buffers (fences) between last_seq and fence_seq are signaled together in the next batch.
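To make the index arithmetic concrete, here is a small sketch of how the monotonically increasing sequence numbers map onto the 256 slots. The masking trick relies on the slot count being a power of two, and the helper mirrors fencedrv_get_ring_avail in the demo below, which despite its name returns the number of fences still in flight. The sample numbers in the comment are hypothetical.

#define NUM_SLOTS 256u            /* num_hw_submission * 2 */
#define SLOT_MASK (NUM_SLOTS - 1) /* valid because NUM_SLOTS is a power of two */

/* e.g. sync_seq = 700, last_seq = 650:
 * write index = 700 & 255 = 188, read index = 650 & 255 = 138,
 * so 50 fences are in flight and 206 slots are free. */
static u32 slots_in_flight(u32 sync_seq, u32 last_seq)
{
	u32 w = sync_seq & SLOT_MASK;
	u32 r = last_seq & SLOT_MASK;

	return (w >= r) ? (w - r) : (w + NUM_SLOTS - r);
}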
When submission runs fast but consumption runs slow, the writer catches up with the reader and producer-consumer synchronization becomes necessary: the read side back-pressures the write side, and the writer calls dma_fence_wait to synchronize:
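Condensed to its essentials, that writer-side path looks like the sketch below, assuming fences and mask are the ring's slot array and index mask; the full version, with logging and teardown handling, is fence_emit_task_thread in the demo.

#include <linux/dma-fence.h>
#include <linux/rcupdate.h>

/* block until the slot's previous occupant signals, then publish the new fence */
static int publish_fence(struct dma_fence __rcu **fences, u32 mask,
			 u32 seq, struct dma_fence *new_fence)
{
	struct dma_fence __rcu **ptr = &fences[seq & mask];
	struct dma_fence *old;
	int r = 0;

	rcu_read_lock();
	old = dma_fence_get_rcu_safe(ptr); /* reference the old occupant, if any */
	rcu_read_unlock();

	if (old) {
		r = dma_fence_wait(old, false); /* back-pressure: sleep until it signals */
		dma_fence_put(old);
		if (r)
			return r;
	}
	rcu_assign_pointer(*ptr, dma_fence_get(new_fence));
	return 0;
}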
When the consumer finishes the next task, it releases the current slot and calls dma_fence_signal to wake the writer thread; on receiving the signal, the writer continues and fills new tasks into the fence array:
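The consumer side walks every slot between the old and the new read pointer, signaling and dropping each fence. A condensed sketch follows (the demo's full version is fence_recv_task_thread); last_seq and seq here are assumed to be already-masked slot indices.

/* signal every fence in (last_seq, seq] and release the ring's references */
static void consume_fences(struct dma_fence __rcu **fences, u32 mask,
			   u32 last_seq, u32 seq)
{
	while (last_seq != seq) {
		struct dma_fence *fence;

		last_seq = (last_seq + 1) & mask;
		fence = rcu_dereference_protected(fences[last_seq], 1);
		RCU_INIT_POINTER(fences[last_seq], NULL); /* free the slot for reuse */
		if (!fence)
			continue;
		dma_fence_signal(fence); /* wakes writers blocked in dma_fence_wait() */
		dma_fence_put(fence);    /* drop the ring's reference */
	}
}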
Freeing the dma-fence
Like pulling up a radish and bringing the mud with it, freeing the job fence triggers the release of the scheduled fence and the finished fence. Each release is driven by the callback that fires when the reference count drops to zero, and inside the callback RCU is used to free the fence's memory asynchronously.
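The underlying pattern is sketched below: ->release fires when the refcount hits zero and defers the actual kfree past an RCU grace period, because ring slots are dereferenced under RCU and a reader may still be looking at the fence. The function names are illustrative; the demo's counterparts are fencedrv_dma_fence_release and fencedrv_fence_free.

/* RCU callback: no reader can hold a stale pointer anymore, safe to free */
static void my_fence_free(struct rcu_head *rcu)
{
	struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);

	kfree(f);
}

/* ->release: runs on refcount zero; defer the free past a grace period */
static void my_fence_release(struct dma_fence *fence)
{
	call_rcu(&fence->rcu, my_fence_free);
}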
These are the main corner cases encountered when using the fence array.
The full demo code:
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/dma-fence.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <uapi/linux/sched/types.h>
#define assert(expr) \
	do { \
		if (!(expr)) { \
			printk("Assertion failed! %s,%s,%s,line=%d\n", \
			       #expr, __FILE__, __func__, __LINE__); \
			BUG(); \
		} \
	} while (0)
// the fence slot array holds num_hw_submission * 2 = 256 entries.
#define num_hw_submission 128
struct fence_driver {
uint64_t gpu_addr;
volatile uint32_t *cpu_addr;
uint32_t sync_seq;
atomic_t last_seq;
bool initialized;
bool initialized_emit;
bool initialized_recv;
unsigned num_fences_mask;
spinlock_t lock;
struct dma_fence **fences;
struct mutex mutex;
struct dma_fence *last_fence;
struct timer_list timer;
struct timer_list work_timer;
wait_queue_head_t job_scheduled;
};
struct fence_set {
struct dma_fence scheduled;
struct dma_fence finished;
spinlock_t lock;
};
struct job_fence {
struct dma_fence job;
void *data;
};
// fence_seq stands in for the sequence value the GPU would write back to
// memory; the "GPU" timer below advances it in batches.
static uint32_t fence_seq;
static struct fence_driver *ring;
static struct task_struct *fence_emit_task;
static struct task_struct *fence_recv_task;
static struct kmem_cache *job_fence_slab;
static struct kmem_cache *sched_fence_slab;
static const char *dma_fence_get_name(struct dma_fence *fence)
{
return "dma-fence-drv";
}
static bool dma_fence_enable_signal(struct dma_fence *fence)
{
if (!timer_pending(&ring->work_timer)) {
mod_timer(&ring->work_timer, jiffies + HZ / 10);
}
printk("%s line %d, signal fenceno %lld.\n", __func__, __LINE__, fence->seqno);
return true;
}
void fencedrv_fence_free(struct rcu_head *rcu)
{
struct fence_set *fs;
struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
struct job_fence *jb = container_of(f, struct job_fence, job);
assert(jb->data != NULL);
fs = (struct fence_set *)jb->data;
	// every dma_fence_get must be matched by a dma_fence_put over the fence's lifetime.
	assert((kref_read(&fs->scheduled.refcount) == 1));
	assert((kref_read(&fs->finished.refcount) == 1));
	// putting the finished fence also puts the scheduled fence via
	// fencedrv_dma_fence_release_finished, so no explicit put on scheduled here.
	dma_fence_put(&fs->finished);
kmem_cache_free(job_fence_slab, jb);
// dump_stack();
}
static void fencedrv_dma_fence_release(struct dma_fence *fence)
{
	// typical dma-fence release pattern: defer the free past an RCU grace period.
call_rcu(&fence->rcu, fencedrv_fence_free);
}
static const struct dma_fence_ops fence_ops = {
.get_driver_name = dma_fence_get_name,
.get_timeline_name = dma_fence_get_name,
.enable_signaling = dma_fence_enable_signal,
.release = fencedrv_dma_fence_release,
};
static int32_t fencedrv_get_ring_avail(void)
{
uint32_t read_seq, write_seq;
do {
read_seq = atomic_read(&ring->last_seq);
write_seq = ring->sync_seq;
} while (atomic_read(&ring->last_seq) != read_seq);
read_seq &= ring->num_fences_mask;
write_seq &= ring->num_fences_mask;
pr_err("%s line %d, read_seq %d, write_seq %d.\n",
__func__, __LINE__, read_seq, write_seq);
if (read_seq <= write_seq) {
return write_seq - read_seq;
} else {
return write_seq + num_hw_submission * 2 - read_seq;
}
}
static const char *dma_fence_get_name_scheduled(struct dma_fence *fence)
{
return "dma-fence-scheduled";
}
static const char *dma_fence_get_name_finished(struct dma_fence *fence)
{
return "dma-fence-finished";
}
static void sched_fence_free(struct rcu_head *head);
static void fencedrv_dma_fence_release_scheduled(struct dma_fence *fence)
{
struct fence_set *fs = container_of(fence, struct fence_set, scheduled);
	// defer freeing the whole fence_set via RCU, reusing the finished
	// fence's rcu head.
call_rcu(&fs->finished.rcu, sched_fence_free);
}
static void fencedrv_dma_fence_release_finished(struct dma_fence *fence)
{
struct fence_set *fs = container_of(fence, struct fence_set, finished);
dma_fence_put(&fs->scheduled);
//while (1);
}
static const struct dma_fence_ops fence_scheduled_ops = {
.get_driver_name = dma_fence_get_name_scheduled,
.get_timeline_name = dma_fence_get_name_scheduled,
.release = fencedrv_dma_fence_release_scheduled,
};
static const struct dma_fence_ops fence_finished_ops = {
.get_driver_name = dma_fence_get_name_finished,
.get_timeline_name = dma_fence_get_name_finished,
.release = fencedrv_dma_fence_release_finished,
};
static struct fence_set *to_sched_fence(struct dma_fence *f)
{
if (f->ops == &fence_scheduled_ops) {
return container_of(f, struct fence_set, scheduled);
}
if (f->ops == &fence_finished_ops) {
return container_of(f, struct fence_set, finished);
}
return NULL;
}
static void sched_fence_free(struct rcu_head *head)
{
struct dma_fence *f = container_of(head, struct dma_fence, rcu);
struct fence_set *fs = to_sched_fence(f);
if (fs == NULL)
return;
assert(f == &fs->finished);
kmem_cache_free(sched_fence_slab, fs);
dump_stack();
}
static struct fence_set *init_fence_set(void)
{
struct fence_set *fs = kmem_cache_alloc(sched_fence_slab, GFP_KERNEL);
if (fs == NULL) {
pr_err("%s line %d, alloc fence set from fence set slab failure.\n",
__func__, __LINE__);
return NULL;
}
spin_lock_init(&fs->lock);
dma_fence_init(&fs->scheduled, &fence_scheduled_ops, &fs->lock, 0, 0);
dma_fence_init(&fs->finished, &fence_finished_ops, &fs->lock, 0, 0);
return fs;
}
// ref amdgpu_fence_process
static int fence_recv_task_thread(void *data)
{
struct sched_param sparam = {.sched_priority = 1};
sched_setscheduler(current, SCHED_FIFO, &sparam);
//mutex_lock(&ring->mutex);
while (ring->initialized == false) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (ring->initialized == true) {
break;
}
//mutex_unlock(&ring->mutex);
schedule();
//mutex_lock(&ring->mutex);
}
set_current_state(TASK_RUNNING);
//mutex_unlock(&ring->mutex);
while (!kthread_should_stop() && ring->initialized_recv == true) {
uint32_t seqno_next = 0;
uint32_t seq, last_seq;
int r;
do {
// last_seq is the read pointer of fence ring buffer.
last_seq = atomic_read(&ring->last_seq);
seq = *ring->cpu_addr;
if (kthread_should_stop())
return 0;
} while (atomic_cmpxchg(&ring->last_seq, last_seq, seq) != last_seq);
if (del_timer(&ring->work_timer) &&
seq != ring->sync_seq) {
mod_timer(&ring->work_timer, jiffies + HZ / 10);
}
//printk("%s line %d, last_seq %d, seq %d, sync_seq %d.\n", __func__, __LINE__, last_seq, seq, ring->sync_seq);
if (unlikely(seq == last_seq)) {
msleep(10);
continue;
}
assert(seq > last_seq);
last_seq &= ring->num_fences_mask;
seq &= ring->num_fences_mask;
//printk("%s line %d, last_seq %d, seq %d, sync_seq %d.\n", __func__, __LINE__, last_seq, seq, ring->sync_seq);
do {
struct dma_fence *fence, **ptr;
++last_seq;
last_seq &= ring->num_fences_mask;
ptr = &ring->fences[last_seq];
fence = rcu_dereference_protected(*ptr, 1);
RCU_INIT_POINTER(*ptr, NULL);
if (!fence) {
continue;
}
if (seqno_next == 0 || seqno_next == fence->seqno) {
seqno_next = fence->seqno + 1;
			} else {
				pr_err("%s line %d, seqno not contiguous, expect %d, actual %lld.\n",
				       __func__, __LINE__, seqno_next, fence->seqno);
}
printk("%s line %d, last_seq/slot %d, seq %d, signal %lld.\n",
__func__, __LINE__, last_seq, seq, fence->seqno);
if (list_empty(&fence->cb_list)) {
printk("%s line %d, fence cb list is empty.\n",
__func__, __LINE__);
} else {
printk("%s line %d, fence cb list is not empty.\n",
__func__, __LINE__);
}
r = dma_fence_signal(fence);
if (kthread_should_stop()) {
dma_fence_put(fence);
return 0;
}
if (r) {
pr_err("%s line %d, fence already signaled.\n",
__func__, __LINE__);
continue;
//BUG();
}
dma_fence_put(fence);
} while (last_seq != seq);
wake_up(&ring->job_scheduled);
}
set_current_state(TASK_RUNNING);
return 0;
}
// ref amdgpu_fence_emit.
static int fence_emit_task_thread(void *data)
{
int r;
uint64_t oldwaitseqno = 0;
struct sched_param sparam = {.sched_priority = 1};
sched_setscheduler(current, SCHED_FIFO, &sparam);
//mutex_lock(&ring->mutex);
while (ring->initialized == false) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (ring->initialized == true) {
break;
}
//mutex_unlock(&ring->mutex);
schedule();
//mutex_lock(&ring->mutex);
}
set_current_state(TASK_RUNNING);
//mutex_unlock(&ring->mutex);
while (!kthread_should_stop() && ring->initialized_emit == true) {
#if 0
msleep(1000);
printk("%s line %d.\n", __func__, __LINE__);
#else
struct dma_fence __rcu **ptr;
struct job_fence *fence;
uint32_t seq;
struct fence_set *fs = init_fence_set();
fence = kmem_cache_alloc(job_fence_slab, GFP_KERNEL);
if (fence == NULL) {
pr_err("%s line %d, alloc fence from fence slab failure.\n",
__func__, __LINE__);
return -1;
}
// ring->sync_seq is fence ring write pointer.
seq = ++ring->sync_seq;
dma_fence_init(&fence->job, &fence_ops, &ring->lock, 0, seq);
fence->data = fs;
ptr = &ring->fences[seq & ring->num_fences_mask];
//printk("%s line %d, seq = %d.\n", __func__, __LINE__, seq);
if (kthread_should_stop()) {
// will call fence_ops.release directly to free the fence.
dma_fence_put(&fence->job);
continue;
}
if (unlikely(rcu_dereference_protected(*ptr, 1))) {
struct dma_fence *old;
int diff;
rcu_read_lock();
old = dma_fence_get_rcu_safe(ptr);
rcu_read_unlock();
if (old) {
mutex_lock(&ring->mutex);
//dma_fence_get(old);
ring->last_fence = old;
mutex_unlock(&ring->mutex);
r = dma_fence_wait(old, false);
mutex_lock(&ring->mutex);
ring->last_fence = NULL;
dma_fence_put(old);
mutex_unlock(&ring->mutex);
if (kthread_should_stop() || r) {
// will call fence_ops.release directly to free the fence.
dma_fence_put(&fence->job);
continue;
}
			// if wrap-around happened, seq and old->seqno must be congruent:
			// seq ≡ old->seqno (mod num_hw_submission * 2), i.e.
			// seq = q * (num_hw_submission * 2) + old->seqno, with q = 1 typically.
diff = seq - old->seqno;
printk("%s line %d, fence wokenup, wakeseqno %lld, new adding seq %d, slot %d, diff %d, waken interval %lld, latestseq %d, avail %d.\n",
__func__, __LINE__, old->seqno, seq, seq & ring->num_fences_mask, diff, old->seqno - oldwaitseqno,
ring->sync_seq, fencedrv_get_ring_avail());
if (diff != num_hw_submission * 2) {
pr_err("%s line %d, fatal error, diff not match totoal ring.\n",
__func__, __LINE__);
}
oldwaitseqno = old->seqno;
}
}
#if 0
printk("%s line %d, fence emit, seqno %lld, seq %d, slot %d.\n",
__func__, __LINE__, fence->seqno, seq, seq & ring->num_fences_mask);
#endif
rcu_assign_pointer(*ptr, dma_fence_get(&fence->job));
		// no outside user holds this fence, so drop our reference here;
		// the ring slot's reference keeps it alive until it is signaled.
dma_fence_put(&fence->job);
#endif
}
set_current_state(TASK_RUNNING);
return 0;
}
// fallback signaling path: periodically does the same batch-signal walk as
// fence_recv_task_thread, in case the receive thread lags behind.
void work_timer_fn(struct timer_list *timer)
{
uint32_t seqno_next = 0;
uint32_t seq, last_seq;
int r;
do {
last_seq = atomic_read(&ring->last_seq);
seq = *ring->cpu_addr;
} while (atomic_cmpxchg(&ring->last_seq, last_seq, seq) != last_seq);
if (unlikely(seq == last_seq)) {
goto end;
}
assert(seq > last_seq);
last_seq &= ring->num_fences_mask;
seq &= ring->num_fences_mask;
do {
struct dma_fence *fence, **ptr;
++last_seq;
last_seq &= ring->num_fences_mask;
ptr = &ring->fences[last_seq];
fence = rcu_dereference_protected(*ptr, 1);
RCU_INIT_POINTER(*ptr, NULL);
if (!fence) {
continue;
}
if (seqno_next == 0 || seqno_next == fence->seqno) {
seqno_next = fence->seqno + 1;
		} else {
			pr_err("%s line %d, seqno not contiguous, expect %d, actual %lld.\n",
			       __func__, __LINE__, seqno_next, fence->seqno);
}
r = dma_fence_signal(fence);
if (r) {
pr_err("%s line %d, fence already signaled.\n",
__func__, __LINE__);
continue;
//BUG();
}
dma_fence_put(fence);
} while (last_seq != seq);
end:
pr_err("%s line %d, work timer triggerd.\n", __func__, __LINE__);
mod_timer(timer, jiffies + HZ / 10);
}
// simulates the GPU consuming jobs: periodically advances fence_seq, the
// sampled "hardware" read position, in batches.
void gpu_process_thread(struct timer_list *timer)
{
uint32_t seq, oldseq;
seq = ring->sync_seq;
oldseq = fence_seq;
// trigger a job done on device.
if (fence_seq == 0) {
if (seq > 6)
fence_seq = seq - 4;
} else if ((seq - fence_seq) > 10) {
fence_seq += (seq - fence_seq) / 2;
assert(fence_seq > oldseq);
}
printk("%s line %d, timer trigger job, latest consume fence %d.\n",
__func__, __LINE__, fence_seq);
mod_timer(timer, jiffies + HZ / 2);
}
static int fencedrv_wait_empty(void)
{
uint64_t seq = READ_ONCE(ring->sync_seq);
struct dma_fence *fence, **ptr;
int r;
if (!seq)
return 0;
fence_seq = seq;
ptr = &ring->fences[seq & ring->num_fences_mask];
rcu_read_lock();
fence = rcu_dereference(*ptr);
if (!fence || !dma_fence_get_rcu(fence)) {
rcu_read_unlock();
return 0;
}
rcu_read_unlock();
r = dma_fence_wait(fence, false);
printk("%s line %d, wait last fence %lld, seq %lld, r %d.\n", \
__func__, __LINE__, fence->seqno, seq, r);
dma_fence_put(fence);
return r;
}
static int __init fencedrv_init(void)
{
	if ((num_hw_submission & (num_hw_submission - 1)) != 0) {
		pr_err("%s line %d, num_hw_submission must be a power of two.\n",
		       __func__, __LINE__);
		return -EINVAL;
	}
ring = kzalloc(sizeof(*ring), GFP_KERNEL);
if (ring == NULL) {
pr_err("%s line %d, alloc fence driver failure.\n",
__func__, __LINE__);
return -ENOMEM;
}
	// fence_seq is a snapshot of sync_seq, used to process fences in batches.
ring->cpu_addr = &fence_seq;
ring->gpu_addr = (uint64_t)&fence_seq;
ring->sync_seq = 0;
atomic_set(&ring->last_seq, 0);
ring->initialized = false;
ring->initialized_emit = false;
ring->initialized_recv = false;
ring->last_fence = NULL;
ring->num_fences_mask = num_hw_submission * 2 - 1;
init_waitqueue_head(&ring->job_scheduled);
spin_lock_init(&ring->lock);
	ring->fences = kcalloc(num_hw_submission * 2, sizeof(void *), GFP_KERNEL);
	if (!ring->fences) {
		pr_err("%s line %d, alloc fence buffer failure.\n",
		       __func__, __LINE__);
		kfree(ring);
		return -ENOMEM;
	}
	printk("%s line %d, fence mask 0x%x, num_hw_submission 0x%x.\n",
	       __func__, __LINE__, ring->num_fences_mask, num_hw_submission);
	job_fence_slab = kmem_cache_create("job_fence_slab", sizeof(struct job_fence), 0,
					   SLAB_HWCACHE_ALIGN, NULL);
	if (!job_fence_slab) {
		pr_err("%s line %d, alloc job_fence_slab failure.\n",
		       __func__, __LINE__);
		kfree(ring->fences);
		kfree(ring);
		return -ENOMEM;
	}
	sched_fence_slab = kmem_cache_create("sched_fence_slab", sizeof(struct fence_set), 0,
					     SLAB_HWCACHE_ALIGN, NULL);
	if (!sched_fence_slab) {
		pr_err("%s line %d, alloc sched_fence_slab failure.\n",
		       __func__, __LINE__);
		kmem_cache_destroy(job_fence_slab);
		kfree(ring->fences);
		kfree(ring);
		return -ENOMEM;
	}
mutex_init(&ring->mutex);
fence_emit_task = kthread_run(fence_emit_task_thread, NULL, "fence_emit");
if (IS_ERR(fence_emit_task)) {
pr_err("%s line %d, create fence emit tsk failure.\n",
__func__, __LINE__);
return -1;
}
fence_recv_task = kthread_run(fence_recv_task_thread, NULL, "fence_recv");
if (IS_ERR(fence_recv_task)) {
pr_err("%s line %d, create fence recv tsk failure.\n",
__func__, __LINE__);
return -1;
}
	// mod_timer() alone both arms the timer and sets its expiry; calling
	// add_timer() first would use an uninitialized expires value.
	timer_setup(&ring->timer, gpu_process_thread, TIMER_IRQSAFE);
	mod_timer(&ring->timer, jiffies + HZ / 2);
	timer_setup(&ring->work_timer, work_timer_fn, TIMER_IRQSAFE);
	mod_timer(&ring->work_timer, jiffies + HZ / 10);
printk("%s line %d, module init.\n", __func__, __LINE__);
ring->initialized = true;
ring->initialized_emit = true;
ring->initialized_recv = true;
wake_up_process(fence_emit_task);
wake_up_process(fence_recv_task);
return 0;
}
static void __exit fencedrv_exit(void)
{
printk("%s line %d, module unload task begin.\n", __func__, __LINE__);
del_timer_sync(&ring->work_timer);
mutex_lock(&ring->mutex);
if ((ring->last_fence != NULL) &&
(!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &ring->last_fence->flags))) {
ring->initialized_emit = false;
dma_fence_signal(ring->last_fence);
dma_fence_put(ring->last_fence);
}
mutex_unlock(&ring->mutex);
kthread_stop(fence_emit_task);
printk("%s line %d, module unload task mid.\n", __func__, __LINE__);
del_timer_sync(&ring->timer);
fencedrv_wait_empty();
printk("%s line %d, sync wait avail %d.\n", __func__, __LINE__, fencedrv_get_ring_avail());
wait_event_killable(ring->job_scheduled, fencedrv_get_ring_avail() <= 1);
ring->initialized_recv = false;
kthread_stop(fence_recv_task);
printk("%s line %d, module unload task end.\n", __func__, __LINE__);
ring->initialized = false;
rcu_barrier();
kmem_cache_destroy(job_fence_slab);
kmem_cache_destroy(sched_fence_slab);
kfree(ring->fences);
kfree(ring);
fence_emit_task = NULL;
fence_recv_task = NULL;
printk("%s line %d, module unload.\n", __func__, __LINE__);
}
module_init(fencedrv_init);
module_exit(fencedrv_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("czl");