一、主核心的介绍
1.三个核心在FreeRTOS系统中相互独立,各自负责自己的外设和程序;其中M0和LP核心在同一条总线上,D0单独在另一条总线上,两条总线之间可能是通过AXI4.0通讯(待确认)。
CPU0(M0)-E907架构,320MHz;
CPU1(LP)-E902架构,160MHz;
CPU2(D0)-C906架构,480MHz;
2.CPU0(M0)先启动,然后再由它启动CPU2(D0,C906)和CPU1(LP,E902);相关的代码在博流SDK中的位置如下:/../BOUFFALO_SDK/bsp/board/bl808dk/board.c;
#if defined(CPU_M0)
/**
 * Board bring-up executed on the M0 core.
 *
 * With interrupts masked, halts the D0 and LP cores, initialises flash,
 * clocks, the interrupt controller and the console, optionally brings up
 * the UHS PSRAM, and sets up the heap. It then programs the reset
 * addresses of D0 and LP, restores interrupts, releases both cores and
 * writes the IPC sync flags.
 *
 * The statement order is kept exactly as in the SDK; several steps depend
 * on earlier ones (for example, nothing can be printed before
 * console_init()).
 */
void board_init(void)
{
    int ret = -1;
    uintptr_t flag;

    /* Mask interrupts for the whole configuration phase. */
    flag = bflb_irq_save();

    /* Keep the other two cores halted while the system is being set up. */
    GLB_Halt_CPU(GLB_CORE_ID_D0);
    GLB_Halt_CPU(GLB_CORE_ID_LP);

    /* The result is only reported further down, once the console is up. */
    ret = bflb_flash_init();

    system_clock_init();
    peripheral_clock_init();
    bflb_irq_initialize();

    console_init();

#ifdef CONFIG_PSRAM
#ifndef CONFIG_PSRAM_COPY_CODE
    /* A PSRAM bring-up failure is treated as fatal: spin here forever. */
    if (uhs_psram_init() < 0) {
        while (1) {
        }
    }
#endif
#endif

    /* The heap is the region delimited by the __HeapBase and __HeapLimit symbols. */
    size_t heap_len = ((size_t)&__HeapLimit - (size_t)&__HeapBase);
    kmem_init((void *)&__HeapBase, heap_len);

    bl_show_log();
    if (ret != 0) {
        printf("flash init fail!!!\r\n");
    }
    bl_show_flashinfo();

    /* %d expects an int, so convert the size_t KiB count explicitly
     * instead of passing a size_t through the variadic call. */
    printf("dynamic memory init success,heap size = %d Kbyte \r\n", (int)(heap_len / 1024));

    printf("sig1:%08x\r\n", BL_RD_REG(GLB_BASE, GLB_UART_CFG1));
    printf("sig2:%08x\r\n", BL_RD_REG(GLB_BASE, GLB_UART_CFG2));

    log_start();

#if (defined(CONFIG_LUA) || defined(CONFIG_BFLOG) || defined(CONFIG_FATFS))
    rtc = bflb_device_get_by_name("rtc");
#endif

    /* set CPU D0 boot XIP address and flash address */
    Tzc_Sec_Set_CPU_Group(GLB_CORE_ID_D0, 1);
    /* D0 boot from 0x58000000 */
    GLB_Set_CPU_Reset_Address(GLB_CORE_ID_D0, 0x58000000);
    /* D0 image offset on flash is CONFIG_D0_FLASH_ADDR+0x1000(header) */
    bflb_sf_ctrl_set_flash_image_offset(CONFIG_D0_FLASH_ADDR + 0x1000, 1, SF_CTRL_FLASH_BANK0);

    Tzc_Sec_Set_CPU_Group(GLB_CORE_ID_LP, 0);
    /* LP boot from 0x58020000 */
    GLB_Set_CPU_Reset_Address(GLB_CORE_ID_LP, 0x58020000);

    bflb_irq_restore(flag);

    /* Let the two secondary cores start executing from their reset addresses. */
    GLB_Release_CPU(GLB_CORE_ID_D0);
    GLB_Release_CPU(GLB_CORE_ID_LP);

    /* release d0 and lp, then signal through the IPC sync words that they can run */
    BL_WR_WORD(IPC_SYNC_ADDR1, IPC_SYNC_FLAG);
    BL_WR_WORD(IPC_SYNC_ADDR2, IPC_SYNC_FLAG);
    /* Clean 8 bytes of D-cache starting at IPC_SYNC_ADDR1.
     * NOTE(review): presumably this covers both sync words, i.e.
     * IPC_SYNC_ADDR2 == IPC_SYNC_ADDR1 + 4 - confirm against the header. */
    L1C_DCache_Clean_By_Addr(IPC_SYNC_ADDR1, 8);
}
其中按照顺序依次是
bflb_irq_save(); 关闭全局中断;
GLB_Halt_CPU(GLB_CORE_ID_D0); 暂停(halt)D0核心;
GLB_Halt_CPU(GLB_CORE_ID_LP); 暂停(halt)LP低功耗核心;
bflb_flash_init(); 初始化FLASH;
system_clock_init(); 初始化时钟;
peripheral_clock_init(); 初始化外设时钟;
bflb_irq_initialize(); 初始化中断;
console_init(); 初始化串口;
uhs_psram_init(); 初始化内置的64M UHS_PSRAM ;
/* set CPU D0 boot XIP address and flash address */
Tzc_Sec_Set_CPU_Group(GLB_CORE_ID_D0, 1);
/* D0 boot from 0x58000000 */
GLB_Set_CPU_Reset_Address(GLB_CORE_ID_D0, 0x58000000);
/* D0 image offset on flash is CONFIG_D0_FLASH_ADDR+0x1000(header) */
bflb_sf_ctrl_set_flash_image_offset(CONFIG_D0_FLASH_ADDR + 0x1000, 1, SF_CTRL_FLASH_BANK0);
Tzc_Sec_Set_CPU_Group(GLB_CORE_ID_LP, 0);
/* LP boot from 0x58020000 */
GLB_Set_CPU_Reset_Address(GLB_CORE_ID_LP, 0x58020000);
bflb_irq_restore(flag);
GLB_Release_CPU(GLB_CORE_ID_D0);
GLB_Release_CPU(GLB_CORE_ID_LP);
/* release d0 and then do can run */
BL_WR_WORD(IPC_SYNC_ADDR1, IPC_SYNC_FLAG);
BL_WR_WORD(IPC_SYNC_ADDR2, IPC_SYNC_FLAG);
L1C_DCache_Clean_By_Addr(IPC_SYNC_ADDR1, 8);
3.内存划分,这部分比较抽象,MM内核指的是multi-media内核(手册上多次出现MM前缀的外设,也就是指C906独有的部分),也就是C906(d0)内核,MCU指的就是E907和E902共用的部分;均可使用直接地址访问。
4.C906的外设只能C906使用;E907(M0)的外设,E902(LP)核心也可以使用(因为他们本来就是在一条AHB总线上的)包括串口和IO之类的;默认的MCU部分有3个串口,UART0、UART1、UART2;C906只有一个串口,在系统中默认编号是UART3;
5.XRAM的大小是16K,地址为0x40000000,其设定的意义是让三个核心可以通过这个区域进行IPC通讯;
每个内核都有一组 IPC 的寄存器,包括 IPCx_TRI、IPCx_STS、IPCx_ACK、IPCx_IEN、IPCx_IDIS、IPCx_ISTS 共 6 个寄存器,这些寄存器的长度都是 32bits,每个 bit 都对应 IPC 的一个通道。核 M0、LP、D0 分别对应 IPC0、IPC1、IPC2。当一个核需要向另一个核发通知时,只需要向接收核的 IPCx_TRI 的对应通道写 1 即可,此时接收核的 IPCx_STS 的对应通道即被设置为 1,如果接收核的 IPC 对应通道的中断也被使能,则会收到一个中断,此时即获知了其他核发来的通知。
目前此部分还未做实验,看zhihu上有人实验,D0和M0是可以正常通讯的,好像和LP之间有问题,估计是中断没处理好。地址在这
6.coremark测试,使用官方自带的测试例程进行测试,M0的程序可以正常运行,D0的无法正常运行,待修复;已修复,M0实验结果如下:显示为1111,-O3优化,数据放在STACK中;
____ __ __ _ _ _
| _ \ / _|/ _| | | | | | |
| |_) | ___ _ _| |_| |_ __ _| | ___ | | __ _| |__
| _ < / _ \| | | | _| _/ _` | |/ _ \| |/ _` | '_ \
| |_) | (_) | |_| | | | || (_| | | (_) | | (_| | |_) |
|____/ \___/ \__,_|_| |_| \__,_|_|\___/|_|\__,_|_.__/
Build:13:43:42,Sep 29 2023
Copyright (c) 2022 Bouffalolab team
======== flash cfg ========
flash size 0x01000000
jedec id 0xEF4018
mid 0xEF
iomode 0x04
clk delay 0x01
clk invert 0x01
read reg cmd0 0x05
read reg cmd1 0x35
write reg cmd0 0x01
write reg cmd1 0x31
qe write len 0x01
cread support 0x00
cread code 0xFF
burst wrap cmd 0x77
===========================
dynamic memory init success,heap size = 21 Kbyte
sig1:ffff32ff
sig2:0000ffff
Benchmark started, please make sure it runs for at least 10s.
Now PC=58014b88
2K performance run parameters for coremark.
CoreMark Size : 666
Total ticks : 18067
Total time (secs): 18
Iterations/Sec : 1111
Iterations : 20000
Compiler version : GCC10.2.0
Compiler flags : -O3
Memory location : STACK
seedcrc : 0xe9f5
[0]crclist : 0xe714
[0]crcmatrix : 0x1fd7
[0]crcstate : 0x8e3a
[0]crcfinal : 0x382f
Correct operation validated. See readme.txt for run and reporting rules.
CoreMark 1.0 : 1111 / GCC10.2.0 -O3 / STACK
D0由于代码有问题,测试结果可能不准确,显示为1666:
____ __ __ _ _ _
| _ \ / _|/ _| | | | | | |
| |_) | ___ _ _| |_| |_ __ _| | ___ | | __ _| |__
| _ < / _ \| | | | _| _/ _` | |/ _ \| |/ _` | '_ \
| |_) | (_) | |_| | | | || (_| | | (_) | | (_| | |_) |
|____/ \___/ \__,_|_| |_| \__,_|_|\___/|_|\__,_|_.__/
Build:20:01:44,Sep 29 2023
Copyright (c) 2022 Bouffalolab team
dynamic memory init success,heap size = 59 Kbyte
sig1:ffff32ff
sig2:0000ffff
cgen1:9f7ffffd
Benchmark started, please make sure it runs for at least 10s.
Now PC=580104c2
2K performance run parameters for coremark.
CoreMark Size : 666
Total ticks : 12240
Total time (secs): 12
Iterations/Sec : 1666
Iterations : 20000
Compiler version : GCC10.2.0
Compiler flags : -O3
Memory location : Stack
seedcrc : 0xe9f5
[0]crclist : 0xe714
[0]crcmatrix : 0x1fd7
[0]crcstate : 0x8e3a
[0]crcfinal : 0x382f
Correct operation validated. See readme.txt for run and reporting rules.
CoreMark 1.0 : 1666 / GCC10.2.0 -O3 / Stack
LP的coremark暂时没搞定,跑出来只有2分,不确定是内存不足还是什么情况。
____ __ __ _ _ _
| _ \ / _|/ _| | | | | | |
| |_) | ___ _ _| |_| |_ __ _| | ___ | | __ _| |__
| _ < / _ \| | | | _| _/ _` | |/ _ \| |/ _` | '_ \
| |_) | (_) | |_| | | | || (_| | | (_) | | (_| | |_) |
|____/ \___/ \__,_|_| |_| \__,_|_|\___/|_|\__,_|_.__/
Build:15:26:15,Sep 29 2023
Copyright (c) 2022 Bouffalolab team
lp does not use memheap due to little ram
sig1:32ff76ff
sig2:0000ffff
cgen1:9f7ffffd
Benchmark started, please make sure it runs for at least 10s.
Now PC=580349bc
2K performance run parameters for coremark.
CoreMark Size : 666
Total ticks : 15729
Total time (secs): 15
Iterations/Sec : 2
Iterations : 30
Compiler version : GCC10.2.0
Compiler flags : -O3
Memory location : STACK
seedcrc : 0xe9f5
[0]crclist : 0xe714
[0]crcmatrix : 0x1fd7
[0]crcstate : 0x8e3a
[0]crcfinal : 0xf8b3
Correct operation validated. See readme.txt for run and reporting rules.
CoreMark 1.0 : 2 / GCC10.2.0 -O3 / STACK
二、应用例程探究
看到好多网络上的例程和官方例程,这里着重写几个有意思的;
1、模拟GBA游戏机
例程来源于这里,在这位作者的基础上增加了一个简单的查看FPS的小功能(其实官方仓库里都有);这里做一个对比:ESP32-S3在分辨率为256*160、跳帧为1的情况下是20帧;我测试的BL808的C906在同样的256*160分辨率下是33-37帧,而且后面的日志显示,此时C906主频为400MHz;
Name | Tested hardware | Performance | Notes |
---|---|---|---|
ESP32-S3 | ESP32-S3-WROOM-1-N8R8 | 20 fps | frameskip: 1 |
SDL2 | AMD 3800X | 1800 fps | |
SDL2 | Switch | 314 fps | |
SDL2 | Apple M1 | 2300 fps | |
SDL2 | Vita | 131 fps | frameskip: 1, overclocked |
SDL1 | New 3DS | 111 fps | frameskip: 1, overclocked |
watchOS | Apple Watch Series 5 | 451 fps | Not public yet |
BL808_D0 | SIPEED M1S_DOCK | 33 fps | |
只需要按照原作者的文件,并进行一点点修改就行,增加两个freertos的头文件,增加一个时间获取的函数,计算一下生成120帧的时间,进行FPS平均值统计,结果是33-35FPS;看来对比ESP32-S3还是有点优势的,coremark从ESP32-S3的单核613,提升到BL808_D0的单核1666。
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "gba.h"
#include "globals.h"
#include "memory.h"
#include "sound.h"
/* aos */
#include <aos/kernel.h>
#include <vfs.h>
extern "C" {
#include "lcd.h"
#include <FreeRTOS.h>
#include <task.h>
}
/* Staging buffer for one 256x160 frame of 16-bit pixels; filled by
 * systemDrawScreen() and then handed to the LCD driver. */
uint16_t lcd_buff[256 * 160];
/* Set to 1 by systemDrawScreen(); emuMainLoop() clears it before each frame
 * and runs the emulated CPU until it becomes non-zero again. */
uint8_t frameDrawn = 0;
/* Number of frames completed so far; incremented once per frame in
 * emuMainLoop() and used there for the FPS printout. */
uint32_t frameCount = 0;
/*
 * Frame-complete callback: marks the frame as drawn, copies the 256x160
 * emulator framebuffer (pix) into lcd_buff with the two bytes of every
 * 16-bit pixel swapped, and pushes the result to the ST7789V panel at
 * offset (20, 40).
 */
void systemDrawScreen(void)
{
    const int pixelCount = 256 * 160;
    uint16_t *frame = pix;

    frameDrawn = 1;

    /* The frame is contiguous, so one flat pass covers all rows and columns. */
    for (int i = 0; i < pixelCount; i++)
    {
        lcd_buff[i] = __builtin_bswap16(frame[i]);
    }

    st7789v_spi_draw_picture_blocking(20, 40, 20 + 256 - 1, 40 + 160 - 1, lcd_buff);
}
/* Audio callback stub: sound output is not implemented, so the samples are dropped. */
void systemOnWriteDataToSoundBuffer(int16_t *finalWave, int length)
{
    (void)finalWave;
    (void)length;
}
/*
 * printf-style message hook: formats the arguments into a fixed 256-byte
 * buffer (longer output is truncated by vsnprintf) and prints the result
 * with a "GBA: " prefix.
 */
void systemMessage(const char *fmt, ...)
{
    va_list ap;
    char text[256];

    va_start(ap, fmt);
    vsnprintf(text, sizeof(text), fmt, ap);
    va_end(ap);

    printf("GBA: %s", text);
}
extern "C" {
extern uint32_t bl808_key_read();
/*
 * Emulator entry point: loads the ROM image from /flash/goodBoyAdv.gba into
 * the emulator's rom buffer, initialises the emulated CPU and then runs
 * forever, one frame per outer-loop iteration. Every 120 frames the average
 * frame rate over that window is printed.
 *
 * This function never returns.
 */
void emuMainLoop()
{
    int fd = aos_open("/flash/goodBoyAdv.gba", 0);
    if (fd >= 0)
    {
        /* Seek to the end to learn the file size, then rewind for the read. */
        int rom_len = aos_lseek(fd, 0, SEEK_END);
        aos_lseek(fd, 0, SEEK_SET);

        if (rom_len > 0)
        {
            /* NOTE(review): the capacity of rom is not visible here; confirm
             * the ROM file cannot be larger than that buffer. */
            if (aos_read(fd, rom, rom_len) != rom_len)
            {
                printf("GBA: short read while loading ROM\r\n");
            }
        }
        else
        {
            /* Never hand a zero or negative length to aos_read(). */
            printf("GBA: ROM file is empty or its size could not be read\r\n");
        }
        aos_close(fd);
    }
    else
    {
        /* Startup still continues as before; the failure is now at least visible. */
        printf("GBA: failed to open /flash/goodBoyAdv.gba\r\n");
    }

    CPUSetupBuffers();
    CPUInit(NULL, false);
    CPUReset();

    TickType_t fpsTick = xTaskGetTickCount();
    while (1)
    {
        joy = bl808_key_read();
        UpdateJoypad();

        /* systemDrawScreen() sets frameDrawn once a full frame has been rendered. */
        frameDrawn = 0;
        while (!frameDrawn)
        {
            CPULoop();
        }

        frameCount++;
        if (frameCount % 120 == 0)
        {
            TickType_t now = xTaskGetTickCount();
            int msPassed = (now - fpsTick) * portTICK_PERIOD_MS;
            fpsTick = now;

            /* Skip the report rather than divide by zero if no tick elapsed. */
            if (msPassed > 0)
            {
                int fps = 120 * 1000 / msPassed;
                printf("FPS: %d\r\n", fps);
            }
        }
    }
}
}
Starting bl808 now....
Heap Info: 29819 KB @ [0x0x00000000522e10f8 ~ 0x0x0000000054000000]
[OS] Starting aos_loop_proc task...
[OS] Start c906 xram handle...
[OS] Starting OS Scheduler...
init ring:0,tx:0x0000000022020140,rx:0x0000000000000000
init ring:2,tx:0x0000000022021340,rx:0x0000000022020340
init ring:3,tx:0x0000000022022540,rx:0x0000000022022340
init ring:4,tx:0x0000000022022840,rx:0x0000000022022740
init ring:5,tx:0x0000000000000000,rx:0x0000000000000000
Init CLI with event Driven
FPS: 36
FPS: 37
FPS: 36
FPS: 37
FPS: 37
FPS: 36
FPS: 37
FPS: 37
FPS: 36
FPS: 37
FPS: 36
FPS: 36
FPS: 36
FPS: 34
FPS: 35
FPS: 34
FPS: 34
FPS: 35
FPS: 35
FPS: 34
FPS: 35
FPS: 34
FPS: 34
FPS: 35