前文:
https://hknaruto.blog.csdn.net/article/details/130408240
测试程序
/**
tcti.cpp
参考:
https://www.cnblogs.com/organic/p/17321523.html
g++ -std=c++11 -lpthread trigger_cgroup_timer_inactive.cpp -o inactive_timer
./inactive_timer 100000 10000
*/
#include <errno.h>
#include <iostream>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#ifndef CPU_CORE_COUNT
#define CPU_CORE_COUNT 64
#endif
using namespace std;
std::string sub_cgroup_dir("/sys/fs/cgroup/cpu/test");
// common lib
// Reports whether `path` refers to an existing directory.
//
// path: filesystem path to inspect.
// Returns true when stat() succeeds and the reported mode is a directory;
// returns false when stat() fails or the entry is of any other type.
bool is_dir(const std::string &path) {
  struct stat info;
  const bool stat_ok = (stat(path.c_str(), &info) == 0);
  // `info` is only inspected when stat() succeeded (short-circuit).
  return stat_ok && S_ISDIR(info.st_mode);
}
// Writes the decimal text of `num` to `file_path`, truncating the file first.
//
// file_path: file to open in "w" mode.
// num:       integer whose std::to_string() form is written (no newline).
// Returns true only if the open, the fputs() and the fclose() all succeed.
//
// stdio buffers the text, so the data normally reaches the file when
// fclose() flushes it; a write the target rejects therefore shows up as an
// fclose() failure and must be checked to avoid reporting false success.
bool write_file(const std::string &file_path, int num) {
  FILE *fp = fopen(file_path.c_str(), "w");
  if (fp == nullptr) {
    return false;
  }
  const std::string write_data = std::to_string(num);
  const bool put_ok = (fputs(write_data.c_str(), fp) != EOF);
  // Always close, even when fputs() failed, so the stream is not leaked.
  const bool close_ok = (fclose(fp) == 0);
  return put_ok && close_ok;
}
// Reads the beginning of `file_path` and returns it as a string.
//
// file_path: file to open in "r" mode.
// Returns the literal "read error...\n" when the file cannot be opened.
// Otherwise returns at most 511 bytes of content, ending at the first NUL
// byte if the data contains one.
std::string read_file(const std::string &file_path) {
  FILE *fp = fopen(file_path.c_str(), "r");
  if (fp == nullptr) {
    return "read error...\n";
  }
  char buff[512];
  // Read one byte less than the buffer holds and terminate explicitly, so the
  // std::string constructor never scans past the array on large files.
  const size_t got = fread(buff, 1, sizeof(buff) - 1, fp);
  buff[got] = '\0';
  fclose(fp);
  return std::string(buff);
}
// Returns the current time reported by gettimeofday(), converted to
// milliseconds (seconds * 1000 plus microseconds / 1000).
long get_ms_timestamp() {
  struct timeval now;
  gettimeofday(&now, nullptr);
  const auto from_seconds = now.tv_sec * 1000;
  const auto from_micros = now.tv_usec / 1000;
  return from_seconds + from_micros;
}
// cgroup
// Makes sure the directory named by `sub_cgroup_dir` exists, then writes this
// process's pid into its "cgroup.procs" file.
//
// Returns false if the directory is missing and mkdir() fails; otherwise
// returns the result of the write_file() call for cgroup.procs.
// NOTE(review): writing the pid is presumably what attaches the process to
// the cgroup — confirm against the cgroup v1 documentation.
bool create_cgroup() {
  const bool dir_missing = !is_dir(sub_cgroup_dir);
  if (dir_missing && mkdir(sub_cgroup_dir.c_str(), S_IRWXU | S_IRGRP) != 0) {
    std::cout << "mkdir cgroup dir fail" << std::endl;
    return false;
  }

  const int self_pid = getpid();
  std::cout << "pid is " << self_pid << std::endl;

  return write_file(sub_cgroup_dir + "/cgroup.procs", self_pid);
}
// Writes `period` to the "cpu.cfs_period_us" file under `sub_cgroup_dir`.
// Returns whatever write_file() reports.
bool set_period(int period) {
  return write_file(sub_cgroup_dir + "/cpu.cfs_period_us", period);
}
// Writes `quota` to the "cpu.cfs_quota_us" file under `sub_cgroup_dir`.
// Returns whatever write_file() reports.
bool set_quota(int quota) {
  return write_file(sub_cgroup_dir + "/cpu.cfs_quota_us", quota);
}
// Writes `online` to /sys/devices/system/cpu/cpu<cpuId>/online.
//
// cpuId:  index used to build the per-CPU sysfs path.
// online: value written verbatim (callers in this file pass 0 or 1).
// Returns whatever write_file() reports.
bool set_cpuOnline(int cpuId, int online) {
  std::string online_file("/sys/devices/system/cpu/cpu");
  online_file += std::to_string(cpuId);
  online_file += "/online";
  return write_file(online_file, online);
}
// thread
// Busy-loop worker: spins forever and samples the clock once every 100000
// iterations, restarting its reference time whenever `interval` ms elapsed.
//
// param: interval in milliseconds, passed as an integer stored in the void*.
// Never returns.
void *thread_func(void *param) {
  const long interval = reinterpret_cast<long>(param);
  long last = get_ms_timestamp();
  // Count up to the sampling stride and reset, instead of incrementing a
  // signed int forever (which eventually overflows: undefined behaviour).
  unsigned int spins = 0;
  while (true) {
    if (++spins < 100000) {
      continue;
    }
    spins = 0;
    const long current = get_ms_timestamp();
    if ((current - last) >= interval) {
      last = current;
    }
  }
}
void test_thread() {
const int k_thread_num = CPU_CORE_COUNT * 10;
pthread_t pthreads[k_thread_num];
for (int i = 0; i < k_thread_num; i++) {
if (pthread_create(&pthreads[i], NULL, thread_func, (void *)(i + 1)) != 0) {
cout << "create thread fail" << endl;
} else {
cout << "create thread success,tid is " << pthreads[i] << endl;
}
}
}
// Endless loop that picks a random CPU index below CPU_CORE_COUNT and a
// random 0/1 state, calls set_cpuOnline() with them, prints the outcome, and
// then prints the content of /sys/devices/system/cpu/online.
//
// param: unused.
// Never returns.
void *thread_cpu_online_ctl(void *param) {
  for (;;) {
    // Draw order matters for the rand() sequence: state first, then CPU.
    const int online = rand() % 2;
    const int cpu = rand() % CPU_CORE_COUNT;
    const bool b = set_cpuOnline(cpu, online);
    std::cout << cpu << " -> " << online << "result:" << b << std::endl;
    std::cout << read_file("/sys/devices/system/cpu/online") << std::endl;
  }
  pthread_exit(nullptr);  // never reached
}
void cpu_ctl_thread() {
const int k_thread_num = CPU_CORE_COUNT;
pthread_t pthreads[k_thread_num];
for (int i = 0; i < k_thread_num; i++) {
if (pthread_create(&pthreads[i], NULL, thread_cpu_online_ctl,
(void *)(i + 1)) != 0) {
cout << "create thread fail" << endl;
} else {
cout << "create thread success,tid is " << pthreads[i] << endl;
}
}
}
// Entry point of the reproduction program.
//
// Starts the busy-loop workers and the CPU online/offline togglers, adds the
// process to the test cgroup, applies the CFS period and quota, then sleeps
// forever while the threads run.
//
// argc/argv are not used: period and quota are fixed below.
// Returns -1 if the cgroup cannot be set up or the period/quota cannot be
// written; otherwise it does not return.
int main(int argc, char *argv[]) {
  const int period = 100000;
  // Quota is 80% of the total CPU time of CPU_CORE_COUNT cores per period.
  const int quota = CPU_CORE_COUNT * 0.8 * period;
  std::cout << "period is " << period << std::endl;
  std::cout << "quota is " << quota << std::endl;
  srand(time(nullptr));
  test_thread();
  cpu_ctl_thread();
  if (create_cgroup() == false) {
    std::cout << "create cgroup fail" << std::endl;
    return -1;
  }
  // Without the bandwidth limit the run exercises nothing, so a failed write
  // is reported instead of being silently ignored.
  if (!set_period(period)) {
    std::cout << "set cfs period fail" << std::endl;
    return -1;
  }
  if (!set_quota(quota)) {
    std::cout << "set cfs quota fail" << std::endl;
    return -1;
  }
  while (true) {
    sleep(10000);
  }
  return 0;
}
编译
g++ -DCPU_CORE_COUNT=64 -lpthread -std=c++11 trigger_cgroup_timer_inactive.cpp -o tcti
FT2000+ kvm openEuler 22.03 LTS 64C64G(正常)
经历9次手动重启tcti进程,第10次持续运行一个晚上,未发生进程或者os卡死现象
距离开机65376秒,仍然正常(开机即开始测试)
说明此调度故障已在openEuler 22.03 LTS版本内核修复,内核版本如下
5.10.0-60.18.0.50.oe2203.aarch64
验证方案一:openEuler 20.03 LTS SP3 升级到22.03 LTS版本内核
直接从openEuler-22.03-LTS-everything-aarch64-dvd.iso:/Packages/kernel-source-5.10.0-60.18.0.50.oe2203.aarch64.rpm获取
通过scp方式将rpm包拷贝到20.03 kvm虚拟机
rpm安装
所有需要选择的地方,直接回车
make modules_install
make install
reboot
采用5.10.0版本内核启动
执行测试
9次手动重启,第10次持续运行直到故障发生
持续半小时,正常。
基本可以判断openEuler 5.10.0内核已修复此问题。
故障处理
flex: command not found / bison: command not found
yum install -y bison flex
cannot resolve BTF IDs for CONFIG_DEBUG_INFO_BTF, please install libelf-dev, libelf-devel or elfutils-libelf-devel
yum install -y elfutils-libelf-devel
openssl/opensslv.h: no such file or directory
yum install -y openssl-devel
bc: command not found
yum install -y bc
Failed to generate BTF for vmlinux
vim .config
注释掉以下这行,保存后重新编译
询问 DEBUG_INFO_BTF,输入n
no space left on device
删掉无用的文件,或者扩容虚拟盘
qemu-img resize gpt分区 parted修复分区信息 虚拟机 lvm 扩容根分区_hkNaruto的博客-CSDN博客
验证方案二:openEuler 20.03 LTS SP3 升级到官网linux-5.10.38.tar.xz
刚好本地有这个版本,都是5.10的大版本号
cd linux-5.10.38
cp /usr/src/linux-5.10.0-60.18.0.50.oe2203.aarch64/.config .
make -j64
make modules_install
make install
选择5.10.38版本启动
执行测试
约1小时后,故障发生,进程没有输出,也不能退出...
qemu-kvm进程CPU消耗也掉到0%
问题复现,说明openEuler版本内核有特别的处理。
故障
启动失败(Guest disabled display.)
make defconfig导致。
解决: