Slurm Deployment and Installation
Management node and compute nodes
Run on all nodes
Install system tools
apt install -y build-essential curl wget munge
Edit the hosts file with an entry for every node:
vim /etc/hosts
xxx.xxx.xxx.xxx xxx
Distribute it to the other compute nodes.
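These notes later use compute nodes 4090-209 through 4090-213; assuming those hostnames resolve, a distribution loop might look like:

```bash
# Hostnames assumed from the node list used later in these notes
for node in 4090-209 4090-210 4090-211 4090-212 4090-213; do
  scp /etc/hosts ${node}:/etc/hosts
done
```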
Create the slurm user and its spool directories
useradd -m slurm
mkdir /var/spool/slurmd /var/spool/slurmctld
chown slurm:slurm /var/spool/slurmd /var/spool/slurmctld
Configure munge (run on the management node)
/usr/sbin/mungekey --create  # the generated key is placed in /etc/munge
ll /etc/munge
-rw------- 1 munge munge 128 Nov 20 09:29 munge.key
Distribute munge.key to the other compute nodes
scp munge.key 10.250.2.232:/etc/munge/
On all nodes, make sure the key is owned by the munge user and group
chown munge:munge /etc/munge/munge.key
Start the service
systemctl start munge
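A standard sanity check before moving on (not part of the original steps) is to round-trip a munge credential locally and across nodes:

```bash
# Encode and decode a credential locally
munge -n | unmunge
# Decode on a remote node; 4090-209 is one of the compute nodes assumed above
munge -n | ssh 4090-209 unmunge
```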
Configure slurmdbd
First you need a database; here I use a MariaDB instance running inside a container.
kubectl exec -it volador-database-deploy-5fb94cd59f-4mm5g -- /bin/bash
MariaDB [(none)]> create user 'slurm'@'localhost' identified by '123456';
Query OK, 0 rows affected (0.024 sec)
MariaDB [(none)]> create database slurm_acct_db;
Query OK, 1 row affected (0.000 sec)
MariaDB [(none)]> grant all on slurm_acct_db.* TO 'slurm'@'localhost' identified by '123456' with grant option;
Query OK, 0 rows affected (0.013 sec)
MariaDB [(none)]> grant all on slurm_acct_db.* TO 'slurm'@'system0' identified by '123456' with grant option;
Query OK, 0 rows affected, 1 warning (0.004 sec)
MariaDB [(none)]> create database slurm_jobcomp_db;
Query OK, 1 row affected (0.001 sec)
MariaDB [(none)]> grant all on slurm_jobcomp_db.* TO 'slurm'@'localhost' identified by '123456' with grant option;
Query OK, 0 rows affected (0.016 sec)
MariaDB [(none)]> grant all on slurm_jobcomp_db.* TO 'slurm'@'system0' identified by '123456' with grant option;
Query OK, 0 rows affected, 1 warning (0.002 sec)
GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' IDENTIFIED BY '123456';
FLUSH PRIVILEGES;
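To double-check the grants before wiring up slurmdbd, something like the following should work (assuming the container image ships the mysql client; pod name as above):

```bash
kubectl exec -it volador-database-deploy-5fb94cd59f-4mm5g -- \
  mysql -uslurm -p123456 -e "SHOW GRANTS FOR CURRENT_USER;"
```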
Deploy MySQL (alternative to the containerized MariaDB: build from source)
Install build dependencies
root@4090-208:/usr/local/mysql-5.7.18# apt-get install cmake bison libncurses5-dev gcc g++ libncurses5 -y
Download the source package
root@4090-208:/usr/local/mysql-5.7.18# wget https://dev.mysql.com/get/Downloads/MySQL-5.7/mysql-boost-5.7.18.tar.gz
root@4090-208:/usr/local/mysql-5.7.18# tar -zxf mysql-boost-5.7.18.tar.gz -C /usr/local/
Create the mysql user and group
root@4090-208:/usr/local/mysql-5.7.18# groupadd mysql
root@4090-208:/usr/local/mysql-5.7.18# useradd -g mysql mysql
Compile and install MySQL
cmake \
-DCMAKE_INSTALL_PREFIX=/usr/local/mysql \
-DMYSQL_DATADIR=/usr/local/mysql/data \
-DWITH_BOOST=./boost/boost_1_59_0 \
-DSYSCONFDIR=/etc \
-DWITH_INNOBASE_STORAGE_ENGINE=1 \
-DWITH_PARTITION_STORAGE_ENGINE=1 \
-DWITH_FEDERATED_STORAGE_ENGINE=1 \
-DWITH_BLACKHOLE_STORAGE_ENGINE=1 \
-DWITH_MYISAM_STORAGE_ENGINE=1 \
-DWITH_MEMORY_STORAGE_ENGINE=1 \
-DENABLED_LOCAL_INFILE=1 \
-DWITH_READLINE=1 \
-DMYSQL_TCP_PORT=3306 \
-DEXTRA_CHARSETS=all \
-DDEFAULT_CHARSET=utf8 \
-DDEFAULT_COLLATION=utf8_general_ci
make -j4 && make install -j4
Configure MySQL
Set ownership of the MySQL installation directory
root@4090-208:/usr/local# chown -R mysql:mysql /usr/local/mysql
Edit the configuration file
vim /etc/my.cnf
[client]
port = 3306
socket = /tmp/mysql.sock
[mysqld]
character_set_server=utf8
init_connect='SET NAMES utf8'
basedir=/usr/local/mysql
datadir=/usr/local/mysql/data
socket=/tmp/mysql.sock
log-error=/var/log/mysqld.log
pid-file=/var/run/mysqld/mysqld.pid
lower_case_table_names = 1
sql_mode=STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION
max_connections=5000
default_time_zone='+8:00'
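Before the first start, MySQL 5.7 needs its data directory initialized, a step these notes otherwise skip; a minimal sketch:

```bash
# Initialize the data directory; a temporary root password is written to log-error
/usr/local/mysql/bin/mysqld --initialize --user=mysql \
  --basedir=/usr/local/mysql --datadir=/usr/local/mysql/data
```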
Start the database
/usr/local/mysql/support-files/mysql.server start
Add the MySQL client libraries to the runtime linker path (single quotes keep $LD_LIBRARY_PATH from expanding at write time):
echo 'export LD_LIBRARY_PATH=/usr/local/mysql/lib:$LD_LIBRARY_PATH' | sudo tee -a /etc/profile
source /etc/profile
The shell looks for commands in /usr/bin by default, so create a symlink:
ln -s /usr/local/mysql/bin/mysql /usr/bin
Create the slurmdbd.conf file
root@4090-208:~# vim /etc/slurm/slurmdbd.conf
#
# Example slurmdbd.conf file.
#
# See the slurmdbd.conf man page for more information.
#
# Archive info
#ArchiveJobs=yes
#ArchiveDir="/tmp"
#ArchiveSteps=yes
#ArchiveScript=
#JobPurge=12
#StepPurge=1
#
# Authentication info
AuthType=auth/munge
#AuthInfo=/var/run/munge/munge.socket.2
#
# slurmDBD info: the management server running slurmdbd; must match AccountingStorageHost in slurm.conf
DbdHost=4090-208 # <hostname or container IP of the Slurm control node>
#DbdBackupHost=mn02
DbdPort=6819
SlurmUser=root
MessageTimeout=30
DebugLevel=7
#DefaultQOS=normal,standby
LogFile=/var/log/slurmdbd.log
PidFile=/usr/local/slurm/run/slurmdbd.pid
#PluginDir=/usr/lib/slurm
#PrivateData=accounts,users,usage,jobs
PrivateData=jobs
#TrackWCKey=yes
#
# Database info
StorageType=accounting_storage/mysql
StorageHost=[IP] # IP or hostname of the database container
#StorageBackupHost=mn02
StoragePort=3306
StoragePass=123456
StorageUser=root
StorageLoc=slurm_acct_db
CommitDelay=1
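slurmdbd refuses to start if slurmdbd.conf is readable by other users; it must be owned by SlurmUser (root in this setup) with mode 600:

```bash
chown root:root /etc/slurm/slurmdbd.conf
chmod 600 /etc/slurm/slurmdbd.conf
```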
Start slurmdbd
/usr/local/slurm/sbin/slurmdbd
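To confirm slurmdbd came up, check that it is listening on DbdPort; if accounting later shows no cluster, it can be registered manually (cluster name slurmcls from the slurm.conf below):

```bash
# slurmdbd should be listening on 6819
ss -tlnp | grep 6819
# Register the cluster in the accounting database if needed
/usr/local/slurm/bin/sacctmgr add cluster slurmcls
```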
Configure Slurm (run on all nodes)
Install the basic Debian package build requirements:
apt-get install build-essential fakeroot devscripts equivs libmunge-dev
Extract the package
tar -xaf slurm-24.05.4.tar.bz2
Change into the Slurm source directory (cd slurm-24.05.4)
Install the Slurm package build dependencies:
mk-build-deps -i debian/control
Run on the management node
Build the Slurm packages
debuild -b -uc -us
Compile and install Slurm (run on all nodes)
./configure --prefix=/usr/local/slurm --sysconfdir=/etc/slurm
make -j$(nproc) && make install -j$(nproc)
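Since Slurm lands under a non-standard prefix, it can be convenient to put its binaries on PATH (optional; the commands below use full paths):

```bash
echo 'export PATH=/usr/local/slurm/bin:/usr/local/slurm/sbin:$PATH' | sudo tee -a /etc/profile
source /etc/profile
```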
Run on the management node
root@4090-208:~# vim /etc/slurm/slurm.conf
# slurm.conf file generated by configurator easy.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ControlMachine=4090-208 #<YOUR-HOST-NAME>
ControlAddr=[ip]
AuthType=auth/munge
AccountingStorageEnforce=associations,limits,qos
AccountingStorageHost=4090-208 # head node
AccountingStoragePort=6819
AccountingStorageType=accounting_storage/slurmdbd
StateSaveLocation=/mnt/volume/slurm # directory where slurmctld stores its state; with a backup control node, all SlurmctldHost nodes need shared read/write access to it
SlurmdSpoolDir=/var/spool/slurmd
#AccountingStorageUser=
#AccountingStoreJobComment=Yes
AccountingStorageTRES=gres/gpu # required when tracking GPUs
GresTypes=gpu # required when configuring GPUs
AcctGatherEnergyType=acct_gather_energy/none
AcctGatherFilesystemType=acct_gather_filesystem/none
AcctGatherInterconnectType=acct_gather_interconnect/none
AcctGatherNodeFreq=0
#AcctGatherProfileType=acct_gather_profile/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
#SlurmctldParameters=enable_configless # leave commented out unless running in configless mode
MpiDefault=none
ProctrackType=proctrack/linuxproc
ReturnToService=1
SlurmUser=root #slurm
SwitchType=switch/none
SchedulerType=sched/builtin
#SelectType=select/linear
SelectType=select/cons_tres
SelectTypeParameters=CR_CPU # CPU-based scheduling
#SelectTypeParameters=CR_Core,CR_CORE_DEFAULT_DIST_BLOCK # core-based scheduling with block distribution
TaskPlugin=task/affinity # binds one or more processes to one or more processors
ClusterName=slurmcls #<YOUR-CLUSTER-NAME>
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdLogFile=/var/log/slurmd.log
SlurmctldPort=6817 # slurmctld service port
SlurmdPort=6818 # slurmd service port
MailProg=/bin/mail # must point to a mail executable (not a spool directory)
NodeName=4090-[209-213] Gres=gpu:4090:8 CPUs=72 Sockets=2 CoresPerSocket=36 ThreadsPerCore=2 RealMemory=256000 State=UNKNOWN
PartitionName=cpu-node Nodes=ALL Default=YES MaxTime=INFINITE State=UP
PartitionName=gpu-node Nodes=ALL Default=NO MaxTime=INFINITE State=UP
Check the configuration
root@fm-manage-01:/etc/slurm# /usr/local/slurm/bin/scontrol show config | grep -i accounting
Specify resources according to the actual hardware; slurmd -C prints the detected values, as shown below.
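Run Slurm's own hardware report on each compute node and copy the values into the NodeName line:

```bash
# Prints a ready-made NodeName=... line with CPUs, Sockets, CoresPerSocket,
# ThreadsPerCore and RealMemory as detected on this node
/usr/local/slurm/sbin/slurmd -C
```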
Distribute the configuration to the compute nodes, e.g.:
scp /etc/slurm/slurm.conf <compute-node>:/etc/slurm/
Start Slurm
On the management node:
root@4090-208:~# /usr/local/slurm/sbin/slurmctld
root@4090-208:~# ps -ef |grep slurm
root 285883 1 0 Dec12 ? 00:00:05 /usr/local/slurm/sbin/slurmdbd
root 286340 1 0 Dec12 ? 00:07:17 /usr/local/slurm/sbin/slurmctld
root 286341 286340 0 Dec12 ? 00:00:00 slurmctld: slurmscriptd
root 319327 298959 0 06:47 pts/0 00:00:00 grep --color=auto slurm
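A quick check that the controller is responding:

```bash
/usr/local/slurm/bin/scontrol ping
```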
On the compute nodes:
Configure gres.conf (GPU)
root@4090-209:/etc/slurm# cat gres.conf
Name=gpu Type=4090 File=/dev/nvidia0
Name=gpu Type=4090 File=/dev/nvidia1
Name=gpu Type=4090 File=/dev/nvidia2
Name=gpu Type=4090 File=/dev/nvidia3
Name=gpu Type=4090 File=/dev/nvidia4
Name=gpu Type=4090 File=/dev/nvidia5
Name=gpu Type=4090 File=/dev/nvidia6
Name=gpu Type=4090 File=/dev/nvidia7
root@4090-209:/usr/local/slurm/sbin# ./slurmd
root@ubuntu-27:/usr/local/slurm/sbin# ps -ef |grep slurmd
root 1221625 1 0 10:21 ? 00:00:00 ./slurmd
root 1271541 1134559 0 10:43 pts/0 00:00:00 grep --color=auto slurmd
Check node status from the management node
If a node comes up down or drained, set it back to idle:
/usr/local/slurm/bin/scontrol update NodeName=4090-213 State=idle
root@4090-208:/etc/slurm# /usr/local/slurm/bin/sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
cpu-node* up infinite 5 idle 4090-[209-213]
gpu-node up infinite 5 idle 4090-[209-213]
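With all nodes idle, a minimal smoke test runs hostname across the cluster:

```bash
# Launch hostname on 2 nodes of the default partition
/usr/local/slurm/bin/srun -N 2 hostname
```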
Detailed node information
root@4090-208:/etc/slurm# /usr/local/slurm/bin/scontrol show node
NodeName=4090-209 Arch=x86_64 CoresPerSocket=36
CPUAlloc=0 CPUEfctv=72 CPUTot=72 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:4090:8
NodeAddr=4090-209 NodeHostName=4090-209 Version=24.05.4
OS=Linux 5.4.0-144-generic #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023
RealMemory=256000 AllocMem=0 FreeMem=1017990 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cpu-node,gpu-node
BootTime=2024-12-10T12:07:15 SlurmdStartTime=2024-12-12T09:56:13
LastBusyTime=2024-12-12T10:16:44 ResumeAfterTime=None
CfgTRES=cpu=72,mem=250G,billing=72,gres/gpu=8
AllocTRES=
CurrentWatts=0 AveWatts=0
NodeName=4090-210 Arch=x86_64 CoresPerSocket=36
CPUAlloc=0 CPUEfctv=72 CPUTot=72 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:4090:8
NodeAddr=4090-210 NodeHostName=4090-210 Version=24.05.4
OS=Linux 5.4.0-144-generic #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023
RealMemory=256000 AllocMem=0 FreeMem=1017828 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cpu-node,gpu-node
BootTime=2024-12-10T12:07:33 SlurmdStartTime=2024-12-12T09:56:13
LastBusyTime=2024-12-12T10:16:44 ResumeAfterTime=None
CfgTRES=cpu=72,mem=250G,billing=72,gres/gpu=8
AllocTRES=
CurrentWatts=0 AveWatts=0
NodeName=4090-211 Arch=x86_64 CoresPerSocket=36
CPUAlloc=0 CPUEfctv=72 CPUTot=72 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:4090:8
NodeAddr=4090-211 NodeHostName=4090-211 Version=24.05.4
OS=Linux 5.4.0-144-generic #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023
RealMemory=256000 AllocMem=0 FreeMem=1017826 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cpu-node,gpu-node
BootTime=2024-12-10T12:07:14 SlurmdStartTime=2024-12-12T10:00:03
LastBusyTime=2024-12-12T09:59:36 ResumeAfterTime=None
CfgTRES=cpu=72,mem=250G,billing=72,gres/gpu=8
AllocTRES=
CurrentWatts=0 AveWatts=0
NodeName=4090-212 Arch=x86_64 CoresPerSocket=36
CPUAlloc=0 CPUEfctv=72 CPUTot=72 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:4090:8
NodeAddr=4090-212 NodeHostName=4090-212 Version=24.05.4
OS=Linux 5.4.0-144-generic #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023
RealMemory=256000 AllocMem=0 FreeMem=1017886 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cpu-node,gpu-node
BootTime=2024-12-10T12:07:14 SlurmdStartTime=2024-12-12T10:00:24
LastBusyTime=2024-12-12T10:00:24 ResumeAfterTime=None
CfgTRES=cpu=72,mem=250G,billing=72,gres/gpu=8
AllocTRES=
CurrentWatts=0 AveWatts=0
NodeName=4090-213 Arch=x86_64 CoresPerSocket=36
CPUAlloc=0 CPUEfctv=72 CPUTot=72 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:4090:8
NodeAddr=4090-213 NodeHostName=4090-213 Version=24.05.4
OS=Linux 5.4.0-144-generic #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023
RealMemory=256000 AllocMem=0 FreeMem=1017887 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cpu-node,gpu-node
BootTime=2024-12-10T12:07:15 SlurmdStartTime=2024-12-12T10:00:31
LastBusyTime=2024-12-12T10:00:31 ResumeAfterTime=None
CfgTRES=cpu=72,mem=250G,billing=72,gres/gpu=8
AllocTRES=
CurrentWatts=0 AveWatts=0
CPU + GPU scheduling example
```bash
#!/bin/bash
#SBATCH --job-name=gpu_multi_node_job           # job name
#SBATCH --output=gpu_multi_node_output_%j.log   # stdout log
#SBATCH --error=gpu_multi_node_error_%j.log     # stderr log
#SBATCH --ntasks=2                              # request 2 tasks in total
#SBATCH --cpus-per-task=4                       # 4 CPU cores per task
#SBATCH --gres=gpu:1                            # request 1 GPU per node
#SBATCH --time=72:00:00                         # maximum runtime: 72 hours
#SBATCH --mem=32G                               # request 32 GB of memory
#SBATCH --nodes=2                               # request 2 nodes
#SBATCH --ntasks-per-node=1                     # 1 task per node
#SBATCH --partition=gpu-node                    # partition with GPU resources

# Print node information
echo "Job started on $(date)"
echo "Running on nodes: $(hostname)"

# Load environment modules (if needed)
# module load cuda/11.2
# module load python/3.8
pip install numpy torch

# Run the GPU task on each node
srun python3 /mnt/volume/slurm/long_gpu_task.py

echo "Job completed on $(date)"
```
The referenced /mnt/volume/slurm/long_gpu_task.py:

```python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time

# Make sure a GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if not torch.cuda.is_available():
    print("No GPU found, exiting...")
    exit()

# Generate some synthetic training data
num_samples = 1000
num_features = 100
X_train = torch.randn(num_samples, num_features).to(device)
y_train = torch.randint(0, 2, (num_samples,)).to(device)

# Build a simple neural network
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.layer1 = nn.Linear(num_features, 64)
        self.layer2 = nn.Linear(64, 32)
        self.layer3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = self.sigmoid(self.layer3(x))
        return x

model = SimpleNN().to(device)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Simulated training loop
print("Training started...")
for epoch in range(50):  # train for 50 epochs
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs.squeeze(), y_train.float())
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/50, Loss: {loss.item():.4f}")

# Simulate a long-running computation
print("Training complete, starting long computation...")
time.sleep(1800)  # simulate 30 minutes of computation (adjust as needed)
print("Task completed!")
```