Slurm Deployment and Installation
Management node and compute nodes
Run on all nodes
Install system tools
apt install -y build-essential curl wget munge
Edit the hosts file with an entry for every node:
vim /etc/hosts
xxx.xxx.xxx.xxx xxx
Distribute it to the other compute nodes.
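These notes later use compute nodes 4090-209 through 4090-213; assuming those hostnames resolve, a distribution loop might look like:

```bash
# Hostnames assumed from the node list used later in these notes
for node in 4090-209 4090-210 4090-211 4090-212 4090-213; do
  scp /etc/hosts ${node}:/etc/hosts
done
```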
Create the slurm user and its spool directories
useradd -m slurm
mkdir /var/spool/slurmd /var/spool/slurmctld
chown slurm:slurm /var/spool/slurmd /var/spool/slurmctld
Configure munge (run on the management node)
/usr/sbin/mungekey --create  # the generated key is placed in /etc/munge
ll /etc/munge
-rw------- 1 munge munge 128 Nov 20 09:29 munge.key
Distribute munge.key to the other compute nodes
scp munge.key 10.250.2.232:/etc/munge/
On all nodes, make sure the key is owned by the munge user and group
chown munge:munge /etc/munge/munge.key
Start the service
systemctl start munge
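A standard sanity check before moving on (not part of the original steps) is to round-trip a munge credential locally and across nodes:

```bash
# Encode and decode a credential locally
munge -n | unmunge
# Decode on a remote node; 4090-209 is one of the compute nodes assumed above
munge -n | ssh 4090-209 unmunge
```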
Configure slurmdbd
First you need a database; here I use a MariaDB instance running inside a container.
kubectl exec -it volador-database-deploy-5fb94cd59f-4mm5g -- /bin/bash
MariaDB [(none)]> create user 'slurm'@'localhost' identified by '123456';
Query OK, 0 rows affected (0.024 sec)
MariaDB [(none)]> create database slurm_acct_db;
Query OK, 1 row affected (0.000 sec)
MariaDB [(none)]> grant all on slurm_acct_db.* TO 'slurm'@'localhost' identified by '123456' with grant option;
Query OK, 0 rows affected (0.013 sec)
MariaDB [(none)]> grant all on slurm_acct_db.* TO 'slurm'@'system0' identified by '123456' with grant option;
Query OK, 0 rows affected, 1 warning (0.004 sec)
MariaDB [(none)]> create database slurm_jobcomp_db;
Query OK, 1 row affected (0.001 sec)
MariaDB [(none)]> grant all on slurm_jobcomp_db.* TO 'slurm'@'localhost' identified by '123456' with grant option;
Query OK, 0 rows affected (0.016 sec)
MariaDB [(none)]> grant all on slurm_jobcomp_db.* TO 'slurm'@'system0' identified by '123456' with grant option;
Query OK, 0 rows affected, 1 warning (0.002 sec)
GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' IDENTIFIED BY '123456';
FLUSH PRIVILEGES;
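To double-check the grants before wiring up slurmdbd, something like the following should work (assuming the container image ships the mysql client; pod name as above):

```bash
kubectl exec -it volador-database-deploy-5fb94cd59f-4mm5g -- \
  mysql -uslurm -p123456 -e "SHOW GRANTS FOR CURRENT_USER;"
```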
Deploy MySQL (alternative to the containerized MariaDB: build from source)
Install build dependencies
root@4090-208:/usr/local/mysql-5.7.18# apt-get install cmake bison libncurses5-dev gcc g++ libncurses5 -y
Download the source package
root@4090-208:/usr/local/mysql-5.7.18# wget https://dev.mysql.com/get/Downloads/MySQL-5.7/mysql-boost-5.7.18.tar.gz
root@4090-208:/usr/local/mysql-5.7.18# tar -zxf mysql-boost-5.7.18.tar.gz -C /usr/local/
Create the mysql user and group
root@4090-208:/usr/local/mysql-5.7.18# groupadd mysql
root@4090-208:/usr/local/mysql-5.7.18# useradd -g mysql mysql
Compile and install MySQL
cmake \
-DCMAKE_INSTALL_PREFIX=/usr/local/mysql \
-DMYSQL_DATADIR=/usr/local/mysql/data \
-DWITH_BOOST=./boost/boost_1_59_0 \
-DSYSCONFDIR=/etc \
-DWITH_INNOBASE_STORAGE_ENGINE=1 \
-DWITH_PARTITION_STORAGE_ENGINE=1 \
-DWITH_FEDERATED_STORAGE_ENGINE=1 \
-DWITH_BLACKHOLE_STORAGE_ENGINE=1 \
-DWITH_MYISAM_STORAGE_ENGINE=1 \
-DWITH_MEMORY_STORAGE_ENGINE=1 \
-DENABLED_LOCAL_INFILE=1 \
-DWITH_READLINE=1 \
-DMYSQL_TCP_PORT=3306 \
-DEXTRA_CHARSETS=all \
-DDEFAULT_CHARSET=utf8 \
-DDEFAULT_COLLATION=utf8_general_ci
make -j4 && make install -j4
Configure MySQL
Set ownership of the MySQL installation directory
root@4090-208:/usr/local# chown -R mysql:mysql /usr/local/mysql
Edit the configuration file
vim /etc/my.cnf
[client]
port = 3306
socket = /tmp/mysql.sock
[mysqld]
character_set_server=utf8
init_connect='SET NAMES utf8'
basedir=/usr/local/mysql
datadir=/usr/local/mysql/data
socket=/tmp/mysql.sock
log-error=/var/log/mysqld.log
pid-file=/var/run/mysqld/mysqld.pid
lower_case_table_names = 1
sql_mode=STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION
max_connections=5000
default_time_zone='+8:00'
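Before the first start, MySQL 5.7 needs its data directory initialized, a step these notes otherwise skip; a minimal sketch:

```bash
# Initialize the data directory; a temporary root password is written to log-error
/usr/local/mysql/bin/mysqld --initialize --user=mysql \
  --basedir=/usr/local/mysql --datadir=/usr/local/mysql/data
```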
Start the database
/usr/local/mysql/support-files/mysql.server start
Add the MySQL client libraries to the runtime linker path (single quotes keep $LD_LIBRARY_PATH from expanding at write time):
echo 'export LD_LIBRARY_PATH=/usr/local/mysql/lib:$LD_LIBRARY_PATH' | sudo tee -a /etc/profile
source /etc/profile
The shell looks for commands in /usr/bin by default, so create a symlink:
ln -s /usr/local/mysql/bin/mysql /usr/bin
Create the slurmdbd.conf file
root@4090-208:~# vim /etc/slurm/slurmdbd.conf
#
# Example slurmdbd.conf file.
#
# See the slurmdbd.conf man page for more information.
#
# Archive info
#ArchiveJobs=yes
#ArchiveDir="/tmp"
#ArchiveSteps=yes
#ArchiveScript=
#JobPurge=12
#StepPurge=1
#
# Authentication info
AuthType=auth/munge
#AuthInfo=/var/run/munge/munge.socket.2
#
# slurmDBD info: the management server running slurmdbd; must match AccountingStorageHost in slurm.conf
DbdHost=4090-208 # <hostname or container IP of the Slurm control node>
#DbdBackupHost=mn02
DbdPort=6819
SlurmUser=root
MessageTimeout=30
DebugLevel=7
#DefaultQOS=normal,standby
LogFile=/var/log/slurmdbd.log
PidFile=/usr/local/slurm/run/slurmdbd.pid
#PluginDir=/usr/lib/slurm
#PrivateData=accounts,users,usage,jobs
PrivateData=jobs
#TrackWCKey=yes
#
# Database info
StorageType=accounting_storage/mysql
StorageHost=[IP] # IP or hostname of the database container
#StorageBackupHost=mn02
StoragePort=3306
StoragePass=123456
StorageUser=root
StorageLoc=slurm_acct_db
CommitDelay=1
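slurmdbd refuses to start if slurmdbd.conf is readable by other users; it must be owned by SlurmUser (root in this setup) with mode 600:

```bash
chown root:root /etc/slurm/slurmdbd.conf
chmod 600 /etc/slurm/slurmdbd.conf
```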
Start slurmdbd
/usr/local/slurm/sbin/slurmdbd
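To confirm slurmdbd came up, check that it is listening on DbdPort; if accounting later shows no cluster, it can be registered manually (cluster name slurmcls from the slurm.conf below):

```bash
# slurmdbd should be listening on 6819
ss -tlnp | grep 6819
# Register the cluster in the accounting database if needed
/usr/local/slurm/bin/sacctmgr add cluster slurmcls
```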
Configure Slurm (run on all nodes)
Install the basic Debian package build requirements:
apt-get install build-essential fakeroot devscripts equivs libmunge-dev
Extract the package
tar -xaf slurm-24.05.4.tar.bz2
Change into the Slurm source directory (cd slurm-24.05.4)
Install the Slurm package build dependencies:
mk-build-deps -i debian/control
Run on the management node
Build the Slurm packages
debuild -b -uc -us
Compile and install Slurm (run on all nodes)
./configure --prefix=/usr/local/slurm --sysconfdir=/etc/slurm
make -j$(nproc) && make install -j$(nproc)
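Since Slurm lands under a non-standard prefix, it can be convenient to put its binaries on PATH (optional; the commands below use full paths):

```bash
echo 'export PATH=/usr/local/slurm/bin:/usr/local/slurm/sbin:$PATH' | sudo tee -a /etc/profile
source /etc/profile
```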
Run on the management node
root@4090-208:~# vim /etc/slurm/slurm.conf
# slurm.conf file generated by configurator easy.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ControlMachine=4090-208 #<YOUR-HOST-NAME>
ControlAddr=[ip]
AuthType=auth/munge
AccountingStorageEnforce=associations,limits,qos
AccountingStorageHost=4090-208 # head node
AccountingStoragePort=6819
AccountingStorageType=accounting_storage/slurmdbd
StateSaveLocation=/mnt/volume/slurm # directory where slurmctld stores its state; with a backup control node, all SlurmctldHost nodes need shared read/write access to it
SlurmdSpoolDir=/var/spool/slurmd
#AccountingStorageUser=
#AccountingStoreJobComment=Yes
AccountingStorageTRES=gres/gpu # required when tracking GPUs
GresTypes=gpu # required when configuring GPUs
AcctGatherEnergyType=acct_gather_energy/none
AcctGatherFilesystemType=acct_gather_filesystem/none
AcctGatherInterconnectType=acct_gather_interconnect/none
AcctGatherNodeFreq=0
#AcctGatherProfileType=acct_gather_profile/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
#SlurmctldParameters=enable_configless # leave commented out unless running in configless mode
MpiDefault=none
ProctrackType=proctrack/linuxproc
ReturnToService=1
SlurmUser=root #slurm
SwitchType=switch/none
SchedulerType=sched/builtin
#SelectType=select/linear
SelectType=select/cons_tres
SelectTypeParameters=CR_CPU # CPU-based scheduling
#SelectTypeParameters=CR_Core,CR_CORE_DEFAULT_DIST_BLOCK # core-based scheduling with block distribution
TaskPlugin=task/affinity # binds one or more processes to one or more processors
ClusterName=slurmcls #<YOUR-CLUSTER-NAME>
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdLogFile=/var/log/slurmd.log
SlurmctldPort=6817 # slurmctld service port
SlurmdPort=6818 # slurmd service port
MailProg=/bin/mail # must point to a mail executable (not a spool directory)
NodeName=4090-[209-213] Gres=gpu:4090:8 CPUs=72 Sockets=2 CoresPerSocket=36 ThreadsPerCore=2 RealMemory=256000 State=UNKNOWN
PartitionName=cpu-node Nodes=ALL Default=YES MaxTime=INFINITE State=UP
PartitionName=gpu-node Nodes=ALL Default=NO MaxTime=INFINITE State=UP
Check the configuration
root@fm-manage-01:/etc/slurm# /usr/local/slurm/bin/scontrol show config | grep -i accounting
Specify resources according to the actual hardware; slurmd -C prints the detected values, as shown below.
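Run Slurm's own hardware report on each compute node and copy the values into the NodeName line:

```bash
# Prints a ready-made NodeName=... line with CPUs, Sockets, CoresPerSocket,
# ThreadsPerCore and RealMemory as detected on this node
/usr/local/slurm/sbin/slurmd -C
```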
Distribute the configuration to the compute nodes, e.g.:
scp /etc/slurm/slurm.conf <compute-node>:/etc/slurm/
Start Slurm
On the management node:
root@4090-208:~# /usr/local/slurm/sbin/slurmctld
root@4090-208:~# ps -ef |grep slurm
root 285883 1 0 Dec12 ? 00:00:05 /usr/local/slurm/sbin/slurmdbd
root 286340 1 0 Dec12 ? 00:07:17 /usr/local/slurm/sbin/slurmctld
root 286341 286340 0 Dec12 ? 00:00:00 slurmctld: slurmscriptd
root 319327 298959 0 06:47 pts/0 00:00:00 grep --color=auto slurm
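A quick check that the controller is responding:

```bash
/usr/local/slurm/bin/scontrol ping
```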
On the compute nodes:
Configure gres.conf (GPU)
root@4090-209:/etc/slurm# cat gres.conf
Name=gpu Type=4090 File=/dev/nvidia0
Name=gpu Type=4090 File=/dev/nvidia1
Name=gpu Type=4090 File=/dev/nvidia2
Name=gpu Type=4090 File=/dev/nvidia3
Name=gpu Type=4090 File=/dev/nvidia4
Name=gpu Type=4090 File=/dev/nvidia5
Name=gpu Type=4090 File=/dev/nvidia6
Name=gpu Type=4090 File=/dev/nvidia7
root@4090-209:/usr/local/slurm/sbin# ./slurmd
root@ubuntu-27:/usr/local/slurm/sbin# ps -ef |grep slurmd
root 1221625 1 0 10:21 ? 00:00:00 ./slurmd
root 1271541 1134559 0 10:43 pts/0 00:00:00 grep --color=auto slurmd
Check node status from the management node
If a node comes up down or drained, set it back to idle:
/usr/local/slurm/bin/scontrol update NodeName=4090-213 State=idle
root@4090-208:/etc/slurm# /usr/local/slurm/bin/sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
cpu-node* up infinite 5 idle 4090-[209-213]
gpu-node up infinite 5 idle 4090-[209-213]
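With all nodes idle, a minimal smoke test runs hostname across the cluster:

```bash
# Launch hostname on 2 nodes of the default partition
/usr/local/slurm/bin/srun -N 2 hostname
```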
Detailed node information
root@4090-208:/etc/slurm# /usr/local/slurm/bin/scontrol show node
NodeName=4090-209 Arch=x86_64 CoresPerSocket=36
CPUAlloc=0 CPUEfctv=72 CPUTot=72 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:4090:8
NodeAddr=4090-209 NodeHostName=4090-209 Version=24.05.4
OS=Linux 5.4.0-144-generic #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023
RealMemory=256000 AllocMem=0 FreeMem=1017990 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cpu-node,gpu-node
BootTime=2024-12-10T12:07:15 SlurmdStartTime=2024-12-12T09:56:13
LastBusyTime=2024-12-12T10:16:44 ResumeAfterTime=None
CfgTRES=cpu=72,mem=250G,billing=72,gres/gpu=8
AllocTRES=
CurrentWatts=0 AveWatts=0
NodeName=4090-210 Arch=x86_64 CoresPerSocket=36
CPUAlloc=0 CPUEfctv=72 CPUTot=72 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:4090:8
NodeAddr=4090-210 NodeHostName=4090-210 Version=24.05.4
OS=Linux 5.4.0-144-generic #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023
RealMemory=256000 AllocMem=0 FreeMem=1017828 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cpu-node,gpu-node
BootTime=2024-12-10T12:07:33 SlurmdStartTime=2024-12-12T09:56:13
LastBusyTime=2024-12-12T10:16:44 ResumeAfterTime=None
CfgTRES=cpu=72,mem=250G,billing=72,gres/gpu=8
AllocTRES=
CurrentWatts=0 AveWatts=0
NodeName=4090-211 Arch=x86_64 CoresPerSocket=36
CPUAlloc=0 CPUEfctv=72 CPUTot=72 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:4090:8
NodeAddr=4090-211 NodeHostName=4090-211 Version=24.05.4
OS=Linux 5.4.0-144-generic #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023
RealMemory=256000 AllocMem=0 FreeMem=1017826 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cpu-node,gpu-node
BootTime=2024-12-10T12:07:14 SlurmdStartTime=2024-12-12T10:00:03
LastBusyTime=2024-12-12T09:59:36 ResumeAfterTime=None
CfgTRES=cpu=72,mem=250G,billing=72,gres/gpu=8
AllocTRES=
CurrentWatts=0 AveWatts=0
NodeName=4090-212 Arch=x86_64 CoresPerSocket=36
CPUAlloc=0 CPUEfctv=72 CPUTot=72 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:4090:8
NodeAddr=4090-212 NodeHostName=4090-212 Version=24.05.4
OS=Linux 5.4.0-144-generic #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023
RealMemory=256000 AllocMem=0 FreeMem=1017886 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cpu-node,gpu-node
BootTime=2024-12-10T12:07:14 SlurmdStartTime=2024-12-12T10:00:24
LastBusyTime=2024-12-12T10:00:24 ResumeAfterTime=None
CfgTRES=cpu=72,mem=250G,billing=72,gres/gpu=8
AllocTRES=
CurrentWatts=0 AveWatts=0
NodeName=4090-213 Arch=x86_64 CoresPerSocket=36
CPUAlloc=0 CPUEfctv=72 CPUTot=72 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:4090:8
NodeAddr=4090-213 NodeHostName=4090-213 Version=24.05.4
OS=Linux 5.4.0-144-generic #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023
RealMemory=256000 AllocMem=0 FreeMem=1017887 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cpu-node,gpu-node
BootTime=2024-12-10T12:07:15 SlurmdStartTime=2024-12-12T10:00:31
LastBusyTime=2024-12-12T10:00:31 ResumeAfterTime=None
CfgTRES=cpu=72,mem=250G,billing=72,gres/gpu=8
AllocTRES=
CurrentWatts=0 AveWatts=0
CPU + GPU scheduling example
```bash
#!/bin/bash
#SBATCH --job-name=gpu_multi_node_job           # job name
#SBATCH --output=gpu_multi_node_output_%j.log   # stdout log
#SBATCH --error=gpu_multi_node_error_%j.log     # stderr log
#SBATCH --ntasks=2                              # request 2 tasks in total
#SBATCH --cpus-per-task=4                       # 4 CPU cores per task
#SBATCH --gres=gpu:1                            # request 1 GPU per node
#SBATCH --time=72:00:00                         # maximum runtime: 72 hours
#SBATCH --mem=32G                               # request 32 GB of memory
#SBATCH --nodes=2                               # request 2 nodes
#SBATCH --ntasks-per-node=1                     # 1 task per node
#SBATCH --partition=gpu-node                    # partition with GPU resources

# Print node information
echo "Job started on $(date)"
echo "Running on nodes: $(hostname)"

# Load environment modules (if needed)
# module load cuda/11.2
# module load python/3.8
pip install numpy torch

# Run the GPU task on each node
srun python3 /mnt/volume/slurm/long_gpu_task.py

echo "Job completed on $(date)"
```
The referenced /mnt/volume/slurm/long_gpu_task.py:

```python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time

# Make sure a GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if not torch.cuda.is_available():
    print("No GPU found, exiting...")
    exit()

# Generate some synthetic training data
num_samples = 1000
num_features = 100
X_train = torch.randn(num_samples, num_features).to(device)
y_train = torch.randint(0, 2, (num_samples,)).to(device)

# Build a simple neural network
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.layer1 = nn.Linear(num_features, 64)
        self.layer2 = nn.Linear(64, 32)
        self.layer3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = self.sigmoid(self.layer3(x))
        return x

model = SimpleNN().to(device)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Simulated training loop
print("Training started...")
for epoch in range(50):  # train for 50 epochs
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs.squeeze(), y_train.float())
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/50, Loss: {loss.item():.4f}")

# Simulate a long-running computation
print("Training complete, starting long computation...")
time.sleep(1800)  # simulate 30 minutes of computation (adjust as needed)
print("Task completed!")
```