Ubuntu Slurm部署安装

一、部署规划

hostname IP 用途 显卡 分区
wg-8-1 10.20.8.1 slurm-master 4090 nvidia-4090
wg-4-11 10.20.4.11 slurm-node A100 nvidia-A100
wg-4-12 10.20.4.12 slurm-node A100 nvidia-A100
wg-4-13 10.20.4.13 slurm-node A100 nvidia-A100
wg-4-14 10.20.4.14 slurm-node A100 nvidia-A100
wg-4-15 10.20.4.15 slurm-node A100 nvidia-A100
master节点部署如下组件:
  • mariadb或mysql
  • nfs
  • slurmdbd
  • slurmctld
  • slurmd
  • munge
  • slurm-web-agent
  • slurm-web-gateway

node节点部署:

  • slurmd
  • munge

二、Master操作

2.1 基础环境

# 生成ssh公私钥
root@wg-8-1:~# ssh-keygen

# 将其他节点hostname加入/etc/hosts
vi /etc/hosts
10.20.8.1 wg-8-1
10.20.4.11 wg-4-11
10.20.4.12 wg-4-12
10.20.4.13 wg-4-13
10.20.4.14 wg-4-14
10.20.4.15 wg-4-15

# 对其他节点做免密
ssh-copy-id root@wg-8-1
ssh-copy-id root@wg-4-11
ssh-copy-id root@wg-4-12
ssh-copy-id root@wg-4-13
ssh-copy-id root@wg-4-14
ssh-copy-id root@wg-4-15

#

2.2 安装munge

# 安装
apt install munge libmunge-dev -y
systemctl enable --now munge.service 

# 测试munge
root@wg-8-1:~# munge -n | unmunge
STATUS:          Success (0)
ENCODE_HOST:     wg-8-1 (10.20.8.1)
ENCODE_TIME:     2024-11-28 21:45:03 +0800 (1732801503)
DECODE_TIME:     2024-11-28 21:45:03 +0800 (1732801503)
TTL:             300
CIPHER:          aes128 (4)
MAC:             sha256 (5)
ZIP:             none (0)
UID:             root (0)
GID:             root (0)
LENGTH:          0

# 拷贝munge.key到其他节点(拷贝后需在对应节点重启munge服务使新密钥生效)
scp /etc/munge/munge.key root@wg-4-11:/etc/munge/
scp /etc/munge/munge.key root@wg-4-12:/etc/munge/
scp /etc/munge/munge.key root@wg-4-13:/etc/munge/
scp /etc/munge/munge.key root@wg-4-14:/etc/munge/
scp /etc/munge/munge.key root@wg-4-15:/etc/munge/

# 测试和其他机器的连通性
root@wg-8-1:~# munge -n | ssh wg-4-11 unmunge
STATUS:          Success (0)
ENCODE_HOST:     wg-8-1 (10.20.8.1)
ENCODE_TIME:     2024-11-28 21:52:07 +0800 (1732801927)
DECODE_TIME:     2024-11-28 21:52:07 +0800 (1732801927)
TTL:             300
CIPHER:          aes128 (4)
MAC:             sha256 (5)
ZIP:             none (0)
UID:             root (0)
GID:             root (0)
LENGTH:          0

2.3 安装NFS

# 安装
apt install nfs-kernel-server nfs-common -y

# 配置nfs
vi /etc/exports
/u01 *(rw,sync,no_root_squash)

# 使配置生效
exportfs -a

# 测试
root@wg-8-1:~# showmount -e 10.20.8.1
Export list for 10.20.8.1:
/u01 *

# 挂载
mkdir -p /nfs
mount -t nfs 10.20.8.1:/u01 /nfs

2.4 安装配置数据库

# 安装
apt install mysql-server libmysql++-dev -y

# 配置
vi /etc/mysql/my.cnf

[mysqld]  
 innodb_buffer_pool_size=1024M  
 innodb_log_file_size=64M  
 innodb_lock_wait_timeout=900

# 启动&开机自启
systemctl enable --now mysql

# 设置用户
sudo mysql
create user 'slurm'@'localhost' identified by '123456';  

# 创建基础数据库  
create database slurm_acct_db;  

# 允许slurm用户访问  
grant all on slurm_acct_db.* TO 'slurm'@'localhost';  

# 创建另一个,这个不强制,但是建议,如果不创建,后面slurm配置需要改,所以还是创建吧  
create database slurm_job_db;  

grant all on slurm_job_db.* TO 'slurm'@'localhost';

2.5 编译安装slurm

https://www.schedmd.com/download-slurm/
# 下载slurm包
wget https://download.schedmd.com/slurm/slurm-24.05.4.tar.bz2

# 安装编译软件
apt-get install build-essential fakeroot devscripts equivs libswitch-perl -y

# 解压安装
tar -xaf slurm-24.05.4.tar.bz2

mkdir slurm && mv slurm-24.05.4 slurm/
cd slurm/slurm-24.05.4
mk-build-deps -i debian/control
debuild -b -uc -us

cd ../ && dpkg -i *.deb

2.6 配置slurm

在/etc/slurm 添加如下文件

cgroup.conf

# /etc/slurm/cgroup.conf
CgroupMountpoint=/sys/fs/cgroup

#the behavior of this particular plugin
ConstrainCores=yes
ConstrainDevices=yes
ConstrainRAMSpace=yes
ConstrainSwapSpace=yes

slurm.conf

ClusterName=nansha
SlurmctldHost=wg-8-1
GresTypes=gpu

ProctrackType=proctrack/cgroup
PrologFlags=Contain
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
StateSaveLocation=/var/spool/slurmctld
TaskPlugin=task/affinity,task/cgroup
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0

SchedulerType=sched/backfill
SelectType=select/cons_tres
AccountingStorageHost=localhost
AccountingStoragePort=6819
AccountingStorageType=accounting_storage/slurmdbd
#AuthAltTypes=auth/jwt
#AuthAltParameters=jwt_key=/var/spool/slurm/statesave/jwt_hs256.key
JobCompHost=localhost
JobCompLoc=slurm_job_db
JobCompPass=123456
JobCompPort=3306
JobCompType=jobcomp/mysql
JobCompUser=slurm
#JobContainerType=
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/cgroup
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurm/slurmd.log
NodeName=wg-8-1 Gres=gpu:4090:8 CPUs=128 RealMemory=463500 Sockets=2 CoresPerSocket=32 ThreadsPerCore=2 State=UNKNOWN
NodeName=wg-4-[11-15] Gres=gpu:A100:8 CPUs=128 RealMemory=927900 Sockets=2 CoresPerSocket=32 ThreadsPerCore=2 State=UNKNOWN

PartitionName=nvidia-A100 Nodes=wg-4-[11-15] Default=YES MaxTime=INFINITE State=UP
PartitionName=nvidia-4090 Nodes=wg-8-1 Default=NO MaxTime=INFINITE State=UP

slurmdbd.conf

# /etc/slurm/slurmdbd.conf
ArchiveEvents=yes
ArchiveJobs=yes
ArchiveResvs=yes
ArchiveSteps=no
ArchiveSuspend=no
ArchiveTXN=no
ArchiveUsage=no
AuthInfo=/var/run/munge/munge.socket.2
AuthType=auth/munge
DbdHost=localhost
DebugLevel=info
PurgeEventAfter=1month
PurgeJobAfter=12month
PurgeResvAfter=1month
PurgeStepAfter=1month
PurgeSuspendAfter=1month
PurgeTXNAfter=12month
PurgeUsageAfter=24month
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd.pid
SlurmUser=slurm
StoragePass=123456
StorageType=accounting_storage/mysql
StorageUser=slurm
StorageHost=localhost
StoragePort=3306

gres.conf

根据实际情况填写

NodeName=wg-8-1 Name=gpu Type=4090 File=/dev/nvidia[0-7]
NodeName=wg-4-[11-15] Name=gpu Type=A100 File=/dev/nvidia[0-7]

配置好后更改文件权限

chmod 600 *.conf

2.7 启动master节点

systemctl enable --now slurmdbd.service 
systemctl enable --now slurmctld.service 
systemctl enable --now slurmd.service

systemctl restart slurmdbd.service
systemctl restart slurmctld.service
systemctl restart slurmd.service

三、Node节点

这里仅记录一台节点操作,其余节点操作一致

3.1 基础环境
# 生成ssh公私钥
root@wg-4-11:~# ssh-keygen

# 将其他节点hostname加入/etc/hosts
vi /etc/hosts
10.20.8.1 wg-8-1
10.20.4.11 wg-4-11
10.20.4.12 wg-4-12
10.20.4.13 wg-4-13
10.20.4.14 wg-4-14
10.20.4.15 wg-4-15

# 对其他节点做免密
ssh-copy-id root@wg-8-1
ssh-copy-id root@wg-4-11
ssh-copy-id root@wg-4-12
ssh-copy-id root@wg-4-13
ssh-copy-id root@wg-4-14
ssh-copy-id root@wg-4-15

# 安装nfs客户端
apt install -y nfs-common

3.2 安装munge

参照2.2

3.3 挂载NFS

apt install -y nfs-common 
mkdir /nfs
mount -t nfs 10.20.8.1:/u01 /nfs

vi /etc/fstab
10.20.8.1:/u01 /nfs nfs  defaults,async   0 0

3.4 编译安装slurm

参照2.5

3.5 配置slurm

将2.6节中的配置文件拷贝至该节点 注:slurmdbd.conf不用拷贝 注:gres.conf 根据机器情况更改 注:其他文件不需要更改

chmod 600 slurm.conf 

3.6 启动&开机自启

注:node节点仅需启动slurmd即可

systemctl enable --now slurmd.service
systemctl restart slurmd.service

四、验证slurm集群

root@wg-8-1:~# sinfo
PARTITION    AVAIL  TIMELIMIT  NODES  STATE NODELIST
nvidia-A100*    up   infinite      5   idle wg-4-[11-15]
nvidia-4090     up   infinite      1   idle wg-8-1

注意:STATE为idle时集群可正常调度分发任务

五、slurm-web部署

官方文档:https://docs.rackslab.io/slurm-web/install/quickstart.html 按照官方文档安装即可

发表评论

您的邮箱地址不会被公开。 必填项已用 * 标注

Captcha Code