1. Deployment Plan
| hostname | IP | Role | GPU | Partition |
|---|---|---|---|---|
| wg-8-1 | 10.20.8.1 | slurm-master | 4090 | nvidia-4090 |
| wg-4-11 | 10.20.4.11 | slurm-node | A100 | nvidia-A100 |
| wg-4-12 | 10.20.4.12 | slurm-node | A100 | nvidia-A100 |
| wg-4-13 | 10.20.4.13 | slurm-node | A100 | nvidia-A100 |
| wg-4-14 | 10.20.4.14 | slurm-node | A100 | nvidia-A100 |
| wg-4-15 | 10.20.4.15 | slurm-node | A100 | nvidia-A100 |
The master node runs the following components:
- mariadb或mysql
- nfs
- slurmdbd
- slurmctld
- slurmd
- munge
- slurm-web-agent
- slurm-web-gateway
The compute nodes run:
- slurmd
- munge
2. Master Node Setup
2.1 Basic environment
# Generate an SSH key pair
root@wg-8-1:~# ssh-keygen
# Add every node's hostname to /etc/hosts
vi /etc/hosts
10.20.8.1 wg-8-1
10.20.4.11 wg-4-11
10.20.4.12 wg-4-12
10.20.4.13 wg-4-13
10.20.4.14 wg-4-14
10.20.4.15 wg-4-15
# Set up passwordless SSH to all nodes
ssh-copy-id root@wg-8-1
ssh-copy-id root@wg-4-11
ssh-copy-id root@wg-4-12
ssh-copy-id root@wg-4-13
ssh-copy-id root@wg-4-14
ssh-copy-id root@wg-4-15
2.2 Install munge
# Install
apt install munge -y
systemctl enable --now munge.service
# Test munge
root@wg-8-1:~# munge -n | unmunge
STATUS: Success (0)
ENCODE_HOST: wg-8-1 (10.20.8.1)
ENCODE_TIME: 2024-11-28 21:45:03 +0800 (1732801503)
DECODE_TIME: 2024-11-28 21:45:03 +0800 (1732801503)
TTL: 300
CIPHER: aes128 (4)
MAC: sha256 (5)
ZIP: none (0)
UID: root (0)
GID: root (0)
LENGTH: 0
# Copy the munge key to the other nodes
scp /etc/munge/munge.key root@wg-4-11:/etc/munge/
scp /etc/munge/munge.key root@wg-4-12:/etc/munge/
scp /etc/munge/munge.key root@wg-4-13:/etc/munge/
scp /etc/munge/munge.key root@wg-4-14:/etc/munge/
scp /etc/munge/munge.key root@wg-4-15:/etc/munge/
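The scp commands above assume munge is already installed on each compute node (section 3.2), so that /etc/munge exists. After copying the key, its ownership and permissions must be correct and munge restarted on each node, otherwise the cross-node test below fails. A minimal sketch, run on each compute node:
# Fix key ownership/permissions and restart munge after the key is copied in
chown munge:munge /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
systemctl restart munge.service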
# Test munge connectivity to the other nodes
root@wg-8-1:~# munge -n | ssh wg-4-11 unmunge
STATUS: Success (0)
ENCODE_HOST: wg-8-1 (10.20.8.1)
ENCODE_TIME: 2024-11-28 21:52:07 +0800 (1732801927)
DECODE_TIME: 2024-11-28 21:52:07 +0800 (1732801927)
TTL: 300
CIPHER: aes128 (4)
MAC: sha256 (5)
ZIP: none (0)
UID: root (0)
GID: root (0)
LENGTH: 0
2.3 Install NFS
# Install
apt install nfs-kernel-server nfs-common -y
# Configure the NFS export
vi /etc/exports
/u01 *(rw,sync,no_root_squash)
# Apply the export configuration
exportfs -a
# Test the export
root@wg-8-1:~# showmount -e 10.20.8.1
Export list for 10.20.8.1:
/u01 *
# Mount (create the mount point first)
mkdir -p /nfs
mount -t nfs 10.20.8.1:/u01 /nfs
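If the master should keep this mount across reboots as well, an /etc/fstab entry mirroring section 3.3 can be added; a sketch, assuming the same /nfs mount point:
# Optional: persist the mount on the master
echo '10.20.8.1:/u01 /nfs nfs defaults,async 0 0' >> /etc/fstab
mount -a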
2.4 Install and configure the database
# Install
apt install mysql-server libmysql++-dev -y
# Configure
vi /etc/mysql/my.cnf
[mysqld]
innodb_buffer_pool_size=1024M
innodb_log_file_size=64M
innodb_lock_wait_timeout=900
# Start and enable at boot
systemctl enable --now mysql
# Create the slurm database user
sudo mysql
create user 'slurm'@'localhost' identified by '123456';
# Create the accounting database
create database slurm_acct_db;
# Grant the slurm user access to it
grant all on slurm_acct_db.* TO 'slurm'@'localhost';
# Create a second database for job completion records. Not mandatory, but recommended: without it the JobComp* settings in slurm.conf below would have to change.
create database slurm_job_db;
grant all on slurm_job_db.* TO 'slurm'@'localhost';
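To confirm the grants work, log in as the slurm user and list the visible databases; a quick check:
# Verify the slurm user can see slurm_acct_db and slurm_job_db (password 123456 as set above)
mysql -u slurm -p -e "SHOW DATABASES;"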
2.5 Build and install Slurm
https://www.schedmd.com/download-slurm/
# Download the Slurm source package
wget https://download.schedmd.com/slurm/slurm-24.05.4.tar.bz2
# Install the build tooling
apt-get install build-essential fakeroot devscripts equivs libswitch-perl -y
# Unpack, build the Debian packages, and install them
tar -xaf slurm-24.05.4.tar.bz2
mkdir slurm && mv slurm-24.05.4 slurm/
cd slurm/slurm-24.05.4
mk-build-deps -i debian/control
debuild -b -uc -us
cd ../ && dpkg -i *.deb
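A quick sanity check that the packages installed and the expected version is on the PATH:
# Confirm installed Slurm packages and binary version
dpkg -l | grep slurm
slurmd -V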
2.6 Configure Slurm
Create the following files in /etc/slurm:
cgroup.conf
# /etc/slurm/cgroup.conf
CgroupMountpoint=/sys/fs/cgroup
# Constrain job resources with the cgroup plugin
ConstrainCores=yes
ConstrainDevices=yes
ConstrainRAMSpace=yes
ConstrainSwapSpace=yes
slurm.conf
ClusterName=nansha
SlurmctldHost=wg-8-1
GresTypes=gpu
ProctrackType=proctrack/cgroup
PrologFlags=Contain
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
StateSaveLocation=/var/spool/slurmctld
TaskPlugin=task/affinity,task/cgroup
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0
SchedulerType=sched/backfill
SelectType=select/cons_tres
AccountingStorageHost=localhost
AccountingStoragePort=6819
AccountingStorageType=accounting_storage/slurmdbd
#AuthAltTypes=auth/jwt
#AuthAltParameters=jwt_key=/var/spool/slurm/statesave/jwt_hs256.key
JobCompHost=localhost
JobCompLoc=slurm_job_db
JobCompPass=123456
JobCompPort=3306
JobCompType=jobcomp/mysql
JobCompUser=slurm
#JobContainerType=
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/cgroup
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurm/slurmd.log
NodeName=wg-8-1 Gres=gpu:4090:8 CPUs=128 RealMemory=463500 Sockets=2 CoresPerSocket=32 ThreadsPerCore=2 State=UNKNOWN
NodeName=wg-4-[11-15] Gres=gpu:A100:8 CPUs=128 RealMemory=927900 Sockets=2 CoresPerSocket=32 ThreadsPerCore=2 State=UNKNOWN
PartitionName=nvidia-A100 Nodes=wg-4-[11-15] Default=YES MaxTime=INFINITE State=UP
PartitionName=nvidia-4090 Nodes=wg-8-1 Default=NO MaxTime=INFINITE State=UP
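The CPUs, Sockets, CoresPerSocket, ThreadsPerCore and RealMemory values in the NodeName lines should match the real hardware. Running slurmd -C on each machine prints a ready-made NodeName line to compare against (a check only, not part of slurm.conf):
# Print this machine's hardware as a NodeName line
slurmd -C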
slurmdbd.conf
# /etc/slurm/slurmdbd.conf
ArchiveEvents=yes
ArchiveJobs=yes
ArchiveResvs=yes
ArchiveSteps=no
ArchiveSuspend=no
ArchiveTXN=no
ArchiveUsage=no
AuthInfo=/var/run/munge/munge.socket.2
AuthType=auth/munge
DbdHost=localhost
DebugLevel=info
PurgeEventAfter=1month
PurgeJobAfter=12month
PurgeResvAfter=1month
PurgeStepAfter=1month
PurgeSuspendAfter=1month
PurgeTXNAfter=12month
PurgeUsageAfter=24month
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd.pid
SlurmUser=slurm
StoragePass=123456
StorageType=accounting_storage/mysql
StorageUser=slurm
StorageHost=localhost
StoragePort=3306
gres.conf
Fill in according to the actual hardware on each node:
NodeName=wg-8-1 Name=gpu Type=4090 File=/dev/nvidia[0-7]
NodeName=wg-4-[11-15] Name=gpu Type=A100 File=/dev/nvidia[0-7]
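Before relying on File=/dev/nvidia[0-7], it is worth confirming the GPU count and device files on each node; a quick check:
# Confirm GPUs and their device files
nvidia-smi -L
ls /dev/nvidia[0-9]*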
After the files are in place, set ownership and permissions (slurmdbd.conf in particular must be owned by SlurmUser and readable only by that user):
chown slurm:slurm /etc/slurm/*.conf
chmod 600 /etc/slurm/*.conf
2.7 Start the master node services
systemctl enable --now slurmdbd.service
systemctl enable --now slurmctld.service
systemctl enable --now slurmd.service
systemctl restart slurmdbd.service
systemctl restart slurmctld.service
systemctl restart slurmd.service
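If any service fails to start, the logs usually explain why; the usual checks (log paths follow the LogFile settings above):
# Check service state and recent logs
systemctl status slurmdbd slurmctld slurmd
tail -n 50 /var/log/slurm/slurmdbd.log /var/log/slurm/slurmctld.log /var/log/slurm/slurmd.log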
3. Compute Node Setup
Only one node is documented here; the remaining nodes are configured the same way.
3.1 Basic environment
# Generate an SSH key pair
root@wg-4-11:~# ssh-keygen
# Add every node's hostname to /etc/hosts
vi /etc/hosts
10.20.8.1 wg-8-1
10.20.4.11 wg-4-11
10.20.4.12 wg-4-12
10.20.4.13 wg-4-13
10.20.4.14 wg-4-14
10.20.4.15 wg-4-15
# Set up passwordless SSH to all nodes
ssh-copy-id root@wg-8-1
ssh-copy-id root@wg-4-11
ssh-copy-id root@wg-4-12
ssh-copy-id root@wg-4-13
ssh-copy-id root@wg-4-14
ssh-copy-id root@wg-4-15
# Install the NFS client
apt install -y nfs-common
3.2 Install munge
Follow section 2.2.
3.3 Mount NFS
apt install -y nfs-common
mkdir /nfs
mount -t nfs 10.20.8.1:/u01 /nfs
# Make the mount persistent across reboots
vi /etc/fstab
10.20.8.1:/u01 /nfs nfs defaults,async 0 0
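To confirm the fstab entry is valid and the share is reachable:
# Re-mount everything from fstab and check the share
mount -a
df -h /nfs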
3.4 Build and install Slurm
Follow section 2.5.
3.5 Configure Slurm
Copy the configuration files from section 2.6 to this node (see the sketch below):
- slurmdbd.conf does not need to be copied.
- Adjust gres.conf to match this node's actual hardware.
- The other files need no changes.
chmod 600 slurm.conf
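One way to push the files from the master, assuming the passwordless root SSH set up in 3.1 (a sketch; adjust the target hostname per node):
# Run on the master: copy the shared config files (slurmdbd.conf stays on the master)
scp /etc/slurm/slurm.conf /etc/slurm/cgroup.conf /etc/slurm/gres.conf root@wg-4-11:/etc/slurm/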
3.6 Start and enable at boot
Note: compute nodes only need to run slurmd.
systemctl enable --now slurmd.service
systemctl restart slurmd.service
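From the master you can then confirm the node has registered, for example:
# Run on the master: the node should report State=IDLE once healthy
scontrol show node wg-4-11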
4. Verify the Slurm Cluster
root@wg-8-1:~# sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
nvidia-A100 up infinite 5 idle wg-4-[11-15]
nvidia-4090* up infinite 1 idle wg-8-1
Note: when STATE shows idle, the cluster can schedule and dispatch jobs normally.
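Beyond sinfo, a short GPU job per partition is a useful functional test; a minimal sketch using standard srun options and the partition names from slurm.conf:
# Run nvidia-smi on one GPU in each partition
srun --partition=nvidia-A100 --gres=gpu:1 nvidia-smi
srun --partition=nvidia-4090 --gres=gpu:1 nvidia-smi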
5. Deploy slurm-web
Official documentation: https://docs.rackslab.io/slurm-web/install/quickstart.html. Follow the official guide to install it.