Skip to content

Commit

Permalink
Merge slurm sim lite into this repo
Browse files Browse the repository at this point in the history
  • Loading branch information
dvalters committed Nov 29, 2024
1 parent 0aa2242 commit 634061e
Show file tree
Hide file tree
Showing 17 changed files with 1,123 additions and 1 deletion.
File renamed without changes.
2 changes: 1 addition & 1 deletion docker-compose.yml → docker/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
services:
slurm-mira:
image: dvalts/slurm-sim-cirrus
image: decvalts/slurm_sim_lite
container_name: slurmsim
hostname: slurmsim
build:
Expand Down
132 changes: 132 additions & 0 deletions docker/slurm_sim_lite/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# Base image. CentOS 7 reached end of life on 2024-06-30.
FROM centos:7

LABEL desc="Slurm simulator made ready"

# The CentOS 7 mirrorlist service was retired at end of life, so the stock
# repo files can no longer resolve a mirror and every `yum install` in this
# file would fail. Disable the mirrorlist entries and point the commented-out
# baseurl entries at the archived package tree on vault.centos.org instead.
RUN \
    sed -i \
        -e 's|^mirrorlist=|#mirrorlist=|' \
        -e 's|^#[[:space:]]*baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|' \
        /etc/yum.repos.d/CentOS-*.repo

# Service helper scripts taken from the build context. cmd_start is also the
# image ENTRYPOINT, and both scripts are used during the build to start and
# stop mysqld.
COPY cmd_start /sbin/
COPY cmd_stop /sbin/


# Prepare the image for the later steps:
#   * make the helper scripts executable. 0755 keeps them writable by root
#     only; the entrypoint must not be world-writable.
#   * create /install_files, the staging directory for scripts copied later
#   * create the `slurm` account (home /home/slurm, bash shell, member of
#     the wheel group)
#   * install git, which is needed to clone the simulator sources
# NOTE(review): the slurm password is hard-coded to "slurm" and ends up in the
# image history; acceptable only for a throwaway simulator container.
RUN \
    chmod 0755 /sbin/cmd_start /sbin/cmd_stop && \
    mkdir /install_files && \
    useradd -d /home/slurm -ms /bin/bash slurm && \
    usermod -aG wheel slurm && \
    echo "slurm:slurm"|chpasswd && \
    echo "Added slurm user" && \
    yum -y install git && \
    yum clean all

# Stage the R script that installs the required R packages.
COPY package_install.R /install_files/

# Create the workspace as the slurm user so that everything under
# /home/slurm/slurm_sim_ws is owned by that account, then fetch the
# simulator tools repository into it.
USER slurm
RUN \
    mkdir /home/slurm/slurm_sim_ws && \
    mkdir /home/slurm/slurm_sim_ws/sim && \
    git clone https://github.com/ubccr-slurm-simulator/slurm_sim_tools.git \
        /home/slurm/slurm_sim_ws/slurm_sim_tools

# The package installation steps that follow need root again.
USER root


# Install the database (MariaDB server and headers), the C++ compiler, and
# Python 3.6 with its scientific packages.
# epel-release is installed in its own yum transaction, before the python36
# packages are requested, so the EPEL repository is already enabled by then.
# The yum cache is cleaned in this same layer so it is not baked into the image.
RUN \
    yum -y install mariadb-server && \
    yum -y install mariadb-devel && \
    echo "Done installing Mariadb" && \
    yum -y install gcc-c++ && \
    yum -y install epel-release && \
    yum -y install python36 python36-libs python36-devel python36-numpy python36-scipy python36-pip && \
    yum clean all

# Install the remaining tooling and bootstrap the database, in this order:
#   1. Python libraries (pymysql, pandas); pip's download cache is disabled
#      so it does not end up in the layer
#   2. R, Rcpp and TeX Live, then the R packages listed in package_install.R
#      ('texlive-*' is quoted so yum expands the glob, not the shell)
#   3. sudo and wget, then RStudio Server from its RPM; the downloaded RPM is
#      deleted in this same layer once installed
#   4. OpenSSH plus freshly generated host keys (rsa, ecdsa, ed25519)
#   5. MariaDB data directory initialisation, then a temporary mysqld start to
#      create the `slurm` database user with full privileges, then stop it
# This RUN executes as root, so no `sudo` prefix is needed on any command.
RUN \
    pip3 install --no-cache-dir pymysql && \
    pip3 install --no-cache-dir pandas && \
    echo "Python all installed" && \
    yum -y install R R-Rcpp R-Rcpp-devel && \
    yum -y install python-devel && \
    yum -y install 'texlive-*' && \
    echo "R all installed" && \
    Rscript /install_files/package_install.R && \
    echo "Installed R packages" && \
    yum -y install sudo && \
    yum -y install wget && \
    echo "Sudo, git, wget installed" && \
    wget https://download2.rstudio.org/server/centos6/x86_64/rstudio-server-rhel-1.2.5042-x86_64.rpm && \
    yum -y install rstudio-server-rhel-1.2.5042-x86_64.rpm && \
    rm -f rstudio-server-rhel-1.2.5042-x86_64.rpm && \
    yum -y install initscripts && \
    echo "Rstudio server installed" && \
    yum -y install openssh openssh-server openssh-clients openssl-libs && \
    mkdir /var/run/sshd && \
    ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' && \
    ssh-keygen -t ecdsa -f /etc/ssh/ssh_host_ecdsa_key -N '' && \
    ssh-keygen -t ed25519 -f /etc/ssh/ssh_host_ed25519_key -N '' && \
    echo "Ssh installed" && \
    chmod g+rw /var/lib/mysql /var/log/mariadb /var/run/mariadb && \
    mysql_install_db && \
    chown -R mysql:mysql /var/lib/mysql && \
    cmd_start mysqld && \
    mysql -e "create user 'slurm'@'localhost' identified by 'slurm';" && \
    mysql -e "GRANT ALL PRIVILEGES ON *.* TO 'slurm'@'localhost' IDENTIFIED BY 'slurm';" && \
    cmd_stop mysqld && \
    yum clean all


# Build as the slurm user so the source, build and install trees are all
# owned by that account.
USER slurm

# Clone the Slurm simulator, configure it out-of-tree in bld_opt, and install
# it under /home/slurm/slurm_sim_ws/slurm_opt.
# `make -j` without a number places no limit on parallel jobs and can exhaust
# memory on the build host, so parallelism is capped at the CPU count.
RUN \
    cd /home/slurm/slurm_sim_ws && \
    git clone https://github.com/ubccr-slurm-simulator/slurm_simulator.git && \
    mkdir bld_opt && \
    cd bld_opt && \
    ../slurm_simulator/configure --prefix=/home/slurm/slurm_sim_ws/slurm_opt --enable-simulator \
        --enable-pam --without-munge --enable-front-end --with-mysql-config=/usr/bin/ --disable-debug \
        CFLAGS="-g -O3 -D NDEBUG=1" && \
    make -j "$(nproc)" install


# Back to root: the chmod below needs it, and it is the user the entrypoint
# runs as.
USER root

# Stage the startup, test and simulation helper scripts.
COPY \
    startup_file.sh \
    initial_test.sh \
    micro_cluster_setup.py \
    micro_ws_config.sh \
    populate_slurmdb.sh \
    generate_job_trace.sh \
    run_sim.sh \
    check_results.R \
    /install_files/

# Open up everything in the staging directory to all users.
# NOTE(review): a+rwx is very permissive; confirm whether the slurm user
# really needs write access here before tightening it.
RUN chmod -R a+rwx /install_files

# Documented ports: 22 for ssh, 3306 for mysql, 8787 for RStudio Server
# (its default port).
EXPOSE 22 3306 8787

# cmd_start is the entrypoint; by default it is handed the startup script and
# the initial test script as arguments.
ENTRYPOINT ["/sbin/cmd_start"]
CMD ["/install_files/startup_file.sh","/install_files/initial_test.sh"]



84 changes: 84 additions & 0 deletions docker/slurm_sim_lite/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Slurm Simulator Docker and Apptainer Containers

```bash
docker build -f docker/slurm_sim/slurm_sim.Dockerfile -t nsimakov/slurm_sim:v3.0 .

# debug mode
# older docker uses --invoke
# BUILDX_EXPERIMENTAL=1 docker buildx build --invoke /bin/bash -f docker/slurm_sim/slurm_sim.Dockerfile -t nsimakov/slurm_sim:v3.0 .
BUILDX_EXPERIMENTAL=1 docker buildx debug build -f docker/slurm_sim/slurm_sim.Dockerfile -t nsimakov/slurm_sim:v3.0 .

# run
docker run -p 0.0.0.0:8888:8888 -it --rm \
--name slurmsim -h slurmsim \
-v $PWD/tutorials:/home/jovyan/work -v $PWD:/opt/slurm_sim_tools \
-v /tmp/.X11-unix:/tmp/.X11-unix -v /mnt/wslg:/mnt/wslg \
-e DISPLAY=$DISPLAY -e WAYLAND_DISPLAY=$WAYLAND_DISPLAY \
-e XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR -e PULSE_SERVER=$PULSE_SERVER \
nsimakov/slurm_sim:v3.0

docker run -it -v /tmp/.X11-unix:/tmp/.X11-unix \
-e DISPLAY=$DISPLAY -e WAYLAND_DISPLAY=$WAYLAND_DISPLAY \
-e XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR -e PULSE_SERVER=$PULSE_SERVER xclock

docker run -p 0.0.0.0:8888:8888 -it --rm \
--name slurmsim -h slurmsim \
-v $PWD/tutorials:/home/jovyan/work -v $PWD:/opt/slurm_sim_tools \
-v /tmp/.X11-unix:/tmp/.X11-unix -v /mnt/wslg:/mnt/wslg \
-e DISPLAY=$DISPLAY -e WAYLAND_DISPLAY=$WAYLAND_DISPLAY \
-e XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR -e PULSE_SERVER=$PULSE_SERVER \
-e NB_USER="slurm" \
-e NB_UID="1000" \
-e NB_GROUP="slurm" \
-e NB_GID="1000" \
-e CHOWN_HOME=yes \
nsimakov/slurm_sim:v3.0 bash


```
Done running hooks in: /usr/local/bin/start-notebook.d
Update jovyan's UID:GID to 1000:100
Running hooks in: /usr/local/bin/before-notebook.d as uid: 0 gid: 0
Done running hooks in: /usr/local/bin/before-notebook.d


Inside the container, run the following to set the time zone:

```
Run 'dpkg-reconfigure tzdata' if you wish to change it.
```

# Slurm Simulator Docker and Singularity Containers v1.2

```bash
docker run -p 0.0.0.0:8888:8888 -it --rm -h slurmsim nsimakov/slurm_sim:v3.0

```

## Building

From the repository root:

```bash
docker build -f ./docker/slurm_sim/Dockerfile -t nsimakov/ub-slurm-sim:v1.2 .
docker push nsimakov/ub-slurm-sim:v1.2
```

Running


## Singularity Installation

```bash
# Install Go: download the release archive, extract it into /usr/local,
# then delete the tar file. Replace the VERSION, OS and ARCH values as needed.
export VERSION=1.18 OS=linux ARCH=amd64 && \
wget https://dl.google.com/go/go$VERSION.$OS-$ARCH.tar.gz && \
sudo tar -C /usr/local -xzvf go$VERSION.$OS-$ARCH.tar.gz && \
rm go$VERSION.$OS-$ARCH.tar.gz

echo 'export PATH=/usr/local/go/bin:$PATH' >> ~/.bashrc && \
source ~/.bashrc

# Adjust VERSION as necessary.
export VERSION=3.9.7 && \
wget https://github.com/sylabs/singularity/releases/download/v${VERSION}/singularity-ce-${VERSION}.tar.gz && \
tar -xzf singularity-ce-${VERSION}.tar.gz && \
cd singularity-ce-${VERSION} && \
./mconfig && \
make -C builddir && \
sudo make -C builddir install

```
71 changes: 71 additions & 0 deletions docker/slurm_sim_lite/check_results.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/env Rscript

# This script gets the results from the simulation and runs some tests on them
# It tests if the requested features were given to the jobs
# Features: cpu type, gpu, big mem
# How implemented - each feature corresponds to a different type of node


library(RSlurmSimTools) # needs these libraries to run the tests
library(dplyr)

# Checks one job (one row of the joined trace/sacct data frame) for one feature.
#
# Arguments:
#   df.joined    data frame produced by joining the job trace with the sacct output
#   row_num      index of the row (job) to inspect
#   trace_col    name of the trace column holding the requested feature
#   trace_check  feature value this check applies to
#   sacct_col    name of the sacct column to inspect (the node list)
#   sacct_check  pattern that must appear in the sacct value when the feature was requested
#
# Returns TRUE when the feature was not requested (NA or a different value), or
# when it was requested and the sacct value matches sacct_check; FALSE otherwise.
# The *_check arguments make the function reusable for every kind of feature test.
check_nodes <- function(df.joined, row_num, trace_col, trace_check, sacct_col, sacct_check){
    requested <- df.joined[row_num, trace_col]  # feature requested in the trace
    assigned <- df.joined[row_num, sacct_col]   # what the simulator assigned

    # NA means no feature was requested, so there is nothing to verify
    if (is.na(requested)) {
        return(TRUE)
    }

    # a different feature was requested; this check does not apply
    if (!(requested == trace_check)) {
        return(TRUE)
    }

    # the feature was requested: the assigned value must contain the expected node marker
    if (!grepl(sacct_check, assigned)) {
        return(FALSE)
    }
    TRUE
}


# Load the submitted job trace (one row per submitted job)
job_trace <- read.csv(file="/home/slurm/slurm_sim_ws/slurm_sim_tools/reg_testing/micro_cluster/test_trace.csv")

# Load the simulator's job completion log (which jobs were assigned where, etc)
sacct_base <- read_sacct_out("/home/slurm/slurm_sim_ws/sim/micro/baseline/results/jobcomp.log")

# Join the two by job id so each row holds both the request and the outcome
joined <- left_join(job_trace, sacct_base, by = c("sim_job_id" = "local_job_id") )

done_well = TRUE # assumes did correctly; stays TRUE when there are no jobs to check

# Walk every row of the joined data frame.
# seq_len() yields an empty sequence for a zero-row frame, whereas 1:nrow()
# would iterate over 1 and 0 and fail inside check_nodes.
for(row in seq_len(nrow(joined)))
{
    # every requested feature must have been honoured (or not requested at all)
    done_well = check_nodes(joined, row, "sim_req_mem", 500000, "NodeList", "b") && # big mem
        check_nodes(joined, row, "sim_features", "CPU-M", "NodeList", "m") && # M cpu
        check_nodes(joined, row, "sim_features", "CPU-N", "NodeList", "n") && # N cpu
        check_nodes(joined, row, "sim_gres", "gpu:1", "NodeList", "g") && # 1 gpu
        check_nodes(joined, row, "sim_gres", "gpu:2", "NodeList", "g") # 2 gpu

    # stop at the first job whose assignment does not match its request
    if(!done_well)
    {
        # report the job id so the failure can be traced back
        jobid = joined[row, "sim_job_id"]
        print(paste("Id of incorrectly assigned job:", jobid))
        break
    }
}
# prints overall result
print("Did the simulator do well?.....")
print(done_well)
Loading

0 comments on commit 634061e

Please sign in to comment.