From e41009904e2346c71f3fe07fd2b671ec663b472f Mon Sep 17 00:00:00 2001
From: Stefano Alberto Russo <stefano.russo@gmail.com>
Date: Mon, 29 Nov 2021 01:33:05 +0100
Subject: [PATCH] Added the standalone computing resource service with Podman,
 Docker and Singularity support. Improved demo slurm cluster naming.

---
 docker-compose-dev.yml                        | 21 +++++++----
 rosetta/build                                 |  1 +
 services/slurmbase/slurm.conf                 | 19 +++++-----
 services/standaloneworker/Dockerfile          | 36 +++++++++++++++++++
 services/standaloneworker/entrypoint.sh       | 23 ++++++++++++
 services/standaloneworker/keys/id_rsa.pub     |  1 +
 services/standaloneworker/subgid              |  1 +
 services/standaloneworker/subuid              |  1 +
 .../rosetta/core_app/computing_managers.py    |  6 ++--
 .../management/commands/core_app_populate.py  |  8 ++---
 10 files changed, 94 insertions(+), 23 deletions(-)
 create mode 100755 services/standaloneworker/Dockerfile
 create mode 100644 services/standaloneworker/entrypoint.sh
 create mode 100644 services/standaloneworker/keys/id_rsa.pub
 create mode 100644 services/standaloneworker/subgid
 create mode 100644 services/standaloneworker/subuid

diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
index 16bc7b9..2e83dff 100644
--- a/docker-compose-dev.yml
+++ b/docker-compose-dev.yml
@@ -1,10 +1,10 @@
 version: '3'
 
 services:
-  slurmclustermaster-main:
+  slurmclustermaster:
     image: "rosetta/slurmclustermaster"
-    container_name: slurmclustermaster-main
-    hostname: slurmclustermaster-main
+    container_name: slurmclustermaster
+    hostname: slurmclustermaster
     environment:
       - SAFEMODE=False
     privileged: true
@@ -12,10 +12,10 @@ services:
       - ./data/shared:/shared
       # - ./data/singularity_cache:/rosetta/.singularity/cache # Not working, check permissions...
 
-  slurmclusterworker-one:
+  slurmclusterworker:
     image: "rosetta/slurmclusterworker"
-    container_name: slurmclusterworker-one
-    hostname: slurmclusterworker-one
+    container_name: slurmclusterworker
+    hostname: slurmclusterworker
     environment:
       - SAFEMODE=False
     privileged: true
@@ -23,6 +23,15 @@ services:
       - ./data/shared:/shared
       - /var/run/docker.sock:/var/run/docker.sock
 
+  standaloneworker:
+    image: "rosetta/standaloneworker"
+    container_name: standaloneworker
+    hostname: standaloneworker
+    privileged: true
+    volumes:
+      - ./data/shared:/shared
+      - /var/run/docker.sock:/var/run/docker.sock
+
   dregistry:
     container_name: dregistry
     hostname: dregistry
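A quick way to exercise the new compose entry above, once the rosetta/standaloneworker image has been built (see the build script change below); a minimal sketch, assuming the commands are run from the repository root:

    # Start only the new standalone worker from the dev stack
    docker-compose -f docker-compose-dev.yml up -d standaloneworker

    # Check that it is up and that the shared volume is mounted
    docker ps --filter name=standaloneworker
    docker exec standaloneworker ls /shared
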
diff --git a/rosetta/build b/rosetta/build
index 10dae55..4d4b0e1 100755
--- a/rosetta/build
+++ b/rosetta/build
@@ -34,6 +34,7 @@ if [[ "x$SERVICE" == "x" ]] ; then
     $BUILD_COMMAND services/slurmcluster -t rosetta/slurmcluster
     $BUILD_COMMAND services/slurmclustermaster -t rosetta/slurmclustermaster
     $BUILD_COMMAND services/slurmclusterworker -t rosetta/slurmclusterworker
+    $BUILD_COMMAND services/standaloneworker -t rosetta/standaloneworker
     $BUILD_COMMAND services/dregistry -t rosetta/dregistry
     $BUILD_COMMAND services/webapp -t rosetta/webapp
     $BUILD_COMMAND services/postgres -t rosetta/postgres
diff --git a/services/slurmbase/slurm.conf b/services/slurmbase/slurm.conf
index 74bb787..a2c46d1 100755
--- a/services/slurmbase/slurm.conf
+++ b/services/slurmbase/slurm.conf
@@ -2,7 +2,7 @@
 # Put this file on all nodes of your cluster.
 # See the slurm.conf man page for more information.
 #
-ControlMachine=slurmclustermaster-main
+ControlMachine=slurmclustermaster
 #ControlAddr=
 #BackupController=
 #BackupAddr=
@@ -155,16 +155,15 @@ SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
 #SuspendRate=
 #SuspendTime=
 #
-# Must add controller node explictly but don't place it into any partition
-NodeName=slurmclustermaster-main CPUs=1 State=UNKNOWN
-#NodeName=partitiona-instrument CPUs=1 State=UNKNOWN
-#NodeName=partitionb-instrument CPUs=1 State=UNKNOWN
-#NodeName=cris-instrument CPUs=1 State=UNKNOWN
+# Must add controller node explicitly but don't place it into any partition
+NodeName=slurmclustermaster CPUs=1 State=UNKNOWN
+#
 # COMPUTE NODES
-NodeName=slurmclusterworker-one CPUs=1 State=UNKNOWN
-#NodeName=slurmclusterworker-two CPUs=1 State=UNKNOWN
-PartitionName=partition1 Nodes=slurmclusterworker-one MaxTime=INFINITE State=UP
-#PartitionName=partition2 Nodes=slurmclusterworker-two MaxTime=INFINITE State=UP
+NodeName=slurmclusterworker CPUs=1 State=UNKNOWN
+#NodeName=slurmclusterworker-multi-one CPUs=1 State=UNKNOWN
+#NodeName=slurmclusterworker-multi-two CPUs=1 State=UNKNOWN
+PartitionName=partition1 Nodes=slurmclusterworker MaxTime=INFINITE State=UP
+#PartitionName=partition2 Nodes=slurmclusterworker-multi-one,slurmclusterworker-multi-two MaxTime=INFINITE State=UP
diff --git a/services/standaloneworker/Dockerfile b/services/standaloneworker/Dockerfile
new file mode 100755
index 0000000..66c9815
--- /dev/null
+++ b/services/standaloneworker/Dockerfile
@@ -0,0 +1,36 @@
+FROM quay.io/podman/stable:v3.2.3
+
+# This is necessary due to some base image permission errors.
+RUN chown -R podman:podman /home/podman
+
+# Rename the podman user and group to testuser
+RUN usermod -l testuser podman
+RUN usermod -d /home/testuser testuser
+RUN ln -s /home/podman /home/testuser
+RUN groupmod -n testuser podman
+
+# Replace the subordinate uid/gid mappings, now assigned to testuser
+COPY subuid /etc/subuid
+COPY subgid /etc/subgid
+
+#RUN dnf repolist
+#RUN dnf update --refresh
+RUN dnf install -y docker singularity openssh-server
+RUN ssh-keygen -A
+RUN mkdir /home/testuser/.ssh
+COPY keys/id_rsa.pub /home/testuser/.ssh/authorized_keys
+RUN dnf install -y python wget
+
+
+#----------------------
+# Entrypoint
+#----------------------
+
+# Copy entrypoint
+COPY entrypoint.sh /
+
+# Make the entrypoint executable
+RUN chmod 755 /entrypoint.sh
+
+# Set entrypoint
+ENTRYPOINT ["/entrypoint.sh"]
\ No newline at end of file
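The image can also be built and inspected directly, bypassing the entrypoint, to confirm that the podman user was renamed and the subordinate id ranges were installed; a sketch, assuming $BUILD_COMMAND in the build script resolves to a plain docker build and that the commands are run from the repository root:

    # Build the image the same way rosetta/build would
    docker build services/standaloneworker -t rosetta/standaloneworker

    # The renamed user and the copied subuid/subgid mappings should be visible
    docker run --rm --entrypoint id rosetta/standaloneworker testuser
    docker run --rm --entrypoint cat rosetta/standaloneworker /etc/subuid /etc/subgid
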
diff --git a/services/standaloneworker/entrypoint.sh b/services/standaloneworker/entrypoint.sh
new file mode 100644
index 0000000..fd2f047
--- /dev/null
+++ b/services/standaloneworker/entrypoint.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Exit on any error. More complex handling could be added in the future
+# (see https://stackoverflow.com/questions/4381618/exit-a-script-on-error)
+set -e
+
+# Fix FUSE permissions
+chmod 777 /dev/fuse
+
+#---------------------
+# Entrypoint command
+#---------------------
+
+if [[ "x$@" == "x" ]] ; then
+    echo "[INFO] Executing Docker entrypoint command: /usr/sbin/sshd -D"
+    /usr/sbin/sshd -D
+else
+    ENTRYPOINT_COMMAND="$@"
+    echo -n "[INFO] Executing Docker entrypoint command: "
+    echo $ENTRYPOINT_COMMAND
+    exec $ENTRYPOINT_COMMAND  # Unquoted on purpose, so the command string is word-split
+fi
+#exec sudo -i -u testuser /bin/bash -c "$ENTRYPOINT_COMMAND"
diff --git a/services/standaloneworker/keys/id_rsa.pub b/services/standaloneworker/keys/id_rsa.pub
new file mode 100644
index 0000000..9a0504b
--- /dev/null
+++ b/services/standaloneworker/keys/id_rsa.pub
@@ -0,0 +1 @@
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQC2n4wiLiRmE1sla5+w0IW3wwPW/mqhhkm7IyCBS+rGTgnts7xsWcxobvamNdD6KSLNnjFZbBb7Yaf/BvWrwQgdqIFVU3gRWHYzoU6js+lKtBjd0e2DAVGivWCKEkSGLx7zhx7uH/Jt8kyZ4NaZq0p5+SFHBzePdR/1rURd8G8+G3OaCPKqP+JQT4RMUQHC5SNRJLcK1piYdmhDiYEyuQG4FlStKCWLCXeUY2EVirNMeQIfOgbUHJsVjH07zm1y8y7lTWDMWVZOnkG6Ap5kB+n4l1eWbslOKgDv29JTFOMU+bvGvYZh70lmLK7Hg4CMpXVgvw5VF9v97YiiigLwvC7wasBHaASwH7wUqakXYhdGFxJ23xVMSLnvJn4S++4L8t8bifRIVqhT6tZCPOU4fdOvJKCRjKrf7gcW/E33ovZFgoOCJ2vBLIh9N9ME0v7tG15JpRtgIBsCXwLcl3tVyCZJ/eyYMbc3QJGsbcPGb2CYRjDbevPCQlNavcMdlyrNIke7VimM5aW8OBJKVh5wCNRpd9XylrKo1cZHYxu/c5Lr6VUZjLpxDlSz+IuTn4VE7vmgHNPnXdlxRKjLHG/FZrZTSCWFEBcRoSa/hysLSFwwDjKd9nelOZRNBvJ+NY48vA8ixVnk4WAMlR/5qhjTRam66BVysHeRcbjJ2IGjwTJC5Q== rosetta@rosetta.platform
diff --git a/services/standaloneworker/subgid b/services/standaloneworker/subgid
new file mode 100644
index 0000000..171e002
--- /dev/null
+++ b/services/standaloneworker/subgid
@@ -0,0 +1 @@
+testuser:10000:5000
\ No newline at end of file
diff --git a/services/standaloneworker/subuid b/services/standaloneworker/subuid
new file mode 100644
index 0000000..171e002
--- /dev/null
+++ b/services/standaloneworker/subuid
@@ -0,0 +1 @@
+testuser:10000:5000
\ No newline at end of file
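With the entrypoint above, a container started with no arguments runs sshd in the foreground, while any arguments are exec'd in its place. A sketch of both modes; the private key path is only a placeholder for whatever key matches services/standaloneworker/keys/id_rsa.pub, and the standaloneworker hostname is only resolvable from other services on the same compose network:

    # Default mode: no arguments, sshd -D serves the key baked into the image
    docker run -d --name standaloneworker-test --privileged rosetta/standaloneworker

    # Command mode: arguments replace sshd and are exec'd directly
    docker run --rm --privileged rosetta/standaloneworker echo hello from the worker

    # Log in as testuser (e.g. from the webapp container) with the matching private key
    ssh -i ~/.ssh/rosetta_id_rsa -o StrictHostKeyChecking=no testuser@standaloneworker
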
diff --git a/services/webapp/code/rosetta/core_app/computing_managers.py b/services/webapp/code/rosetta/core_app/computing_managers.py
index 37d1406..3660ddf 100644
--- a/services/webapp/code/rosetta/core_app/computing_managers.py
+++ b/services/webapp/code/rosetta/core_app/computing_managers.py
@@ -288,7 +288,7 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingMana
                 binds += ' -v{}:{}'.format(expanded_base_path, expanded_bind_path)
 
         # TODO: remove this hardcoding
-        prefix = 'sudo' if computing_host == 'slurmclusterworker-one' else ''
+        prefix = 'sudo' if computing_host == 'slurmclusterworker' else ''
 
         run_command  = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} '.format(computing_keys.private_key_file, computing_user, computing_host)
         run_command += '/bin/bash -c \'"rm -rf /tmp/{}_data && mkdir /tmp/{}_data && chmod 700 /tmp/{}_data && '.format(task.uuid, task.uuid, task.uuid)
@@ -334,7 +334,7 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingMana
             internal_stop_command = 'kill -9 {}'.format(task.id)
         elif container_runtime=='docker':
             # TODO: remove this hardcoding
-            prefix = 'sudo' if computing_host == 'slurmclusterworker-one' else ''
+            prefix = 'sudo' if computing_host == 'slurmclusterworker' else ''
             internal_stop_command = '{} docker stop {} && {} docker rm {}'.format(prefix,task.id,prefix,task.id)
         else:
             raise NotImplementedError('Container runtime {} not supported'.format(container_runtime))
@@ -368,7 +368,7 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingMana
             internal_view_log_command = 'cat /tmp/{}_data/task.log'.format(task.uuid)
         elif container_runtime=='docker':
             # TODO: remove this hardcoding
-            prefix = 'sudo' if computing_host == 'slurmclusterworker-one' else ''
+            prefix = 'sudo' if computing_host == 'slurmclusterworker' else ''
             internal_view_log_command = '{} docker logs {}'.format(prefix,task.id)
         else:
             raise NotImplementedError('Container runtime {} not supported'.format(container_runtime))
diff --git a/services/webapp/code/rosetta/core_app/management/commands/core_app_populate.py b/services/webapp/code/rosetta/core_app/management/commands/core_app_populate.py
index b3ac3be..754565b 100644
--- a/services/webapp/code/rosetta/core_app/management/commands/core_app_populate.py
+++ b/services/webapp/code/rosetta/core_app/management/commands/core_app_populate.py
@@ -275,11 +275,11 @@ to provide help, news and informations on your deployment. Or you can just ignor
                                                access_mode = 'ssh+cli',
                                                auth_mode = 'user_keys',
                                                wms = None,
-                                               conf = {'host': 'slurmclusterworker-one'},
-                                               container_runtimes = ['singularity'])
+                                               conf = {'host': 'standaloneworker'},
+                                               container_runtimes = ['singularity','podman'])
 
     # Add testuser extra conf for this computing resource
-    testuser.profile.add_extra_conf(conf_type = 'computing_user', object=demo_singlenode_computing, value= 'slurmtestuser')
+    testuser.profile.add_extra_conf(conf_type = 'computing_user', object=demo_singlenode_computing, value= 'testuser')
 
     # Demo cluster computing plus conf
     demo_slurm_computing = Computing.objects.create(name = 'Demo Cluster',
@@ -288,7 +288,7 @@ to provide help, news and informations on your deployment. Or you can just ignor
                                            access_mode = 'ssh+cli',
                                            auth_mode = 'user_keys',
                                            wms = 'slurm',
-                                           conf = {'host': 'slurmclustermaster-main', 'default_partition': 'partition1'},
+                                           conf = {'host': 'slurmclustermaster', 'default_partition': 'partition1'},
                                            container_runtimes = ['singularity'])
 
     # Add testuser extra conf for this computing resource
-- 
GitLab
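Since the Demo Single Node resource now points at the standalone worker and advertises both Singularity and Podman, the runtimes can be smoke-tested from inside the container (for example over the SSH access sketched earlier); a sketch, run as testuser so that the rootless subuid/subgid mappings are exercised:

    # The runtimes the populate script now advertises for this resource
    podman --version && singularity --version

    # Rootless Podman should work thanks to the testuser:10000:5000 mappings
    podman run --rm quay.io/podman/hello

    # Singularity can run a registry image through the docker:// URI
    singularity run docker://hello-world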