diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
index 166832edcf430981c109d71077d520726d94150d..16bc7b9b3612c48414947cba4e14aaa8be399205 100644
--- a/docker-compose-dev.yml
+++ b/docker-compose-dev.yml
@@ -21,6 +21,7 @@ services:
     privileged: true
     volumes:
       - ./data/shared:/shared
+      - /var/run/docker.sock:/var/run/docker.sock

   dregistry:
     container_name: dregistry
diff --git a/services/slurmclusterworker/Dockerfile b/services/slurmclusterworker/Dockerfile
index f72bb5e303e8770822e3816e6d8f06215d6e648d..c98c874420c8e05f2993b6d4e5947cbad4fb4dac 100755
--- a/services/slurmclusterworker/Dockerfile
+++ b/services/slurmclusterworker/Dockerfile
@@ -1,2 +1,8 @@
 FROM rosetta/slurmcluster
 MAINTAINER Stefano Alberto Russo <stefano.russo@gmail.com>
+
+# Docker
+RUN apt-get install docker.io -y
+
+# Add slurmtestuser user to sudoers
+RUN adduser slurmtestuser sudo
diff --git a/services/webapp/code/rosetta/core_app/api.py b/services/webapp/code/rosetta/core_app/api.py
index 859559713d78173da2bee5ed844697215bcc3032..1e9494c63ff1e52a3f27ff5d335ccf738dbd26f3 100644
--- a/services/webapp/code/rosetta/core_app/api.py
+++ b/services/webapp/code/rosetta/core_app/api.py
@@ -323,18 +323,34 @@ print(port)
         logger.info('Setting task "{}" to ip "{}" and port "{}"'.format(task.uuid, task_interface_ip, task_interface_port))
         task.status = TaskStatuses.running
         task.interface_ip = task_interface_ip
-        if task.container.supports_custom_interface_port:
+
+        # Get container runtime
+        container_runtime = None
+        if task.computing_options:
+            container_runtime = task.computing_options.get('container_runtime', None)
+        if not container_runtime:
+            container_runtime = task.computing.default_container_runtime
+
+        if container_runtime=='singularity':
+            # For Singularity, set this only if the container supports custom interface ports
+            if task.container.supports_custom_interface_port:
+                task.interface_port = int(task_interface_port)
+        else:
+            # For all other container runtimes, set it in any case
             task.interface_port = int(task_interface_port)
+
+        # Save the task
         task.save()

         # Notify the user that the task called back home
-        logger.info('Sending task ready mail notification to "{}"'.format(task.user.email))
-        mail_subject = 'Your Task "{}" is now starting up'.format(task.container.name)
-        mail_text = 'Hello,\n\nyour Task "{}" on {} is now starting up. Check logs or connect here: https://{}/tasks/?uuid={}\n\nThe Rosetta notifications bot.'.format(task.container.name, task.computing, settings.ROSETTA_HOST, task.uuid)
-        try:
-            send_email(to=task.user.email, subject=mail_subject, text=mail_text)
-        except Exception as e:
-            logger.error('Cannot send task ready email: "{}"'.format(e))
+        if settings.DJANGO_EMAIL_APIKEY:
+            logger.info('Sending task ready mail notification to "{}"'.format(task.user.email))
+            mail_subject = 'Your Task "{}" is now starting up'.format(task.container.name)
+            mail_text = 'Hello,\n\nyour Task "{}" on {} is now starting up. Check logs or connect here: https://{}/tasks/?uuid={}\n\nThe Rosetta notifications bot.'.format(task.container.name, task.computing, settings.ROSETTA_HOST, task.uuid)
+            try:
+                send_email(to=task.user.email, subject=mail_subject, text=mail_text)
+            except Exception as e:
+                logger.error('Cannot send task ready email: "{}"'.format(e))

         return HttpResponse('OK')
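
Note: the runtime fallback just added to api.py is repeated verbatim in the computing_managers.py hunks below. A minimal sketch of how it could be factored into a shared helper; the name resolve_container_runtime is hypothetical and not part of this patch, it only mirrors the logic shown in the diff:

def resolve_container_runtime(task):
    # Per-task computing option wins, otherwise fall back to the computing resource default
    container_runtime = None
    if task.computing_options:
        container_runtime = task.computing_options.get('container_runtime', None)
    if not container_runtime:
        container_runtime = task.computing.default_container_runtime
    return container_runtime

# Quick check with stand-in objects (not the real Django models):
class _Computing: default_container_runtime = 'singularity'
class _Task: computing = _Computing(); computing_options = {'container_runtime': 'docker'}
print(resolve_container_runtime(_Task()))  # -> docker
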
diff --git a/services/webapp/code/rosetta/core_app/computing_managers.py b/services/webapp/code/rosetta/core_app/computing_managers.py
index c0fb66363edb9cfc6a30db69da3d4ef05b6883ab..2bad86f340de41c66d12989b1fa4fde5606a0949 100644
--- a/services/webapp/code/rosetta/core_app/computing_managers.py
+++ b/services/webapp/code/rosetta/core_app/computing_managers.py
@@ -189,7 +193,11 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingManager):
         webapp_conn_string = get_webapp_conn_string()

         # Handle container runtime
-        container_runtime = task.computing_options.get('container_runtime', task.computing.default_container_runtime)
+        container_runtime = None
+        if task.computing_options:
+            container_runtime = task.computing_options.get('container_runtime', None)
+        if not container_runtime:
+            container_runtime = task.computing.default_container_runtime

         # Runtime-specific part
         if container_runtime == 'singularity':
@@ -242,10 +246,59 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingManager):

             # Container part
             run_command+='docker://{}/{}:{} &>> /tmp/{}_data/task.log & echo \$!"\''.format(task.container.registry, task.container.image_name, task.container.image_tag, task.uuid)
-
+
+        elif container_runtime == 'docker':
+
+            # Set pass if any
+            authstring = ''
+            if not task.requires_proxy_auth and task.password:
+                authstring = ' -e AUTH_PASS={} '.format(task.password)
+
+            # Handle storages (binds)
+            binds = ''
+            storages = Storage.objects.filter(computing=self.computing)
+            for storage in storages:
+                if storage.type == 'generic_posix' and storage.bind_path:
+
+                    # Expand the base path
+                    expanded_base_path = storage.base_path
+                    if '$SSH_USER' in expanded_base_path:
+                        if storage.access_through_computing:
+                            expanded_base_path = expanded_base_path.replace('$SSH_USER', computing_user)
+                        else:
+                            raise NotImplementedError('Accessing a storage with ssh+cli without going through its computing resource is not implemented')
+                    if '$USER' in expanded_base_path:
+                        expanded_base_path = expanded_base_path.replace('$USER', self.task.user.name)
+
+                    # Expand the bind_path
+                    expanded_bind_path = storage.bind_path
+                    if '$SSH_USER' in expanded_bind_path:
+                        if storage.access_through_computing:
+                            expanded_bind_path = expanded_bind_path.replace('$SSH_USER', computing_user)
+                        else:
+                            raise NotImplementedError('Accessing a storage with ssh+cli without going through its computing resource is not implemented')
+                    if '$USER' in expanded_bind_path:
+                        expanded_bind_path = expanded_bind_path.replace('$USER', self.task.user.name)
+
+                    # Add the bind
+                    if not binds:
+                        binds = '-v{}:{}'.format(expanded_base_path, expanded_bind_path)
+                    else:
+                        binds += ' -v{}:{}'.format(expanded_base_path, expanded_bind_path)
+
+            # TODO: remove this hardcoding
+            prefix = 'sudo' if computing_host == 'slurmclusterworker-one' else ''
+
+            run_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} '.format(computing_keys.private_key_file, computing_user, computing_host)
+            run_command += '/bin/bash -c \'"rm -rf /tmp/{}_data && mkdir /tmp/{}_data && chmod 700 /tmp/{}_data && '.format(task.uuid, task.uuid, task.uuid)
+            run_command += 'wget {}/api/v1/base/agent/?task_uuid={} -O /tmp/{}_data/agent.py &> /dev/null && export TASK_PORT=\$(python /tmp/{}_data/agent.py 2> /tmp/{}_data/task.log) && '.format(webapp_conn_string, task.uuid, task.uuid, task.uuid, task.uuid)
+            run_command += '{} docker run -p \$TASK_PORT:{} {} {} '.format(prefix, task.container.interface_port, authstring, binds)
+            run_command += '-h task-{} -d -t {}/{}:{}'.format(task.uuid, task.container.registry, task.container.image_name, task.container.image_tag)
+            run_command += '"\''
+
         else:
-            raise NotImplementedError('Container runtime {} not supported'.format(self.computing.default_container_runtime))
+            raise NotImplementedError('Container runtime {} not supported'.format(container_runtime))

         out = os_shell(run_command, capture=True)
         if out.exit_code != 0:
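
Note: a standalone sketch of the bind construction in the Docker branch above, rewritten against plain dictionaries instead of the Django Storage model so it can be run in isolation. The storage values are made up for illustration, and the access_through_computing check with its NotImplementedError branch is omitted:

def build_docker_binds(storages, computing_user, task_user):
    # Build the '-v<base_path>:<bind_path>' options the same way the Docker branch does
    binds = ''
    for storage in storages:
        if storage['type'] == 'generic_posix' and storage.get('bind_path'):
            base_path = storage['base_path'].replace('$SSH_USER', computing_user).replace('$USER', task_user)
            bind_path = storage['bind_path'].replace('$SSH_USER', computing_user).replace('$USER', task_user)
            binds += ('' if not binds else ' ') + '-v{}:{}'.format(base_path, bind_path)
    return binds

print(build_docker_binds([{'type': 'generic_posix', 'base_path': '/home/$SSH_USER', 'bind_path': '/storage'}],
                         computing_user='slurmtestuser', task_user='testuser'))
# -> -v/home/slurmtestuser:/storage
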
@@ -270,11 +323,28 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingManager):
         # Get credentials
         computing_user, computing_host, computing_keys = get_ssh_access_mode_credentials(self.computing, task.user)

-        # Stop the task remotely
-        stop_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "kill -9 {}"\''.format(computing_keys.private_key_file, computing_user, computing_host, task.id)
+        # Handle container runtime
+        container_runtime = None
+        if task.computing_options:
+            container_runtime = task.computing_options.get('container_runtime', None)
+        if not container_runtime:
+            container_runtime = task.computing.default_container_runtime
+
+        if container_runtime=='singularity':
+            internal_stop_command = 'kill -9 {}'.format(task.id)
+        elif container_runtime=='docker':
+            # TODO: remove this hardcoding
+            prefix = 'sudo' if computing_host == 'slurmclusterworker-one' else ''
+            internal_stop_command = '{} docker stop {} && {} docker rm {}'.format(prefix,task.id,prefix,task.id)
+        else:
+            raise NotImplementedError('Container runtime {} not supported'.format(container_runtime))
+
+        stop_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "{}"\''.format(computing_keys.private_key_file, computing_user, computing_host, internal_stop_command)
         out = os_shell(stop_command, capture=True)
         if out.exit_code != 0:
-            if not 'No such process' in out.stderr:
+            if ('No such process' in out.stderr) or ('No such container' in out.stderr):
+                pass
+            else:
                 raise Exception(out.stderr)

         # Set task as stopped
@@ -287,9 +357,26 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingManager):
         # Get credentials
         computing_user, computing_host, computing_keys = get_ssh_access_mode_credentials(self.computing, task.user)

-        # View log remotely
-        view_log_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "cat /tmp/{}_data/task.log"\''.format(computing_keys.private_key_file, computing_user, computing_host, task.uuid)
+        # Handle container runtime
+        container_runtime = None
+        if task.computing_options:
+            container_runtime = task.computing_options.get('container_runtime', None)
+        if not container_runtime:
+            container_runtime = task.computing.default_container_runtime
+
+        if container_runtime=='singularity':
+            internal_view_log_command = 'cat /tmp/{}_data/task.log'.format(task.uuid)
+        elif container_runtime=='docker':
+            # TODO: remove this hardcoding
+            prefix = 'sudo' if computing_host == 'slurmclusterworker-one' else ''
+            internal_view_log_command = '{} docker logs {}'.format(prefix,task.id)
+        else:
+            raise NotImplementedError('Container runtime {} not supported'.format(container_runtime))
+
+        # Prepare the full command
+        view_log_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "{}"\''.format(computing_keys.private_key_file, computing_user, computing_host, internal_view_log_command)
+        # Execute
         out = os_shell(view_log_command, capture=True)
         if out.exit_code != 0:
             raise Exception(out.stderr)
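
Note: the stop and log paths above use the same runtime switch; a sketch collecting the per-runtime remote commands in one place. The helper name internal_commands is hypothetical, prefix stands for the hardcoded sudo prefix that the TODOs above promise to remove, and task_id/task_uuid stand for task.id/task.uuid:

def internal_commands(container_runtime, task_id, task_uuid, prefix=''):
    # Remote commands run over SSH by _stop_task and _get_task_log, per container runtime
    if container_runtime == 'singularity':
        return {'stop': 'kill -9 {}'.format(task_id),
                'view_log': 'cat /tmp/{}_data/task.log'.format(task_uuid)}
    if container_runtime == 'docker':
        return {'stop': '{} docker stop {} && {} docker rm {}'.format(prefix, task_id, prefix, task_id),
                'view_log': '{} docker logs {}'.format(prefix, task_id)}
    raise NotImplementedError('Container runtime {} not supported'.format(container_runtime))

print(internal_commands('docker', task_id=42, task_uuid='abcd', prefix='sudo')['stop'])
# -> sudo docker stop 42 && sudo docker rm 42
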
@@ -297,7 +384,6 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingManager):

         return out.stdout

-
 class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManager):

     def _start_task(self, task, **kwargs):
@@ -331,7 +417,11 @@ class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManager):
         sbatch_args += ' --output=\$HOME/{}.log --error=\$HOME/{}.log '.format(task.uuid, task.uuid)

         # Handle container runtime
-        container_runtime = task.computing_options.get('container_runtime', task.computing.default_container_runtime)
+        container_runtime = None
+        if task.computing_options:
+            container_runtime = task.computing_options.get('container_runtime', None)
+        if not container_runtime:
+            container_runtime = task.computing.default_container_runtime

         # Runtime-specific part
         if container_runtime == 'singularity':
@@ -386,7 +476,7 @@ class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManager):
             run_command+='docker://{}/{}:{} &> \$HOME/{}.log\\" > \$HOME/{}.sh && sbatch {} \$HOME/{}.sh"\''.format(task.container.registry, task.container.image_name, task.container.image_tag, task.uuid, task.uuid, sbatch_args, task.uuid)

         else:
-            raise NotImplementedError('Container runtime {} not supported'.format(task.computing.default_container_runtime))
+            raise NotImplementedError('Container runtime {} not supported'.format(container_runtime))

         out = os_shell(run_command, capture=True)
         if out.exit_code != 0:
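
Side note on the _stop_task error handling: the new pass/else construct tolerates both the Singularity 'No such process' and the Docker 'No such container' cases when the task is already gone. An equivalent, slightly more compact form, illustrative only and not part of the patch:

def raise_unless_already_gone(stderr):
    # Re-raise only when the failure is not one of the known "already gone" messages
    if not any(msg in stderr for msg in ('No such process', 'No such container')):
        raise Exception(stderr)

raise_unless_already_gone('Error: No such container: task-1234')  # tolerated, no exception raised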