diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
index 166832edcf430981c109d71077d520726d94150d..16bc7b9b3612c48414947cba4e14aaa8be399205 100644
--- a/docker-compose-dev.yml
+++ b/docker-compose-dev.yml
@@ -21,6 +21,7 @@ services:
     privileged: true
     volumes:
       - ./data/shared:/shared
+      - /var/run/docker.sock:/var/run/docker.sock
 
   dregistry:
     container_name: dregistry
diff --git a/services/slurmclusterworker/Dockerfile b/services/slurmclusterworker/Dockerfile
index f72bb5e303e8770822e3816e6d8f06215d6e648d..c98c874420c8e05f2993b6d4e5947cbad4fb4dac 100755
--- a/services/slurmclusterworker/Dockerfile
+++ b/services/slurmclusterworker/Dockerfile
@@ -1,2 +1,8 @@
 FROM rosetta/slurmcluster
 MAINTAINER Stefano Alberto Russo <stefano.russo@gmail.com>
+
+# Docker
+RUN apt-get install docker.io -y
+
+# Add slurmtestuser user to sudoers
+RUN adduser slurmtestuser sudo
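NOTE: The docker.sock bind mount and the docker.io install work together: the (already privileged) dev container can now drive the host's Docker daemon and launch sibling containers rather than nested ones. One hedged caveat: `RUN apt-get install docker.io -y` assumes the rosetta/slurmcluster base image still carries fresh apt package lists; the conventional form is `apt-get update && apt-get install -y docker.io` in a single RUN. A minimal Python sketch, not part of the patch, of what the mounted socket enables from inside the container:

    import subprocess

    def host_docker_available():
        """True if the host Docker daemon answers through the mounted socket."""
        try:
            out = subprocess.run(['docker', 'ps', '--quiet'],
                                 capture_output=True, text=True, timeout=10)
            return out.returncode == 0
        except (FileNotFoundError, subprocess.TimeoutExpired):
            return False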
diff --git a/services/webapp/code/rosetta/core_app/api.py b/services/webapp/code/rosetta/core_app/api.py
index 859559713d78173da2bee5ed844697215bcc3032..1e9494c63ff1e52a3f27ff5d335ccf738dbd26f3 100644
--- a/services/webapp/code/rosetta/core_app/api.py
+++ b/services/webapp/code/rosetta/core_app/api.py
@@ -323,18 +323,34 @@ print(port)
             logger.info('Setting task "{}" to ip "{}" and port "{}"'.format(task.uuid, task_interface_ip, task_interface_port))
             task.status = TaskStatuses.running
             task.interface_ip = task_interface_ip
-            if task.container.supports_custom_interface_port:
+
+            # Get container runtime
+            container_runtime = None
+            if task.computing_options:
+                container_runtime = task.computing_options.get('container_runtime', None)
+            if not container_runtime:
+                container_runtime = task.computing.default_container_runtime
+
+            if container_runtime == 'singularity':
+                # For Singularity, set this only if the container supports custom interface ports
+                if task.container.supports_custom_interface_port:
+                    task.interface_port = int(task_interface_port)
+            else:
+                # For all other container runtimes, set it in any case
                 task.interface_port = int(task_interface_port)
+
+            # Save the task
             task.save()
 
             # Notify the user that the task called back home
-            logger.info('Sending task ready mail notification to "{}"'.format(task.user.email))
-            mail_subject = 'Your Task "{}" is now starting up'.format(task.container.name)
-            mail_text = 'Hello,\n\nyour Task "{}" on {} is now starting up. Check logs or connect here: https://{}/tasks/?uuid={}\n\nThe Rosetta notifications bot.'.format(task.container.name, task.computing, settings.ROSETTA_HOST, task.uuid)
-            try:
-                send_email(to=task.user.email, subject=mail_subject, text=mail_text)
-            except Exception as e:
-                logger.error('Cannot send task ready email: "{}"'.format(e))
+            if settings.DJANGO_EMAIL_APIKEY:
+                logger.info('Sending task ready mail notification to "{}"'.format(task.user.email))
+                mail_subject = 'Your Task "{}" is now starting up'.format(task.container.name)
+                mail_text = 'Hello,\n\nyour Task "{}" on {} is now starting up. Check logs or connect here: https://{}/tasks/?uuid={}\n\nThe Rosetta notifications bot.'.format(task.container.name, task.computing, settings.ROSETTA_HOST, task.uuid)
+                try:
+                    send_email(to=task.user.email, subject=mail_subject, text=mail_text)
+                except Exception as e:
+                    logger.error('Cannot send task ready email: "{}"'.format(e))
 
         return HttpResponse('OK')
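NOTE: Two behavior changes land in this hunk: the interface port is now set unconditionally for every runtime except Singularity (where it still depends on supports_custom_interface_port), and the "task ready" email is sent only when settings.DJANGO_EMAIL_APIKEY is configured, so deployments without an email backend skip the notification instead of logging a send failure. The three-step runtime lookup introduced here reappears verbatim in every computing manager below; a sketch of a shared helper (hypothetical name, not in the patch) that would capture it once:

    def get_container_runtime(task):
        """Per-task 'container_runtime' override first, then the resource default."""
        container_runtime = None
        if task.computing_options:
            container_runtime = task.computing_options.get('container_runtime', None)
        if not container_runtime:
            container_runtime = task.computing.default_container_runtime
        return container_runtime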
diff --git a/services/webapp/code/rosetta/core_app/computing_managers.py b/services/webapp/code/rosetta/core_app/computing_managers.py
index 1e4dfa5de7138b5031558d369b46a3d4a362752b..37d1406adcdb05e0e20623595832fce3f1a9b7c4 100644
--- a/services/webapp/code/rosetta/core_app/computing_managers.py
+++ b/services/webapp/code/rosetta/core_app/computing_managers.py
@@ -73,7 +73,7 @@ class ComputingManager(object):
         return self._get_task_log(task, **kwargs)
 
 
-class SingleNodeComputingManager(ComputingManager):
+class StandaloneComputingManager(ComputingManager):
     pass
 
 
@@ -87,7 +87,7 @@ class SSHComputingManager(ComputingManager):
 
 
-class InternalSingleNodeComputingManager(SingleNodeComputingManager):
+class InternalStandaloneComputingManager(StandaloneComputingManager):
 
     def _start_task(self, task):
 
@@ -111,7 +111,7 @@ class InternalSingleNodeComputingManager(SingleNodeComputingManager):
         #run_command += ' -v {}/user-{}:/data'.format(settings.LOCAL_USER_DATA_DIR, task.user.id)
 
         # Host name, image entry command
-        run_command += ' -h task-{} -d -t {}/{}:{}'.format(task.uuid, task.container.registry, task.container.image_name, task.container.image_tag)
+        run_command += ' -h task-{} -d -t {}/{}:{}'.format(task.short_uuid, task.container.registry, task.container.image_name, task.container.image_tag)
 
         # Debug
         logger.debug('Running new task with command="{}"'.format(run_command))
@@ -176,7 +176,7 @@ class InternalSingleNodeComputingManager(SingleNodeComputingManager):
 
 
-class SSHSingleNodeComputingManager(SingleNodeComputingManager, SSHComputingManager):
+class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingManager):
 
     def _start_task(self, task, **kwargs):
         logger.debug('Starting a remote task "{}"'.format(self.computing))
@@ -188,8 +188,15 @@ class SSHSingleNodeComputingManager(SingleNodeComputingMana
         from .utils import get_webapp_conn_string
         webapp_conn_string = get_webapp_conn_string()
 
-        # Handle container runtime
-        if self.computing.default_container_runtime == 'singularity':
+        # Handle container runtime
+        container_runtime = None
+        if task.computing_options:
+            container_runtime = task.computing_options.get('container_runtime', None)
+        if not container_runtime:
+            container_runtime = task.computing.default_container_runtime
+
+        # Runtime-specific part
+        if container_runtime == 'singularity':
 
             #if not task.container.supports_custom_interface_port:
             #    raise Exception('This task does not support dynamic port allocation and is therefore not supported using singularity on Slurm')
@@ -239,10 +246,59 @@ class SSHSingleNodeComputingMana
             # Container part
             run_command+='docker://{}/{}:{} &>> /tmp/{}_data/task.log & echo \$!"\''.format(task.container.registry, task.container.image_name, task.container.image_tag, task.uuid)
-
+
+        elif container_runtime == 'docker':
+
+            # Set pass if any
+            authstring = ''
+            if not task.requires_proxy_auth and task.password:
+                authstring = ' -e AUTH_PASS={} '.format(task.password)
+
+            # Handle storages (binds)
+            binds = ''
+            storages = Storage.objects.filter(computing=self.computing)
+            for storage in storages:
+                if storage.type == 'generic_posix' and storage.bind_path:
+
+                    # Expand the base path
+                    expanded_base_path = storage.base_path
+                    if '$SSH_USER' in expanded_base_path:
+                        if storage.access_through_computing:
+                            expanded_base_path = expanded_base_path.replace('$SSH_USER', computing_user)
+                        else:
+                            raise NotImplementedError('Accessing a storage with ssh+cli without going through its computing resource is not implemented')
+                    if '$USER' in expanded_base_path:
+                        expanded_base_path = expanded_base_path.replace('$USER', task.user.name)
+
+                    # Expand the bind_path
+                    expanded_bind_path = storage.bind_path
+                    if '$SSH_USER' in expanded_bind_path:
+                        if storage.access_through_computing:
+                            expanded_bind_path = expanded_bind_path.replace('$SSH_USER', computing_user)
+                        else:
+                            raise NotImplementedError('Accessing a storage with ssh+cli without going through its computing resource is not implemented')
+                    if '$USER' in expanded_bind_path:
+                        expanded_bind_path = expanded_bind_path.replace('$USER', task.user.name)
+
+                    # Add the bind
+                    if not binds:
+                        binds = '-v{}:{}'.format(expanded_base_path, expanded_bind_path)
+                    else:
+                        binds += ' -v{}:{}'.format(expanded_base_path, expanded_bind_path)
+
+            # TODO: remove this hardcoding
+            prefix = 'sudo' if computing_host == 'slurmclusterworker-one' else ''
+
+            run_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} '.format(computing_keys.private_key_file, computing_user, computing_host)
+            run_command += '/bin/bash -c \'"rm -rf /tmp/{}_data && mkdir /tmp/{}_data && chmod 700 /tmp/{}_data && '.format(task.uuid, task.uuid, task.uuid)
+            run_command += 'wget {}/api/v1/base/agent/?task_uuid={} -O /tmp/{}_data/agent.py &> /dev/null && export TASK_PORT=\$(python /tmp/{}_data/agent.py 2> /tmp/{}_data/task.log) && '.format(webapp_conn_string, task.uuid, task.uuid, task.uuid, task.uuid)
+            run_command += '{} docker run -p \$TASK_PORT:{} {} {} '.format(prefix, task.container.interface_port, authstring, binds)
+            run_command += '-h task-{} -d -t {}/{}:{}'.format(task.short_uuid, task.container.registry, task.container.image_name, task.container.image_tag)
+            run_command += '"\''
+
         else:
-            raise NotImplementedError('Container runtime {} not supported'.format(self.computing.default_container_runtime))
+            raise NotImplementedError('Container runtime {} not supported'.format(container_runtime))
 
         out = os_shell(run_command, capture=True)
         if out.exit_code != 0:
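NOTE: In the docker branch above, base_path and bind_path go through identical $SSH_USER/$USER substitution rules before being joined into `-v base:bind` flags. A standalone rendering of those rules (hypothetical helper, handy for unit testing; the example values are made up):

    def expand_storage_path(path, computing_user, task_user, access_through_computing=True):
        """Expand the $SSH_USER and $USER placeholders of a storage path."""
        if '$SSH_USER' in path:
            if not access_through_computing:
                raise NotImplementedError('Accessing a storage with ssh+cli without going '
                                          'through its computing resource is not implemented')
            path = path.replace('$SSH_USER', computing_user)
        return path.replace('$USER', task_user)

    # expand_storage_path('/home/$SSH_USER/data', 'slurmtestuser', 'alice')
    # returns '/home/slurmtestuser/data'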
@@ -267,11 +323,28 @@ class SSHSingleNodeComputingMana
         # Get credentials
         computing_user, computing_host, computing_keys = get_ssh_access_mode_credentials(self.computing, task.user)
 
-        # Stop the task remotely
-        stop_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "kill -9 {}"\''.format(computing_keys.private_key_file, computing_user, computing_host, task.id)
+        # Handle container runtime
+        container_runtime = None
+        if task.computing_options:
+            container_runtime = task.computing_options.get('container_runtime', None)
+        if not container_runtime:
+            container_runtime = task.computing.default_container_runtime
+
+        if container_runtime == 'singularity':
+            internal_stop_command = 'kill -9 {}'.format(task.id)
+        elif container_runtime == 'docker':
+            # TODO: remove this hardcoding
+            prefix = 'sudo' if computing_host == 'slurmclusterworker-one' else ''
+            internal_stop_command = '{} docker stop {} && {} docker rm {}'.format(prefix, task.id, prefix, task.id)
+        else:
+            raise NotImplementedError('Container runtime {} not supported'.format(container_runtime))
+
+        stop_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "{}"\''.format(computing_keys.private_key_file, computing_user, computing_host, internal_stop_command)
 
         out = os_shell(stop_command, capture=True)
         if out.exit_code != 0:
-            if not 'No such process' in out.stderr:
+            if ('No such process' in out.stderr) or ('No such container' in out.stderr):
+                pass
+            else:
                 raise Exception(out.stderr)
 
         # Set task as stopped
@@ -284,9 +357,26 @@ class SSHSingleNodeComputingMana
         # Get credentials
         computing_user, computing_host, computing_keys = get_ssh_access_mode_credentials(self.computing, task.user)
 
-        # View log remotely
-        view_log_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "cat /tmp/{}_data/task.log"\''.format(computing_keys.private_key_file, computing_user, computing_host, task.uuid)
+        # Handle container runtime
+        container_runtime = None
+        if task.computing_options:
+            container_runtime = task.computing_options.get('container_runtime', None)
+        if not container_runtime:
+            container_runtime = task.computing.default_container_runtime
+
+        if container_runtime == 'singularity':
+            internal_view_log_command = 'cat /tmp/{}_data/task.log'.format(task.uuid)
+        elif container_runtime == 'docker':
+            # TODO: remove this hardcoding
+            prefix = 'sudo' if computing_host == 'slurmclusterworker-one' else ''
+            internal_view_log_command = '{} docker logs {}'.format(prefix, task.id)
+        else:
+            raise NotImplementedError('Container runtime {} not supported'.format(container_runtime))
+
+        # Prepare full command
+        view_log_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "{}"\''.format(computing_keys.private_key_file, computing_user, computing_host, internal_view_log_command)
 
+        # Execute
         out = os_shell(view_log_command, capture=True)
         if out.exit_code != 0:
             raise Exception(out.stderr)
@@ -294,7 +384,6 @@ class SSHSingleNodeComputingMana
         return out.stdout
 
 
-
 class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManager):
 
     def _start_task(self, task, **kwargs):
@@ -327,8 +416,15 @@ class SlurmSSHClusterComputingManag
         # Set output and error files
         sbatch_args += ' --output=\$HOME/{}.log --error=\$HOME/{}.log '.format(task.uuid, task.uuid)
 
-        # Submit the job
-        if task.computing.default_container_runtime == 'singularity':
+        # Handle container runtime
+        container_runtime = None
+        if task.computing_options:
+            container_runtime = task.computing_options.get('container_runtime', None)
+        if not container_runtime:
+            container_runtime = task.computing.default_container_runtime
+
+        # Runtime-specific part
+        if container_runtime == 'singularity':
 
             #if not task.container.supports_custom_interface_port:
             #    raise Exception('This task does not support dynamic port allocation and is therefore not supported using singularity on Slurm')
@@ -380,7 +476,7 @@ class SlurmSSHClusterComputingManag
             run_command+='docker://{}/{}:{} &> \$HOME/{}.log\\" > \$HOME/{}.sh && sbatch {} \$HOME/{}.sh"\''.format(task.container.registry, task.container.image_name, task.container.image_tag, task.uuid, task.uuid, sbatch_args, task.uuid)
 
         else:
-            raise NotImplementedError('Container runtime {} not supported'.format(task.computing.default_container_runtime))
+            raise NotImplementedError('Container runtime {} not supported'.format(container_runtime))
 
         out = os_shell(run_command, capture=True)
         if out.exit_code != 0:
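NOTE: _stop_task and _get_task_log now build only the runtime-specific inner command (kill vs. docker stop/rm, cat vs. docker logs) and share a single SSH wrapper, and stopping tolerates tasks that are already gone: 'No such process' comes from kill, 'No such container' from the Docker CLI. A compact equivalent of that tolerance check, as a sketch rather than patch code:

    def is_already_gone(stderr):
        """True when a non-zero exit only means the task no longer exists."""
        return any(marker in stderr for marker in ('No such process', 'No such container'))

    # Usage shape, mirroring the code above:
    #     if out.exit_code != 0 and not is_already_gone(out.stderr):
    #         raise Exception(out.stderr)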
diff --git a/services/webapp/code/rosetta/core_app/models.py b/services/webapp/code/rosetta/core_app/models.py
index 968a30340995c963915e9137ce316dc384a0d3b0..f98c409341308670e52021a959663c9ef52215db 100644
--- a/services/webapp/code/rosetta/core_app/models.py
+++ b/services/webapp/code/rosetta/core_app/models.py
@@ -239,19 +239,19 @@ class Computing(models.Model):
     def manager(self):
         from . import computing_managers
 
-        # Instantiate the computing manager based on type (if not already done)
+        # Hash table mapping
+        managers_mapping = {}
+        managers_mapping['cluster'+'ssh+cli'+'user_keys'+'slurm'] = computing_managers.SlurmSSHClusterComputingManager
+        managers_mapping['standalone'+'ssh+cli'+'user_keys'+'None'] = computing_managers.SSHStandaloneComputingManager
+        managers_mapping['standalone'+'internal'+'internal'+'None'] = computing_managers.InternalStandaloneComputingManager
+
+        # Instantiate the computing manager and return (if not already done)
         try:
             return self._manager
         except AttributeError:
-            if self.type == 'cluster' and self.access_mode == 'ssh+cli' and self.auth_mode == 'user_keys' and self.wms == 'slurm':
-                self._manager = computing_managers.SlurmSSHClusterComputingManager(self)
-            elif self.type == 'standalone' and self.access_mode == 'ssh+cli' and self.auth_mode == 'user_keys' and self.wms is None:
-                self._manager = computing_managers.SSHSingleNodeComputingManager(self)
-            elif self.type == 'standalone' and self.access_mode == 'internal' and self.auth_mode == 'internal' and self.wms is None:
-                self._manager = computing_managers.InternalSingleNodeComputingManager(self)
-            else:
-                raise ConsistencyException('Don\'t know how to instantiate a computing manager for computing resource of type "{}", access mode "{}" and WMS "{}"'.format(self.type, self.access_mode, self.wms))
+            self._manager = managers_mapping[self.type+self.access_mode+self.auth_mode+str(self.wms)](self)
         return self._manager
+
 
     #=========================
@@ -331,6 +331,10 @@ class Task(models.Model):
     def __str__(self):
         return str('Task "{}" of user "{}" running on "{}" in status "{}" created at "{}"'.format(self.name, self.user.email, self.computing, self.status, self.created))
 
+    @property
+    def short_uuid(self):
+        return str(self.uuid)[0:8]
+
     @property
     def color(self):
         string_int_hash = hash_string_to_int(self.name)
@@ -339,7 +343,7 @@
 
     @property
     def sharable_link(self):
-        return 'https://{}/t/{}'.format(settings.ROSETTA_HOST, str(self.uuid)[0:8])
+        return 'https://{}/t/{}'.format(settings.ROSETTA_HOST, self.short_uuid)
 
     @property
     def tcp_tunnel_host(self):
diff --git a/services/webapp/code/rosetta/core_app/templates/components/computing.html b/services/webapp/code/rosetta/core_app/templates/components/computing.html
index 0ad13b5d71d7090ba89a27cadec6fd23690986b1..1e6935f480c647664433b1805d61aa7f0cc6fc28 100644
--- a/services/webapp/code/rosetta/core_app/templates/components/computing.html
+++ b/services/webapp/code/rosetta/core_app/templates/components/computing.html
@@ -93,8 +93,8 @@
       <br/>
       {% endif %}
       <div class="image-version-box">
-        <b>Type:</b> {{ computing.type }}<br/>
-        <b>Arch:</b> {{ computing.arch }}
+        <b>Type:</b> {{ computing.type }} <font style="font-size:0.9em">({{ computing.arch }})</font>
+        <!-- <br/><b>Arch:</b> {{ computing.arch }} -->
         <br/>
         <b>Storages:</b>
         {% if not computing.storages.all %}
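NOTE: The mapping keys above are bare string concatenations, so an unsupported combination now surfaces as a KeyError rather than the previous descriptive ConsistencyException, and adjacent fields are not delimited. A hedged alternative sketch for the `except AttributeError:` body, using tuple keys and keeping the informative error (same three profiles as the patch):

    managers_mapping = {
        ('cluster', 'ssh+cli', 'user_keys', 'slurm'): computing_managers.SlurmSSHClusterComputingManager,
        ('standalone', 'ssh+cli', 'user_keys', 'None'): computing_managers.SSHStandaloneComputingManager,
        ('standalone', 'internal', 'internal', 'None'): computing_managers.InternalStandaloneComputingManager,
    }
    try:
        self._manager = managers_mapping[(self.type, self.access_mode, self.auth_mode, str(self.wms))](self)
    except KeyError:
        raise ConsistencyException('Don\'t know how to instantiate a computing manager for computing '
                                   'resource of type "{}", access mode "{}", auth mode "{}" and WMS "{}"'
                                   .format(self.type, self.access_mode, self.auth_mode, self.wms))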
diff --git a/services/webapp/code/rosetta/core_app/templates/components/task.html b/services/webapp/code/rosetta/core_app/templates/components/task.html
index effe0f1ccb5a21bf3e92b3ae3cc1456546993aa0..f7d83fb1a0a28c9cbf39a324df511b5441784fbc 100644
--- a/services/webapp/code/rosetta/core_app/templates/components/task.html
+++ b/services/webapp/code/rosetta/core_app/templates/components/task.html
@@ -107,9 +107,14 @@
       <td>{{ task.name }}</td>
     </tr> -->
 
-    <tr>
+    <!-- <tr>
       <td><b>ID</b></td>
       <td>{{ task.id }}</td>
+    </tr> -->
+
+    <tr>
+      <td><b>Short ID</b></td>
+      <td>{{ task.short_uuid }}</td>
     </tr>
 
     <tr>
@@ -143,6 +148,13 @@
       <td>{{ task.auth_token }}</td>
     </tr>
     {% endif %}
+
+    {% if task.computing_options %}
+    <tr>
+      <td><b>Computing options</b></td>
+      <td>{{ task.computing_options }}</td>
+    </tr>
+    {% endif %}
 
     <!-- <tr><td style="padding-right:0"><b>Direct link</b>
diff --git a/services/webapp/code/rosetta/core_app/templates/new_task.html b/services/webapp/code/rosetta/core_app/templates/new_task.html
index 807e554010b057c255154ef7cf15a7d243bf77fd..991c902cb115d1ba9c8a4b01ac0a08f9964d8875 100644
--- a/services/webapp/code/rosetta/core_app/templates/new_task.html
+++ b/services/webapp/code/rosetta/core_app/templates/new_task.html
@@ -91,7 +91,12 @@
     </tr>
 
     <tr>
-      <td valign="top"><b>Task password</b></td>
+      <td valign="top">
+        {% if request.user.profile.is_power_user %}
+        <b>Task password</b></td>
+        {% else %}
+        <b>Auth token</b></td>
+        {% endif %}
       <td>
         <input type="hidden" name="task_auth_token" value="{{data.task_auth_token}}">
@@ -121,25 +126,20 @@
         </select>
       </td>
     </tr>
-    {% else %}
-    <tr>
-      <td><b>Access method</b></td><td>
-        <select name="access_method" >
-          <option value="auto" selected>Auto</option>
-        </select>
-      </td>
-    </tr>
     {% endif %}
 
+    {% if data.task_computing.container_runtimes|length > 1 %}
     <tr>
-      <td><b>Container runtime</b></td><td>
-        <select name="run_using" >
-          <option value="default" selected>Default</option>
-          <!-- <option value="docker" selected>default</option> -->
-          <!-- <option value="singularity">Singularity</option> -->
+      <td><b>Container runtime</b></td><td>
+        <select name="container_runtime" >
+          <option value="" selected>Default</option>
+          {% for container_runtime in data.task_computing.container_runtimes %}
+          <option value="{{container_runtime}}">{{container_runtime}}</option>
+          {% endfor %}
         </select>
      </td>
    </tr>
+    {% endif %}
 
     {% if data.task_computing.wms == 'slurm' %}
     <tr>
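NOTE: The dropdown is rendered only when the computing resource offers more than one runtime, and it posts either an empty string (keep the default) or one of the resource's runtimes. The round trip into views.py, as plain Python with simulated values (the real code reads request.POST and data['task_computing'].container_runtimes):

    posted = {'container_runtime': 'docker'}       # what the form submits
    available = ['singularity', 'docker']          # the resource's container_runtimes

    computing_options = {}
    container_runtime = posted.get('container_runtime', None)
    if container_runtime:
        if container_runtime not in available:
            raise ValueError('Unknown container runtime "{}"'.format(container_runtime))
        computing_options['container_runtime'] = container_runtime

    assert computing_options == {'container_runtime': 'docker'}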
diff --git a/services/webapp/code/rosetta/core_app/views.py b/services/webapp/code/rosetta/core_app/views.py
index 2e75f1ae5884da4fd367e0c1a4c3c58518b8c695..21687edb32af4057aa73d3941321277030703d8a 100644
--- a/services/webapp/code/rosetta/core_app/views.py
+++ b/services/webapp/code/rosetta/core_app/views.py
@@ -586,7 +586,7 @@ def new_task(request):
                 task.requires_tcp_tunnel = True
 
             # Task access method
-            access_method = request.POST.get('access_method', None)
+            access_method = request.POST.get('access_method', 'auto')
             if access_method and access_method != 'auto' and not request.user.profile.is_power_user:
                 raise ErrorMessage('Sorry, only power users can set a task access method other than \'auto\'.')
             if access_method == 'auto':
@@ -605,12 +605,21 @@
             else:
                 raise ErrorMessage('Unknown access method "{}"'.format(access_method))
 
-            # Computing options # TODO: This is hardcoded thinking about Slurm and Singularity
+            # Computing options
+            computing_options = {}
+
+            # Container runtime if any set
+            container_runtime = request.POST.get('container_runtime', None)
+            if container_runtime:
+                if container_runtime not in data['task_computing'].container_runtimes:
+                    raise ErrorMessage('Unknown container runtime "{}"'.format(container_runtime))
+                computing_options['container_runtime'] = container_runtime
+
+            # CPUs, memory and partition if set
             computing_cpus = request.POST.get('computing_cpus', None)
             computing_memory = request.POST.get('computing_memory', None)
             computing_partition = request.POST.get('computing_partition', None)
-            computing_options = {}
 
             if computing_cpus:
                 try:
                     int(computing_cpus)
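NOTE: The CPU option uses a parse-to-validate idiom: int(computing_cpus) is called only to prove the posted string is numeric before the value is kept. A hedged sketch of that idiom as a reusable check (the function name and the positivity rule are assumptions, not patch code):

    def validated_int(value, name):
        """Parse a posted option to validate it, returning the integer."""
        try:
            parsed = int(value)
        except ValueError:
            raise ValueError('Cannot use "{}" as {}'.format(value, name))
        if parsed <= 0:
            raise ValueError('{} must be positive, got {}'.format(name, parsed))
        return parsed

    # validated_int('4', 'computing_cpus')  ->  4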