Skip to content
Snippets Groups Projects
Commit 074a2e7b authored by Stefano Alberto Russo's avatar Stefano Alberto Russo
Browse files

Moved to dynamically obtain container IDs when stopping tasks. Added Podman...

Moved to dynamically obtain container IDs when stopping tasks. Added Podman and Docker container names. Improved how task Process IDs and Job IDs are stored.
parent 5badfd1c
No related branches found
No related tags found
No related merge requests found
......@@ -132,7 +132,7 @@ class InternalStandaloneComputingManager(StandaloneComputingManager):
#run_command += ' -v {}/user-{}:/data'.format(settings.LOCAL_USER_DATA_DIR, task.user.id)
# Host name, image entry command
run_command += ' -h task-{} -d -t {}/{}:{}'.format(task.short_uuid, task.container.registry, task.container.image_name, task.container.image_tag)
run_command += ' -h task-{} --name task-{} -d -t {}/{}:{}'.format(task.short_uuid, task.short_uuid, task.container.registry, task.container.image_name, task.container.image_tag)
# Debug
logger.debug('Running new task with command="{}"'.format(run_command))
......@@ -140,19 +140,17 @@ class InternalStandaloneComputingManager(StandaloneComputingManager):
# Run the task
out = os_shell(run_command, capture=True)
if out.exit_code != 0:
logger.error('Got error in starting task: {}'.format(out))
raise Exception(out.stderr)
else:
tid = out.stdout
logger.debug('Created task with id: "{}"'.format(tid))
# Get task IP address
out = os_shell('sudo docker inspect --format \'{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}\' ' + tid + ' | tail -n1', capture=True)
out = os_shell('export CONTAINER_ID=$(sudo docker ps -a --filter name=task-'+task.short_uuid+' --format {{.ID}}) && sudo docker inspect --format \'{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}\' $CONTAINER_ID | tail -n1', capture=True)
if out.exit_code != 0:
raise Exception('Error: ' + out.stderr)
task_ip = out.stdout
# Set fields
task.id = tid
task.status = TaskStatuses.running
task.interface_ip = task_ip
task.interface_port = task_port
......@@ -160,17 +158,10 @@ class InternalStandaloneComputingManager(StandaloneComputingManager):
# Save
task.save()
# Wait 10 seconds to see if the task is still up...
def _stop_task(self, task):
# Delete the Docker container
standby_supported = False
if standby_supported:
stop_command = 'sudo docker stop {}'.format(task.id)
else:
stop_command = 'sudo docker stop {} && sudo docker rm {}'.format(task.id,task.id)
stop_command = 'export CONTAINER_ID=$(sudo docker ps -a --filter name=task-'+task.short_uuid+' --format {{.ID}}) && sudo docker stop $CONTAINER_ID && sudo docker rm $CONTAINER_ID'
out = os_shell(stop_command, capture=True)
if out.exit_code != 0:
......@@ -187,7 +178,7 @@ class InternalStandaloneComputingManager(StandaloneComputingManager):
def _get_task_log(self, task, **kwargs):
# View the Docker container log (attach)
view_log_command = 'sudo docker logs {}'.format(task.id,)
view_log_command = 'export CONTAINER_ID=$(sudo docker ps -a --filter name=task-'+task.short_uuid+' --format {{.ID}}) && sudo docker logs $CONTAINER_ID'
logger.debug(view_log_command)
out = os_shell(view_log_command, capture=True)
if out.exit_code != 0:
......@@ -341,13 +332,14 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingMana
run_command += '--network=private --uts=private --userns=keep-id '
#run_command += '-d -t {}/{}:{}'.format(task.container.registry, task.container.image_name, task.container.image_tag)
run_command += '-h task-{} --name task-{} -t {}/{}:{}'.format(task.short_uuid, task.short_uuid, task.container.registry, task.container.image_name, task.container.image_tag)
run_command += '&>> /tmp/{}_data/task.log & echo $({} ps -a --filter name=task-{} --format="{{.ID}}")"\''.format(task.uuid, container_engine, task.short_uuid)
run_command += '&>> /tmp/{}_data/task.log &"\''.format(task.uuid)
else:
raise NotImplementedError('Container engine {} not supported'.format(container_engine))
out = os_shell(run_command, capture=True)
if out.exit_code != 0:
logger.error('Got error in starting task: {}'.format(out))
raise Exception(out.stderr)
# Log
......@@ -357,8 +349,9 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingMana
task_uuid = task.uuid
task = Task.objects.get(uuid=task_uuid)
# Save pid echoed by the command above
task.id = out.stdout
# Save the task (container) id for Singularity, which is the PID echoed by the command above
if container_engine == 'singularity':
task.process_id = out.stdout
# Save
task.save()
......@@ -377,20 +370,23 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingMana
container_engine = task.computing.default_container_engine
if container_engine=='singularity':
internal_stop_command = 'kill -9 {}'.format(task.id)
internal_stop_command = 'kill -9 {}'.format(task.process_id)
elif container_engine in ['docker', 'podman']:
# TODO: remove this hardcoding
prefix = 'sudo' if (computing_host == 'slurmclusterworker' and container_engine=='docker') else ''
internal_stop_command = '{} {} stop {} && {} {} rm {}'.format(prefix,container_engine,task.id,prefix,container_engine,task.id)
internal_stop_command = 'export CONTAINER_ID=$('+prefix+' '+container_engine+' ps -a --filter name=task-'+task.short_uuid+' --format {{.ID}}) &&'
internal_stop_command += 'if [ "x$CONTAINER_ID" != "x" ]; then {} {} stop $CONTAINER_ID && {} {} rm $CONTAINER_ID; fi'.format(prefix,container_engine,prefix,container_engine)
else:
raise NotImplementedError('Container engine {} not supported'.format(container_engine))
stop_command = 'ssh -p {} -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "{}"\''.format(computing_port, computing_keys.private_key_file, computing_user, computing_host, internal_stop_command)
out = os_shell(stop_command, capture=True)
if out.exit_code != 0:
if ('No such process' in out.stderr) or ('No such container' in out.stderr) or ('no container' in out.stderr) or ('missing' in out.stderr):
pass
else:
logger.critical('Got error in stopping task: {}'.format(out))
raise Exception(out.stderr)
# Set task as stopped
......@@ -413,9 +409,7 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingMana
if container_engine=='singularity':
internal_view_log_command = 'cat /tmp/{}_data/task.log'.format(task.uuid)
elif container_engine in ['docker','podman']:
# TODO: remove this hardcoding
#prefix = 'sudo' if (computing_host == 'slurmclusterworker' and container_engine=='docker') else ''
#internal_view_log_command = '{} {} logs {}'.format(prefix,container_engine,task.id)
# TODO: consider podman/docker logs?
internal_view_log_command = 'cat /tmp/{}_data/task.log'.format(task.uuid)
else:
raise NotImplementedError('Container engine {} not supported'.format(container_engine))
......@@ -538,6 +532,7 @@ class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManag
out = os_shell(run_command, capture=True)
if out.exit_code != 0:
logger.error('Got error in starting task: {}'.format(out))
raise Exception(out.stderr)
# Log
......@@ -554,8 +549,8 @@ class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManag
task_uuid = task.uuid
task = Task.objects.get(uuid=task_uuid)
# Save job id as task id
task.id = job_id
# Save job id
task.job_id = job_id
# Set status (only fi we get here before the agent which sets the status as running via the API)
if task.status != TaskStatuses.running:
......@@ -571,7 +566,7 @@ class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManag
computing_user, computing_host, computing_port, computing_keys = get_ssh_access_mode_credentials(self.computing, task.user)
# Stop the task remotely
stop_command = 'ssh -o -p {} LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "scancel {}"\''.format(computing_port, computing_keys.private_key_file, computing_user, computing_host, task.id)
stop_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "scancel {}"\''.format(computing_keys.private_key_file, computing_user, computing_host, task.job_id)
out = os_shell(stop_command, capture=True)
if out.exit_code != 0:
raise Exception(out.stderr)
......
# Generated by Django 2.2.1 on 2023-10-07 12:49
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core_app', '0034_auto_20231007_1052'),
]
operations = [
migrations.RemoveField(
model_name='task',
name='id',
),
migrations.AddField(
model_name='task',
name='job_id',
field=models.CharField(blank=True, max_length=64, null=True, verbose_name='Job ID'),
),
migrations.AddField(
model_name='task',
name='process_id',
field=models.CharField(blank=True, max_length=64, null=True, verbose_name='Process ID'),
),
]
......@@ -274,9 +274,10 @@ class Task(models.Model):
name = models.CharField('Name', max_length=36, blank=False, null=False)
# Task management
id = models.CharField('ID', max_length=64, blank=True, null=True) # i.e. Slurm job id, singularity PID, docker hash
status = models.CharField('Status', max_length=36, blank=True, null=True)
created = models.DateTimeField('Created on', default=timezone.now)
process_id = models.CharField('Process ID', max_length=64, blank=True, null=True) # i.e. Singularity PID
job_id = models.CharField('Job ID', max_length=64, blank=True, null=True) # i.e. Slurm job id
# How to reach the task interface. The IP has to be intended either as the container IP if this is directly
# reachable (i.e. using a Docker or Kubernetes network) or as the host IP address, depending on the
......
......@@ -112,8 +112,8 @@
</tr> -->
<!-- <tr>
<td><b>ID</b></td>
<td>{{ task.id }}</td>
<td><b>UUID</b></td>
<td>{{ task.uuid }}</td>
</tr> -->
<tr>
......
......@@ -12,7 +12,7 @@
<hr>
<b>ID:</b> {{ data.task.id }} &nbsp; &nbsp;
<b>UUID:</b> {{ data.task.uuid }} &nbsp; &nbsp;
<b>Status:</b> {{ data.task.status }} &nbsp; &nbsp;
<b>Auto refresh:{{data.refresh}}</b>&nbsp;
{% if not data.refresh %} OFF {% else %} <a href="?uuid={{data.task.uuid}}">OFF</a> {% endif %} |
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment