Skip to content
Snippets Groups Projects
Commit 2549207b authored by Stefano Alberto Russo
Browse files

Fixes

parent 9d33e22a
No related branches found
No related tags found
No related merge requests found
...@@ -25,5 +25,6 @@ RUN ln -s /var/log/slurm-llnl /var/log/slurm-wlm ...@@ -25,5 +25,6 @@ RUN ln -s /var/log/slurm-llnl /var/log/slurm-wlm
# Add slurmtestuser user # Add slurmtestuser user
RUN useradd slurmtestuser RUN useradd slurmtestuser
RUN cp -a /rosetta/.ssh /home/slurmtestuser RUN mkdir -p /home/slurmtestuser/.ssh
RUN cat /rosetta/.ssh/id_rsa.pub >> /home/slurmtestuser/.ssh/authorized_keys
RUN chown -R slurmtestuser:slurmtestuser /home/slurmtestuser RUN chown -R slurmtestuser:slurmtestuser /home/slurmtestuser
...@@ -154,6 +154,7 @@ class RemoteComputingManager(ComputingManager): ...@@ -154,6 +154,7 @@ class RemoteComputingManager(ComputingManager):
# Get computing host # Get computing host
host = task.computing.get_conf_param('host') host = task.computing.get_conf_param('host')
user = task.computing.get_conf_param('user')
# Get user keys # Get user keys
if task.computing.require_user_keys: if task.computing.require_user_keys:
...@@ -178,8 +179,8 @@ class RemoteComputingManager(ComputingManager): ...@@ -178,8 +179,8 @@ class RemoteComputingManager(ComputingManager):
hostname = socket.gethostname() hostname = socket.gethostname()
webapp_ip = socket.gethostbyname(hostname) webapp_ip = socket.gethostbyname(hostname)
run_command = 'ssh -i {} -4 -o StrictHostKeyChecking=no {} '.format(user_keys.private_key_file, host) run_command = 'ssh -i {} -4 -o StrictHostKeyChecking=no {}@{} '.format(user_keys.private_key_file, user, host)
run_command+= '"wget {}:8080/api/v1/base/agent/?task_uuid={} -O /tmp/agent_{}.py &> /dev/null && export BASE_PORT=\$(python /tmp/agent_{}.py 2> /tmp/{}.log) && '.format(webapp_ip, task.uuid, task.uuid, task.uuid, task.uuid) run_command += '/bin/bash -c \'"wget {}:8080/api/v1/base/agent/?task_uuid={} -O /tmp/agent_{}.py &> /dev/null && export BASE_PORT=\$(python /tmp/agent_{}.py 2> /tmp/{}.log) && '.format(webapp_ip, task.uuid, task.uuid, task.uuid, task.uuid)
run_command += 'export SINGULARITY_NOHTTPS=true && export SINGULARITYENV_BASE_PORT=\$BASE_PORT && {} '.format(authstring) run_command += 'export SINGULARITY_NOHTTPS=true && export SINGULARITYENV_BASE_PORT=\$BASE_PORT && {} '.format(authstring)
run_command += 'exec nohup singularity run --pid --writable-tmpfs --containall --cleanenv ' run_command += 'exec nohup singularity run --pid --writable-tmpfs --containall --cleanenv '
...@@ -198,7 +199,7 @@ class RemoteComputingManager(ComputingManager): ...@@ -198,7 +199,7 @@ class RemoteComputingManager(ComputingManager):
else: else:
raise NotImplementedError('Registry {} not supported'.format(task.container.registry)) raise NotImplementedError('Registry {} not supported'.format(task.container.registry))
run_command+='{}{} &>> /tmp/{}.log & echo \$!"'.format(registry, task.container.image, task.uuid) run_command+='{}{} &>> /tmp/{}.log & echo \$!"\''.format(registry, task.container.image, task.uuid)
else: else:
raise NotImplementedError('Container {} not supported'.format(task.container.type)) raise NotImplementedError('Container {} not supported'.format(task.container.type))
...@@ -236,9 +237,10 @@ class RemoteComputingManager(ComputingManager): ...@@ -236,9 +237,10 @@ class RemoteComputingManager(ComputingManager):
# Get computing host # Get computing host
host = task.computing.get_conf_param('host') host = task.computing.get_conf_param('host')
user = task.computing.get_conf_param('user')
# Stop the task remotely # Stop the task remotely
stop_command = 'ssh -i {} -4 -o StrictHostKeyChecking=no {} "kill -9 {}"'.format(user_keys.private_key_file, host, task.pid) stop_command = 'ssh -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "kill -9 {}"\''.format(user_keys.private_key_file, user, host, task.pid)
logger.debug(stop_command) logger.debug(stop_command)
out = os_shell(stop_command, capture=True) out = os_shell(stop_command, capture=True)
if out.exit_code != 0: if out.exit_code != 0:
...@@ -255,10 +257,10 @@ class RemoteComputingManager(ComputingManager): ...@@ -255,10 +257,10 @@ class RemoteComputingManager(ComputingManager):
user_keys = Keys.objects.get(user=task.user, default=True) user_keys = Keys.objects.get(user=task.user, default=True)
id_rsa_file = user_keys.private_key_file id_rsa_file = user_keys.private_key_file
else: else:
raise NotImplementedError('temote with no keys not yet') raise NotImplementedError('Remote with no keys not yet')
# View the Singularity container log # View the Singularity container log
view_log_command = 'ssh -i {} -4 -o StrictHostKeyChecking=no {} "cat /tmp/{}.log"'.format(id_rsa_file, host, task.uuid) view_log_command = 'ssh -i {} -4 -o StrictHostKeyChecking=no {} \'/bin/bash -c "cat /tmp/{}.log"\''.format(id_rsa_file, host, task.uuid)
logger.debug(view_log_command) logger.debug(view_log_command)
out = os_shell(view_log_command, capture=True) out = os_shell(view_log_command, capture=True)
if out.exit_code != 0: if out.exit_code != 0:
...@@ -275,7 +277,8 @@ class SlurmComputingManager(ComputingManager): ...@@ -275,7 +277,8 @@ class SlurmComputingManager(ComputingManager):
# Get computing host #Key Error ATM # Get computing host #Key Error ATM
host = 'slurmclustermaster-main' #task.computing.get_conf_param('host') host = 'slurmclustermaster-main' #task.computing.get_conf_param('host')
user = task.computing.get_conf_param('user')
# Get user keys # Get user keys
if task.computing.require_user_keys: if task.computing.require_user_keys:
user_keys = Keys.objects.get(user=task.user, default=True) user_keys = Keys.objects.get(user=task.user, default=True)
...@@ -286,7 +289,7 @@ class SlurmComputingManager(ComputingManager): ...@@ -286,7 +289,7 @@ class SlurmComputingManager(ComputingManager):
if task.container.type == 'singularity': if task.container.type == 'singularity':
if not task.dynamic_ports: if not task.container.dynamic_ports:
raise Exception('This task does not support dynamic port allocation and is therefore not supported using singularity on Slurm') raise Exception('This task does not support dynamic port allocation and is therefore not supported using singularity on Slurm')
# Set pass if any # Set pass if any
...@@ -299,9 +302,9 @@ class SlurmComputingManager(ComputingManager): ...@@ -299,9 +302,9 @@ class SlurmComputingManager(ComputingManager):
hostname = socket.gethostname() hostname = socket.gethostname()
webapp_ip = socket.gethostbyname(hostname) webapp_ip = socket.gethostbyname(hostname)
run_command = 'ssh -i {} -4 -o StrictHostKeyChecking=no {} '.format(user_keys.private_key_file, host) run_command = 'ssh -i {} -4 -o StrictHostKeyChecking=no {}@{} '.format(user_keys.private_key_file, user, host)
run_command += '"echo \\"#!/bin/bash\nwget {}:8080/api/v1/base/agent/?task_uuid={} -O /tmp/agent_{}.py &> /dev/null && export BASE_PORT=\\\\\\$(python /tmp/agent_{}.py 2> /tmp/{}.log) && '.format(webapp_ip, task.uuid, task.uuid, task.uuid, task.uuid) run_command += '\'bash -c "echo \\"#!/bin/bash\nwget {}:8080/api/v1/base/agent/?task_uuid={} -O /tmp/agent_{}.py &> /dev/null && export BASE_PORT=\\\\\\$(python /tmp/agent_{}.py 2> /tmp/{}.log) && '.format(webapp_ip, task.uuid, task.uuid, task.uuid, task.uuid)
run_command += 'export SINGULARITY_NOHTTPS=true && export SINGULARITYENV_BASE_PORT=\\\\\\$BASE_PORT && {} '.format(authstring) run_command += 'export SINGULARITY_NOHTTPS=true && export SINGULARITYENV_BASE_PORT=\\\\\\$BASE_PORT && {} '.format(authstring)
run_command += 'exec nohup singularity run --pid --writable-tmpfs --containall --cleanenv ' run_command += 'exec nohup singularity run --pid --writable-tmpfs --containall --cleanenv '
...@@ -321,7 +324,7 @@ class SlurmComputingManager(ComputingManager): ...@@ -321,7 +324,7 @@ class SlurmComputingManager(ComputingManager):
else: else:
raise NotImplementedError('Registry {} not supported'.format(task.container.registry)) raise NotImplementedError('Registry {} not supported'.format(task.container.registry))
run_command+='{}{} &> /tmp/{}.log\\" > /tmp/{}.sh && sbatch -p partition1 /tmp/{}.sh"'.format(registry, task.container.image, task.uuid, task.uuid, task.uuid) run_command+='{}{} &> /tmp/{}.log\\" > /tmp/{}.sh && sbatch -p partition1 /tmp/{}.sh"\''.format(registry, task.container.image, task.uuid, task.uuid, task.uuid)
else: else:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment