Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
V
vospace-transfer-service
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
VOSpace INAF
vospace-transfer-service
Commits
2e7c0a35
Commit
2e7c0a35
authored
Sep 3, 2021
by
Cristiano Urban
Browse files
Options
Downloads
Patches
Plain Diff
Added basic logging and exception handling.
Signed-off-by:
Cristiano Urban
<
cristiano.urban@inaf.it
>
parent
82bd6235
No related branches found
No related tags found
No related merge requests found
Pipeline
#2298
passed
Sep 3, 2021
Stage: dockerize
Changes
1
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
transfer_service/retrieve_executor.py
+284
-184
284 additions, 184 deletions
transfer_service/retrieve_executor.py
with
284 additions
and
184 deletions
transfer_service/retrieve_executor.py
+
284
−
184
View file @
2e7c0a35
...
...
@@ -87,22 +87,39 @@ class RetrieveExecutor(TaskExecutor):
def
buildFileList
(
self
):
"""
Generates the list of all files to retrieve.
This method returns
'
True
'
on success,
'
False
'
on failure.
"""
try
:
try
:
self
.
dbConn
.
setPhase
(
self
.
jobId
,
"
EXECUTING
"
)
self
.
dbConn
.
setStartTime
(
self
.
jobId
)
except
Exception
:
self
.
logger
.
exception
(
"
FATAL: unable to update the file catalog.
"
)
return
False
else
:
self
.
logger
.
info
(
"
Job phase updated to EXECUTING.
"
)
self
.
logger
.
info
(
"
Building the list of the files to be retrieved...
"
)
# debug block...
if
os
.
path
.
exists
(
"
nodeList.txt
"
):
os
.
remove
(
"
nodeList.txt
"
)
nl
=
open
(
"
nodeList.txt
"
,
'
w
'
)
for
vospacePath
in
self
.
nodeList
:
nl
.
write
(
vospacePath
+
'
\n
'
)
nl
.
close
()
#
if os.path.exists("nodeList.txt"):
#
os.remove("nodeList.txt")
#
nl = open("nodeList.txt", 'w')
#
for vospacePath in self.nodeList:
#
nl.write(vospacePath + '\n')
#
nl.close()
# Obtain the storage type
try
:
self
.
storageType
=
self
.
dbConn
.
getOSPath
(
self
.
nodeList
[
0
])[
"
storageType
"
]
except
Exception
:
self
.
logger
.
exception
(
"
FATAL: unable to obtain the storage type.
"
)
return
False
for
vospacePath
in
self
.
nodeList
:
try
:
nodeInfo
=
self
.
dbConn
.
getOSPath
(
vospacePath
)
except
Exception
:
self
.
logger
.
exception
(
f
"
FATAL: unable to obtain the OS path for the VOSpace path
'
{
vospacePath
}
'
.
"
)
return
False
baseSrcPath
=
nodeInfo
[
"
baseSrcPath
"
]
srcPath
=
nodeInfo
[
"
fullPath
"
]
username
=
nodeInfo
[
"
username
"
]
...
...
@@ -134,16 +151,24 @@ class RetrieveExecutor(TaskExecutor):
self
.
fileList
.
append
(
fileInfo
.
copy
())
# debug block...
if
os
.
path
.
exists
(
"
fileList.txt
"
):
os
.
remove
(
"
fileList.txt
"
)
fl
=
open
(
"
fileList.txt
"
,
'
w
'
)
fl
.
write
(
json
.
dumps
(
self
.
fileList
,
indent
=
4
))
fl
.
close
()
#if os.path.exists("fileList.txt"):
# os.remove("fileList.txt")
#fl = open("fileList.txt", 'w')
#fl.write(json.dumps(self.fileList, indent = 4))
#fl.close()
except
Exception
:
self
.
logger
.
exception
(
"
FATAL: something went wrong while building the list of the files to be retrieved.
"
)
return
False
else
:
return
True
def
buildBlocks
(
self
):
"""
Algorithm to split data in blocks of a well known size.
This method returns
'
True
'
on success,
'
False
'
on failure.
"""
try
:
self
.
logger
.
info
(
"
Building the blocks data structure...
"
)
if
self
.
fileList
:
blockIdx
=
0
blockSize
=
0
...
...
@@ -180,15 +205,24 @@ class RetrieveExecutor(TaskExecutor):
blockSize
=
fileSize
if
self
.
fileList
:
self
.
numBlocks
=
blockIdx
+
1
try
:
self
.
dbConn
.
setTotalBlocks
(
self
.
jobId
,
self
.
numBlocks
)
except
Exception
:
self
.
logger
.
exception
(
"
FATAL: unable to set the total number of blocks in the database.
"
)
return
False
# debug block...
print
(
f
"
numBlocks =
{
self
.
numBlocks
}
"
)
if
os
.
path
.
exists
(
"
blocks.txt
"
):
os
.
remove
(
"
blocks.txt
"
)
fl
=
open
(
"
blocks.txt
"
,
'
w
'
)
fl
.
write
(
json
.
dumps
(
self
.
fileList
,
indent
=
4
))
fl
.
close
()
#print(f"numBlocks = {self.numBlocks}")
#if os.path.exists("blocks.txt"):
# os.remove("blocks.txt")
#fl = open("blocks.txt", 'w')
#fl.write(json.dumps(self.fileList, indent = 4))
#fl.close()
except
Exception
:
self
.
logger
.
exception
(
"
FATAL: something went wrong while building the blocks data structure.
"
)
return
False
else
:
return
True
def
retrieveCompleted
(
self
,
vospacePath
):
"""
...
...
@@ -201,7 +235,8 @@ class RetrieveExecutor(TaskExecutor):
"""
Retrieves data from a generic storage point (hot or cold).
"""
try
:
self
.
logger
.
info
(
"
Starting data retrieval...
"
)
# Loop on blocks
for
blockIdx
in
range
(
self
.
numBlocks
):
blockFileList
=
[
f
for
f
in
self
.
fileList
if
f
[
"
blockIdx
"
]
==
blockIdx
]
...
...
@@ -226,6 +261,7 @@ class RetrieveExecutor(TaskExecutor):
os
.
makedirs
(
destDirPath
,
exist_ok
=
True
)
sp
=
subprocess
.
run
([
"
rsync
"
,
"
-av
"
,
srcPath
,
destDirPath
],
capture_output
=
True
)
if
(
sp
.
returncode
or
sp
.
stderr
):
self
.
logger
.
error
(
f
"
FATAL: error during the copy process, returnCode =
{
sp
.
returncode
}
, stderr:
{
sp
.
stderr
}
"
)
return
False
# Remove files from file list at the end of the copy
...
...
@@ -238,7 +274,11 @@ class RetrieveExecutor(TaskExecutor):
# flag
for
vospacePath
in
self
.
nodeList
:
if
self
.
retrieveCompleted
(
vospacePath
):
try
:
self
.
dbConn
.
setAsyncTrans
(
vospacePath
,
False
)
except
Exception
:
self
.
logger
.
exception
(
"
FATAL: unable to update the file catalog.
"
)
return
False
# Empty the tape library frontend if the storage type
# is 'cold'
...
...
@@ -250,22 +290,48 @@ class RetrieveExecutor(TaskExecutor):
blockFileList
.
clear
()
self
.
procBlocks
+=
1
self
.
dbConn
.
updateProcessedBlocks
(
self
.
jobId
,
self
.
procBlocks
)
except
Exception
:
self
.
logger
.
exception
(
"
FATAL: something went wrong while retrieving the data.
"
)
return
False
else
:
return
True
def
update
(
self
):
def execute(self):
    """
    Runs the execution phase of the job by calling, in order,
    'buildFileList', 'buildBlocks' and 'retrieveData'.
    A stage is started only if the previous one succeeded.
    This method returns 'True' on success, 'False' on failure.
    """
    self.logger.info("++++++++++ Start of execution phase ++++++++++")
    # 'and' short-circuits, so once a stage reports failure the following
    # stages are not run (the bitwise '&' would evaluate all three).
    success = self.buildFileList() and self.buildBlocks() and self.retrieveData()
    if not success:
        # A failed phase is reported at 'error' level, like the other
        # 'FATAL' messages emitted by this class.
        self.logger.error("FATAL: something went wrong during the execution phase.")
        return False
    self.logger.info("++++++++++ End of execution phase ++++++++++")
    return True
def
update
(
self
,
status
):
"""
Updates the job status and sends an email to the user.
"""
try
:
results
=
[{
"
target
"
:
""
}]
results
[
0
][
"
target
"
]
=
self
.
jobObj
.
jobInfo
[
"
transfer
"
][
"
target
"
]
#self.dbConn.setResults(self.jobId, results)
m
=
Mailer
(
self
.
logger
)
m
.
addRecipient
(
self
.
adminEmail
)
userEmail
=
self
.
dbConn
.
getUserEmail
(
self
.
jobObj
.
ownerId
)
if
userEmail
!=
self
.
adminEmail
:
m
.
addRecipient
(
userEmail
)
self
.
jobObj
.
setResults
(
results
)
if
status
==
(
"
OK
"
):
self
.
jobObj
.
setPhase
(
"
COMPLETED
"
)
self
.
dbConn
.
insertJob
(
self
.
jobObj
)
self
.
dbConn
.
setEndTime
(
self
.
jobId
)
self
.
jobObj
.
endTime
=
datetime
.
datetime
.
now
().
isoformat
()
self
.
logger
.
info
(
"
Job phase updated to COMPLETED.
"
)
# Add a list of physical destination paths for each VOSpace node in the node list
self
.
logger
.
info
(
"
Generating physical destination paths for VOSpace nodes...
"
)
for
vospacePath
in
self
.
nodeList
:
nodeInfo
=
self
.
dbConn
.
getOSPath
(
vospacePath
)
baseSrcPath
=
nodeInfo
[
"
baseSrcPath
"
]
...
...
@@ -276,13 +342,9 @@ class RetrieveExecutor(TaskExecutor):
self
.
destPathList
.
append
(
destPath
)
self
.
jobObj
.
jobInfo
[
"
destPathList
"
]
=
self
.
destPathList
.
copy
()
m
=
Mailer
(
self
.
logger
)
m
.
addRecipient
(
self
.
adminEmail
)
userEmail
=
self
.
dbConn
.
getUserEmail
(
self
.
jobObj
.
ownerId
)
if
userEmail
!=
self
.
adminEmail
:
m
.
addRecipient
(
userEmail
)
msg
=
f
"""
########## VOSpace data retrieval procedure summary ##########
Dear user,
your job has been COMPLETED.
...
...
@@ -293,15 +355,46 @@ class RetrieveExecutor(TaskExecutor):
Your files are available and can be downloaded.
"""
m
.
setMessage
(
"
VOSpace data retrieve notification: COMPLETED
"
,
msg
)
else
:
self
.
jobObj
.
setPhase
(
"
ERROR
"
)
self
.
jobObj
.
setErrorType
(
"
fatal
"
)
self
.
jobObj
.
setErrorMessage
(
"
FATAL: something went wrong during the execution phase.
"
)
self
.
dbConn
.
insertJob
(
self
.
jobObj
)
self
.
dbConn
.
setEndTime
(
self
.
jobId
)
self
.
jobObj
.
endTime
=
datetime
.
datetime
.
now
().
isoformat
()
self
.
logger
.
info
(
"
Job phase updated to ERROR.
"
)
msg
=
f
"""
########## VOSpace data retrieval procedure summary ##########
Dear user,
your job has FAILED.
Job ID:
{
self
.
jobId
}
Job type:
{
self
.
jobObj
.
type
}
Owner ID:
{
self
.
jobObj
.
ownerId
}
"""
info
=
f
"""
ERROR:
the job was terminated due to an error that occurred
while retrieveing the data from the storage point.
This issue will be automatically reported to the administrator.
"""
msg
+=
info
m
.
setMessage
(
"
VOSpace data retrieve notification: ERROR
"
,
msg
)
# Send e-mail notification
m
.
setMessage
(
"
VOSpace data retrieve notification: Job COMPLETED
"
,
msg
)
m
.
send
()
except
Exception
:
self
.
logger
.
exception
(
f
"
FATAL: unable to update the database, job ID:
{
self
.
jobId
}
"
)
def
cleanup
(
self
):
"""
Cleanup method.
"""
self
.
logger
.
info
(
"
Cleanup...
"
)
self
.
fileList
.
clear
()
self
.
nodeList
.
clear
()
self
.
destPathList
.
clear
()
...
...
@@ -316,24 +409,31 @@ class RetrieveExecutor(TaskExecutor):
self
.
setDestinationQueueName
(
"
read_terminated
"
)
while
True
:
self
.
wait
()
if
self
.
srcQueue
.
len
()
>
0
:
try
:
srcQueueLen
=
self
.
srcQueue
.
len
()
destQueueLen
=
self
.
destQueue
.
len
()
except
Exception
:
self
.
logger
.
exception
(
"
Cache error: failed to retrieve queue length.
"
)
else
:
if
srcQueueLen
>
0
:
self
.
jobObj
=
self
.
srcQueue
.
getJob
()
self
.
jobId
=
self
.
jobObj
.
jobId
self
.
nodeList
=
self
.
jobObj
.
nodeList
.
copy
()
self
.
buildFileList
()
self
.
buildBlocks
()
result
=
self
.
retrieveData
()
if
result
:
self
.
update
()
self
.
cleanup
()
if
self
.
execute
():
self
.
update
(
"
OK
"
)
# debug block...
print
(
f
"
fileList =
{
self
.
fileList
}
"
)
print
(
f
"
nodeList =
{
self
.
nodeList
}
"
)
#
print(f"fileList = {self.fileList}")
#
print(f"nodeList = {self.nodeList}")
else
:
sys
.
exit
(
"
Failed to retrieve data!
"
)
if
self
.
destQueue
.
len
()
==
self
.
maxTerminatedJobs
:
self
.
update
(
"
ERROR
"
)
self
.
cleanup
()
try
:
if
destQueueLen
>=
self
.
maxTerminatedJobs
:
self
.
destQueue
.
extractJob
()
self
.
destQueue
.
insertJob
(
self
.
jobObj
)
self
.
srcQueue
.
extractJob
()
except
Exception
:
self
.
logger
.
exception
(
f
"
Failed to move job
{
self
.
jobObj
.
jobId
}
from
'
{
self
.
srcQueue
.
name
()
}
'
to
'
{
self
.
destQueue
.
name
()
}
'"
)
else
:
self
.
logger
.
info
(
f
"
Job
{
self
.
jobObj
.
jobId
}
MOVED from
'
{
self
.
srcQueue
.
name
()
}
'
to
'
{
self
.
destQueue
.
name
()
}
'"
)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment