Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add error messaging logging for ams jobs SO107 SCMSUITE-8725 #182

Merged
merged 12 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ htmlcov/
nosetests.xml
coverage.xml
*,cover
unit_tests/result_images/*

# Translations
*.mo
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ This changelog is effective from the 2025 releases.
* GitHub workflows for CI and publishing to PyPI
* Build using `pyproject.toml`, addition of extras groups to install optional dependencies
* Logging of job summaries to CSV logfile
* Logging of AMS job error messages to stdout and logfile on job failure

### Changed
* Functions for optional packages (e.g. RDKit, ASE) are available even when these packages are not installed, but will raise an `MissingOptionalPackageError` when called
Expand Down
4 changes: 2 additions & 2 deletions core/formatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ def _format_job(job: Job) -> Dict[str, Any]:
"job_base_name": re.sub(r"\.\d+$", "", job.name),
"job_name": job.name,
"job_status": job.status,
"job_parent_name": "",
"job_parent_path": "",
"job_path": "",
"job_ok": "",
"job_check": "",
"job_get_errormsg": "",
"job_parent_name": "",
"job_parent_path": "",
}

if job.status not in [JobStatus.CREATED, JobStatus.STARTED]:
Expand Down
1 change: 1 addition & 0 deletions core/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ def finish(otherJM: Optional[Iterable["JobManager"]] = None):

# Register call to _finish on workflow end
atexit.register(_finish)
atexit.register(_logger.close)


# ===========================================================================
Expand Down
18 changes: 18 additions & 0 deletions core/jobrunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,25 @@ def _run_job(self, job, jobmanager):
job._execute(self)
job._finalize()
finally:
# Log job summaries
try:
# Log any error messages to the standard logger
if not job.ok(False) or not job.check():
# get_errormsg is not required by the base job class, but often implemented by convention
err_msg = (
job.get_errormsg()
if hasattr(job, "get_errormsg")
else "Could not determine error message. Please check the output manually."
)
err_lines = err_msg.splitlines()
max_lines = 30
if len(err_lines) > max_lines:
err_lines = err_lines[:max_lines]
err_lines.append("... (see output for full error)")
err_msg = str.join("\n", [f"\t{l}" for l in err_lines])
log(f"""Error message for job {job.name} was:\n{err_msg}""", 3)

# Log job summary to the csv logger
jobmanager.job_logger.log(job, level=3)
except: # logging should never throw, but best to make sure
pass
Expand Down
2 changes: 1 addition & 1 deletion core/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ def format(self, record: logging.LogRecord) -> str:
if self.include_level:
log_record["level"] = 28 - record.levelno
if self.log_time:
log_record["asctime"] = self.formatTime(record, self.datefmt)
log_record["logged_at"] = self.formatTime(record, self.datefmt)

if isinstance(record.msg, dict):
log_record.update(record.msg)
Expand Down
13 changes: 12 additions & 1 deletion core/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,21 @@ def guardian(self, *args, **kwargs):
raise ResultsError("Using Results associated with deleted job")

elif self.job.status in [JobStatus.CRASHED, JobStatus.FAILED]:
if func.__name__ == "wait": # waiting for crashed of failed job should not trigger any warnings/exceptions
# waiting for crashed of failed job should not trigger any warnings/exceptions
# and neither should checking the status from this job with 'ok', 'check' or 'get_errormsg'
suppress_errors = func.__name__ == "wait"
if not suppress_errors:
for frame in inspect.getouterframes(inspect.currentframe()):
cal, arg = _caller_name_and_arg(frame[0])
if arg == self.job and cal in ["ok", "check", "get_errormsg"]:
suppress_errors = True
break

if suppress_errors:
cal, arg = _caller_name_and_arg(inspect.currentframe())
if isinstance(arg, Results):
return func(self, *args, **kwargs)

if config.ignore_failure:
log("WARNING: Trying to obtain results of crashed or failed job {}".format(self.job.name), 3)
try:
Expand Down
58 changes: 46 additions & 12 deletions interfaces/adfsuite/ams.py
Original file line number Diff line number Diff line change
Expand Up @@ -2483,27 +2483,61 @@ def get_errormsg(self) -> Optional[str]:
if self.check():
return None
else:
# Something went wrong. The first place to check is the termination status on the ams.rkf.
# If the AMS driver stopped with a known error (called StopIt in the Fortran code), the error will be in there.
default_msg = "Could not determine error message. Please check the output manually."
msg = None
try:
msg = self.results.readrkf("General", "termination status")
if msg == "NORMAL TERMINATION with errors" or msg is None:
# Apparently this wasn't a hard stop in the middle of the job.
# Let's look for the last error in the logfile ...
msg = self.results.grep_file("ams.log", "ERROR: ")[-1].partition("ERROR: ")[2]
elif msg == "IN PROGRESS" and "$JN.err" in self.results:
# Something went wrong. The first place to check is the termination status on the ams.rkf.
# If the AMS driver stopped with a known error (called StopIt in the Fortran code), the error will be in there.
# Status can be:
# - NORMAL TERMINATION with errors: find the error from the ams log file
# - IN PROGRESS: probably means AMS was shut down hard from the outside
# e.g. it got SIGKILL from the scheduler for exceeding some resource limit
# find the last error from the stderr
# Note AMS can crash before even creating an rkf, then can just check the output and error files.
try:
termination_status = self.results.readrkf("General", "termination status")
except FileError:
termination_status = None

# First look for the last error in the logfile
try:
log_err_lines = self.results.grep_file("ams.log", "ERROR: ")
if log_err_lines:
return log_err_lines[-1].partition("ERROR: ")[2]
except FileError:
pass

# Then for a licensing issue, check the output logs directly
try:
license_err_lines = self.results.get_output_chunk(
begin="LICENSE INVALID",
end="License file",
inc_begin=True,
inc_end=True,
match=1,
)
if license_err_lines:
return str.join("\n", license_err_lines)
except FileError:
pass

# For any other issue fall back to the error file directly
if "$JN.err" in self.results:
# If the status is still "IN PROGRESS", that probably means AMS was shut down hard from the outside.
# E.g. it got SIGKILL from the scheduler for exceeding some resource limit.
# In this case useful information may be found on stderr.
with open(self.results["$JN.err"], "r") as err:
with open(self.results["$JN.err"]) as err:
errlines = err.read().splitlines()
for el in reversed(errlines):
if el != "" and not el.isspace():
msg = "Killed while IN PROGRESS: " + el
msg = (
f"Termination status: {termination_status}. Message: {el} . "
f"Check the files in {self.path} for more details."
)
break
except:
msg = "Could not determine error message. Please check the output manually."
return msg
pass
return msg if msg else default_msg

def hash_input(self) -> str:
"""Calculate the hash of the input file.
Expand Down
48 changes: 48 additions & 0 deletions unit_tests/test_amsjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,3 +563,51 @@ def get_runscript() -> str:
assert len(status_lines) == 4

logger.close()

def test_get_errormsg_populates_correctly_for_different_scenarios(self):
from scm.plams.core.errors import FileError

# Invalid license
job = AMSJob()
results = MagicMock(spec=AMSResults)
results.readrkf.side_effect = FileError()
results.grep_file.side_effect = FileError()
results.get_output_chunk.return_value = [
"LICENSE INVALID",
"---------------",
"Your license does not include module AMS version 2024.206 on this machine.",
"Module AMS",
"Version 2024.206",
"Machine: Foo",
"License file: ./license.txt",
]
job.results = results

assert (
job.get_errormsg()
== """LICENSE INVALID
---------------
Your license does not include module AMS version 2024.206 on this machine.
Module AMS
Version 2024.206
Machine: Foo
License file: ./license.txt"""
)

# Invalid input
results.grep_file.side_effect = None
results.grep_file.return_value = [
'<Dec05-2024> <12:03:49> ERROR: Input error: value "foo" found in line 13 for multiple choice key "Task" is not an allowed choice'
]
assert (
job.get_errormsg()
== 'Input error: value "foo" found in line 13 for multiple choice key "Task" is not an allowed choice'
)

# Error in calculation
results.readrkf.return_value = "NORMAL TERMINATION with errors"
results.readrkf.side_effect = None
results.grep_file.return_value = [
"<Dec05-2024> <13:44:55> ERROR: Geometry optimization failed! (Did not converge.)"
]
assert job.get_errormsg() == "Geometry optimization failed! (Did not converge.)"
31 changes: 30 additions & 1 deletion unit_tests/test_basejob.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from collections import namedtuple
import shutil
import re
from io import StringIO

from scm.plams.core.settings import Settings
from scm.plams.core.basejob import SingleJob
Expand Down Expand Up @@ -71,6 +72,9 @@ def get_runscript(self) -> str:
def check(self) -> bool:
return self.results.read_file(self._filename("err")) == ""

def get_errormsg(self) -> str:
return self.results.read_file(self._filename("err"))


class TestSingleJob:
"""
Expand Down Expand Up @@ -297,7 +301,7 @@ def test_job_summaries_logged(self, config):
with open(job_manager.job_logger.logfile) as f:
assert (
f.readline()
== """asctime,job_base_name,job_name,job_status,job_parent_name,job_parent_path,job_path,job_ok,job_check,job_get_errormsg
== """logged_at,job_base_name,job_name,job_status,job_path,job_ok,job_check,job_get_errormsg,job_parent_name,job_parent_path
"""
)

Expand All @@ -315,3 +319,28 @@ def test_job_summaries_logged(self, config):

job_manager.job_logger.close()
shutil.rmtree(job_manager.workdir)

def test_job_errors_logged_to_stdout(self, config):
from scm.plams.core.logging import get_logger

logger = get_logger(f"plams-{uuid.uuid4()}")

with patch("scm.plams.core.functions._logger", logger):
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
job1 = DummySingleJob(cmd="not_a_cmd")
job2 = DummySingleJob(cmd="x\n" * 50)

job1.run()
job2.run()

stdout = mock_stdout.getvalue()
assert re.match(
f".*Error message for job {job1.name} was:.* 3: not_a_cmd: (command ){{0,1}}not found",
stdout,
re.DOTALL,
)
assert re.match(
f".*Error message for job {job2.name} was:.* 3: x: (command ){{0,1}}not found.* 32: x: (command ){{0,1}}not found.*(see output for full error)",
stdout,
re.DOTALL,
)
58 changes: 52 additions & 6 deletions unit_tests/test_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ def test_configure_adds_fields_to_logging(self):
)
logger.close()

def test_logger_gets_field_names_from_dictionary(self):
def test_log_gets_field_names_from_dictionary(self):
with temp_file_path(suffix=".log") as temp_log_file:
name = str(uuid.uuid4())
logger = get_logger(name, "csv")
Expand All @@ -541,12 +541,12 @@ def test_logger_gets_field_names_from_dictionary(self):
)
logger.close()

def test_logger_cannot_change_field_names_or_add_fields(self):
def test_log_cannot_change_field_names_or_add_fields(self):
with patch("sys.stderr", new_callable=StringIO) as mock_stderr:
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
name = str(uuid.uuid4())
logger = get_logger(name, "csv")
logger.configure(3, 3)
logger.configure(3)

logger.log({"foo": "bar", "fizz": "buzz"}, 3)
logger.log({"abc": "bar2", "def": "buzz2"}, 3)
Expand All @@ -559,6 +559,52 @@ def test_logger_cannot_change_field_names_or_add_fields(self):
"""
)

logger.close()

def test_log_correctly_escapes_commas_and_multiline_strings(self):
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
name = str(uuid.uuid4())
logger = get_logger(name, "csv")
logger.configure(3)

logger.log(
{
"commas": "a,b,c,'(d,e,f)'",
"multi-line": """See the following:
a,b,c
d,e,f
""",
},
3,
)
logger.log(
{
"commas": 'm,"n,",,,o\'',
"multi-line": """See the following:
m,n,o

p,q,r
""",
},
3,
)

assert mock_stdout.getvalue() == (
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Screenshot 2024-12-06 at 12 54 23

"""commas,multi-line
"a,b,c,\'(d,e,f)\'","See the following:
a,b,c
d,e,f
"
"m,""n,"",,,o\'","See the following:
m,n,o

p,q,r
"
"""
)

logger.close()


class TestJobCSVFormatter:

Expand Down Expand Up @@ -592,11 +638,11 @@ def get_errormsg():
with open(temp_log_file) as tf:
assert (
tf.read()
== f"""job_base_name,job_name,job_status,job_parent_name,job_parent_path,job_path,job_ok,job_check,job_get_errormsg
== f"""job_base_name,job_name,job_status,job_path,job_ok,job_check,job_get_errormsg,job_parent_name,job_parent_path
test_job_csv_formatter,test_job_csv_formatter,created,,,,,,
test_job_csv_formatter,test_job_csv_formatter.002,created,,,,,,
test_job_csv_formatter,test_job_csv_formatter,successful,,,{path1},True,True,
test_job_csv_formatter,test_job_csv_formatter.002,crashed,,,{path2},False,False,some error
test_job_csv_formatter,test_job_csv_formatter,successful,{path1},True,True,,,
test_job_csv_formatter,test_job_csv_formatter.002,crashed,{path2},False,False,some error,,
"""
)

Expand Down
Loading