Created
May 19, 2025 03:59
-
-
Save manodeep/13bed6e7f74d38ddf90a7a4f3d888cb9 to your computer and use it in GitHub Desktop.
A Python script to filter out (gadi-specific) MPI library errors reported by valgrind
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Written by Manodeep Sinha, May 2025 to filter out valgrind error logs.
# Regex idea extended from https://stackoverflow.com/a/34407168
# Untested. Use at your own risk. - Manodeep Sinha, May 19, 2025
import fileinput | |
import re | |
def main(discard_string_list=None, always_print_if_main_is_present=False):
    """
    Filter a valgrind log, printing only the error chunks deemed relevant.

    Input is read with ``fileinput.input()``, i.e. from the files named on the
    command line, or from standard input when no files are given.

    An error chunk is defined as follows:
      1. It starts with a line matching the START regex (a type of error
         reported by valgrind).
      2. It ends at a line matching the STOP regex, which is a line holding
         only "==<pid>== " (i.e., an otherwise empty line), or at the end of
         the input. The STOP line itself is not part of the chunk.
      3. It is only considered for printing when the GOOD regex (a source
         file name followed by a line number) matches more than two times
         within the chunk.

    Parsing the log into chunks does not by itself remove errors that are
    irrelevant to the user. To do that, the last two lines of each chunk are
    searched for the substrings in ``discard_string_list``; if any of them is
    present, the entire chunk is discarded.

    Ideally such errors would be silenced with valgrind suppression files, but
    that requires repeatedly re-running valgrind with an updated suppression
    file. This filter instead works on the *same* valgrind log, so the
    application only needs to be run once.

    Known limitation:
      Since the filter is crude and only looks at the last two lines of a
      chunk, a real error whose source is in user code but whose execution
      ends in library code can be filtered out. To reduce the chance of this
      happening, ``always_print_if_main_is_present`` rescues any discarded
      chunk that contains the substring "main" anywhere in its text.

    Parameters
    ----------
    discard_string_list : list of str, optional
        Substrings that cause a chunk to be discarded when found in its last
        two lines. When ``None`` or empty, a built-in list specific to gadi
        and the ESM1.6 PI standalone setup (May 2025) is used.
    always_print_if_main_is_present : bool, optional
        When True, a chunk that would be discarded is printed anyway if it
        contains "main".

    Returns
    -------
    None
        Selected chunks are written to standard output, each followed by a
        blank line.
    """
    START = re.compile(r"(Conditional jump or move depends on uninitialised value|Use of uninitialised value|Syscall param|Invalid free|Invalid write|Invalid read)")
    STOP = re.compile(r"^==\d+== $")
    GOOD = re.compile(r"\.(f90|c|cpp|cxx|F90):\d+", re.M)

    # These are specific to gadi and to the ESM1.6 PI standalone setup
    # used by the original author (MS) in May 2025. If any of these strings
    # is present in the last two lines of an error chunk, the entire chunk
    # is discarded.
    if not discard_string_list:
        discard_string_list = ["ucp_worker_iface_open", "uct_rc_iface_verbs_init_rx", "PMPI_Init", "MPI_COMM_SPLIT",
                               "mpi_comm_split", "ompi_comm_split_with_info", "opal_hwloc_base_get_topology",
                               "libuct_xpmem.so.0.0.0", "librdmacm.so.1.3.56.0", "libibverbs.so.1.14.56.0",
                               "libmlx5.so.1.25.56.0", "/usr/lib64/libc-2.28.so"]

    def flush_chunk(chunk_lines):
        """Print the chunk held in ``chunk_lines`` if it passes the filters."""
        if not chunk_lines:
            return
        chunk_text = "".join(chunk_lines)
        # Require more than two source-location matches in the chunk.
        if len(GOOD.findall(chunk_text)) <= 2:
            return
        last_two_lines = "".join(chunk_lines[-2:])
        do_print = not any(s in last_two_lines for s in discard_string_list)
        if (not do_print) and always_print_if_main_is_present:
            do_print = "main" in chunk_text
        if do_print:
            # Lines keep their own newline, so print() leaves a blank
            # separator line after each chunk.
            print(chunk_text)

    in_chunk = False
    error_chunk = []
    # The context manager closes the input stream even if an error occurs.
    with fileinput.input() as stream:
        for line in stream:
            if in_chunk:
                in_chunk = not STOP.search(line)
            else:
                in_chunk = bool(START.search(line))

            if in_chunk:
                error_chunk.append(line)
            else:
                flush_chunk(error_chunk)
                error_chunk = []

    # The log may end while a chunk is still open (no trailing "==<pid>== "
    # line); without this final flush that last error would be lost.
    flush_chunk(error_chunk)
# Run the filter with its default settings when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment