# Source code for lammps_step.gpu_memory_parser

"""
gpu_memory_parser.py

Parse nvidia-smi memory monitor logs produced by gpu_bind.sh and summarize
GPU memory usage and utilization, including multi-GPU summaries.

The command for nvidia-smi looks like this::

    # Start background memory monitor, writing peak to a log file
    MEMORY_LOG="${SEAMM_MEMORY_LOG:-./gpu_${GPU_ID}_rank_${LOCAL_RANK}.log}"

    nvidia-smi --query-gpu=timestamp,memory.used,memory.free,utilization.gpu \
               --format=csv -l 5 -i "$GPU_ID" > "$MEMORY_LOG" &
"""

import re
from pathlib import Path
from datetime import datetime


def parse_gpu_memory_log(log_path):
    """
    Parse a gpu_bind.sh memory monitor log file and return summary statistics.

    Initialization phase: samples before GPU utilization first becomes
    non-zero AND memory has stabilized (no longer growing significantly).
    Production phase: all samples after initialization.

    Args:
        log_path: Path to the nvidia-smi log file

    Returns:
        dict with statistics, or None if file cannot be parsed
    """
    log_path = Path(log_path)
    if not log_path.exists():
        return None

    # gpu_bind.sh encodes the GPU index and MPI rank in the filename,
    # e.g. gpu_0_rank_0.log or gpu_memory_rank0.log.
    stem = log_path.stem
    gpu_match = re.search(r"gpu[_]?(\d+)", stem, re.IGNORECASE)
    rank_match = re.search(r"rank[_]?(\d+)", stem, re.IGNORECASE)
    gpu_id = int(gpu_match.group(1)) if gpu_match else None
    rank = int(rank_match.group(1)) if rank_match else None

    records = []
    with open(log_path) as fd:
        for raw in fd:
            raw = raw.strip()
            # Skip blanks, the CSV header line, and trailing text such as
            # the "Done!" marker written when the monitor exits.
            if not raw or raw.startswith("timestamp") or not raw[0].isdigit():
                continue
            fields = [piece.strip() for piece in raw.split(",")]
            try:
                records.append(
                    {
                        "timestamp": datetime.strptime(
                            fields[0], "%Y/%m/%d %H:%M:%S.%f"
                        ),
                        "mem_used": int(fields[1].split()[0]),
                        "mem_free": int(fields[2].split()[0]),
                        "util": int(fields[3].split()[0]),
                    }
                )
            except (ValueError, IndexError):
                # Malformed or truncated line -- ignore it.
                continue

    if not records:
        return None

    # Locate the end of initialization: the first record with non-zero
    # utilization whose memory growth over the previous record is small
    # (< 5%), i.e. allocation has stabilized.
    GROWTH_LIMIT = 0.05  # 5% growth = still initializing
    boundary = None
    for idx, rec in enumerate(records):
        if rec["util"] == 0:
            continue
        if idx > 0:
            before = records[idx - 1]["mem_used"]
            if before > 0 and (rec["mem_used"] - before) / before > GROWTH_LIMIT:
                continue  # memory still ramping up
        boundary = idx
        break

    # Split into the initialization and production phases.
    if boundary is None:
        init_records = records
        prod_records = []
    else:
        init_records = records[:boundary]
        prod_records = records[boundary:]

    # Initialization duration: from the first sample up to the boundary.
    if boundary is not None and len(init_records) >= 1:
        init_duration_s = (
            records[boundary]["timestamp"] - records[0]["timestamp"]
        ).total_seconds()
    elif len(init_records) >= 2:
        init_duration_s = (
            init_records[-1]["timestamp"] - init_records[0]["timestamp"]
        ).total_seconds()
    else:
        init_duration_s = None

    # Total run duration over all parsed samples.
    total_duration_s = (
        (records[-1]["timestamp"] - records[0]["timestamp"]).total_seconds()
        if len(records) >= 2
        else None
    )

    # Production-phase statistics (None when everything was initialization).
    prod_stats = None
    if prod_records:
        utils = [r["util"] for r in prod_records]
        prod_stats = {
            "max_memory_used_mb": max(r["mem_used"] for r in prod_records),
            "min_memory_free_mb": min(r["mem_free"] for r in prod_records),
            "avg_gpu_utilization": sum(utils) / len(utils),
            "max_gpu_utilization": max(utils),
            "n_samples": len(prod_records),
        }

    return {
        "log_path": str(log_path),
        "gpu_id": gpu_id,
        "rank": rank,
        "init_duration_s": init_duration_s,
        "init_n_samples": len(init_records),
        "total_duration_s": total_duration_s,
        "total_n_samples": len(records),
        "production": prod_stats,
        "samples": records,
    }
if __name__ == "__main__":
    import sys
    import glob

    if len(sys.argv) < 2:
        print("Usage: python gpu_memory_parser.py <log_file> [log_file2 ...]")
        print(" python gpu_memory_parser.py 'gpu_*.log'")
        sys.exit(1)

    # Expand shell-style globs ourselves so quoted patterns work on
    # platforms/shells that pass them through unexpanded.
    paths = []
    for pattern in sys.argv[1:]:
        expanded = glob.glob(pattern)
        if not expanded:
            # Keep the literal argument; the parser fails gracefully later.
            paths.append(pattern)
        else:
            paths.extend(expanded)

    # BUG FIX: this branch was guarded by ``len(paths) == 0``, which would
    # raise IndexError on ``paths[0]`` and made the single-file summary
    # unreachable. A single path gets the per-GPU summary; multiple paths
    # get the multi-GPU summary.
    if len(paths) == 1:
        stats = parse_gpu_memory_log(paths[0])
        print_gpu_summary(stats)
    else:
        print_multi_gpu_summary(paths)