diff options
Diffstat (limited to 'decompress/analyze_time.py')
| -rwxr-xr-x | decompress/analyze_time.py | 332 |
1 files changed, 332 insertions, 0 deletions
#!/usr/bin/env python3
"""Plot per-size latency CDF graphs from ``times_*.csv`` measurement files.

Every input CSV must have the header ``bytes,time_usec``. Samples are grouped
by byte count; a summary table is printed to stdout and one CDF graph is
written per byte count, with one curve per input file.
"""
# Origin: decompress/analyze_time.py (new file, mode 100755, blob 8f467e4).

import argparse
import csv
import sys
from collections import Counter, defaultdict
from pathlib import Path

import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Only rendering needs matplotlib. Keeping the module importable lets
    # main() report the missing dependency cleanly instead of a traceback.
    plt = None


# Multipliers that convert a microsecond value into the requested unit.
USEC_TO_UNIT = {
    "sec": 1.0 / 1_000_000.0,
    "msec": 1.0 / 1_000.0,
    "usec": 1.0,
    "nsec": 1_000.0,
}


LINESTYLES = ["-", "--", "-.", ":"]
MARKERS = ["o", "s", "^", "D", "v", "P", "X", "*"]

# Suffixes accepted on string input to format_bytes(); decimal (SI) scaling.
_DECIMAL_SUFFIX_SCALE = {
    "K": 1_000,
    "M": 1_000_000,
    "G": 1_000_000_000,
    "T": 1_000_000_000_000,
}


def format_bytes(n) -> str:
    """Format a byte count using the largest readable unit.

    Args:
        n: Byte count as an int (or anything ``int()`` accepts), or a string
            such as ``"4096"`` or ``"246132K"``. A trailing K/M/G/T on a
            string (case-insensitive) is interpreted as a decimal multiplier.

    Returns:
        A short label:
          * ``>= 1024**4`` -> two decimals + ``T`` (binary divisor)
          * ``>= 1024**3`` -> two decimals + ``G`` (binary divisor)
          * ``>= 1_000_000`` -> rounded whole number + ``M`` (decimal divisor)
          * ``>= 1_000`` -> rounded whole number + ``K`` (decimal divisor)
          * otherwise -> ``<n>B``

    Raises:
        ValueError: If a string input is empty or not numeric.

    Examples:
        512        -> 512B
        4096       -> 4K
        "246132K"  -> 246M
        "2097148K" -> 1.95G
        1024 ** 3  -> 1.00G

    NOTE(review): K/M use decimal divisors while G/T use binary ones. This is
    kept as-is because generated file names depend on these labels.
    """
    if isinstance(n, str):
        s = n.strip()
        # Slice rather than index so an empty string falls through to int()
        # and raises ValueError instead of IndexError.
        suffix = s[-1:].upper()

        if suffix in _DECIMAL_SUFFIX_SCALE:
            n = int(float(s[:-1]) * _DECIMAL_SUFFIX_SCALE[suffix])
        else:
            n = int(s)

    n = int(n)

    if n >= 1024 ** 4:
        return f"{n / (1024 ** 4):.2f}T"

    if n >= 1024 ** 3:
        return f"{n / (1024 ** 3):.2f}G"

    if n >= 1_000_000:
        return f"{round(n / 1_000_000)}M"

    if n >= 1_000:
        return f"{round(n / 1_000)}K"

    return f"{n}B"


def experiment_label(path: str) -> str:
    """Derive a legend label from an input file path.

    ``times_<label>.csv`` yields ``<label>``; any other name yields the file
    stem (the name without its final suffix).
    """
    name = Path(path).name

    if name.startswith("times_") and name.endswith(".csv"):
        return name[len("times_") : -len(".csv")]

    return Path(path).stem


def percentile(values, p: float) -> float:
    """Return the ``p``-th percentile (0-100) of ``values`` as a Python float."""
    return float(np.percentile(np.asarray(values, dtype=float), p))


def read_times_csv(path: str, unit: str):
    """Read a ``bytes,time_usec`` CSV and group the samples by byte count.

    Args:
        path: CSV file to read. Header names may carry surrounding whitespace.
        unit: Key of ``USEC_TO_UNIT``; each time is converted to this unit.

    Returns:
        ``defaultdict(list)`` mapping byte count (int) to the list of
        converted times (float), in file order.

    Raises:
        ValueError: If the file is empty, the header is not exactly
            ``bytes,time_usec`` (after stripping), or a row cannot be parsed.
        OSError: If the file cannot be opened.
        KeyError: If ``unit`` is not a key of ``USEC_TO_UNIT``.
    """
    grouped = defaultdict(list)
    scale = USEC_TO_UNIT[unit]

    # utf-8-sig transparently drops a BOM, which would otherwise end up in
    # the first header name and fail the header check.
    with open(path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)

        if reader.fieldnames is None:
            raise ValueError(f"{path}: empty CSV or missing header")

        fieldnames = [name.strip() for name in reader.fieldnames]
        if fieldnames != ["bytes", "time_usec"]:
            raise ValueError(
                f"{path}: expected header 'bytes,time_usec', got: {','.join(fieldnames)}"
            )

        # Make the row keys match the names that were just validated;
        # otherwise a header such as "bytes, time_usec" passes the check
        # above but every row lookup below fails with KeyError.
        reader.fieldnames = fieldnames

        for row in reader:
            try:
                nbytes = int(row["bytes"].strip())
                time_usec = float(row["time_usec"].strip())
            except (AttributeError, KeyError, TypeError, ValueError) as e:
                # AttributeError covers short rows, whose missing cells are None.
                # reader.line_num stays correct even when blank lines are skipped.
                raise ValueError(
                    f"{path}: invalid row at line {reader.line_num}: {row} ({e})"
                ) from e

            grouped[nbytes].append(time_usec * scale)

    return grouped


def add_legend_and_save(fig, ax, out_path: Path, legend_title: str, legend_outside: bool):
    """Attach the legend, lay out the figure, save it to ``out_path`` and close it.

    With ``legend_outside`` the legend is anchored to the right of the axes
    and the layout rectangle is narrowed to leave room for it.
    """
    if legend_outside:
        ax.legend(
            title=legend_title,
            fontsize=8,
            loc="center left",
            bbox_to_anchor=(1.02, 0.5),
        )
        fig.tight_layout(rect=[0, 0, 0.78, 1])
    else:
        ax.legend(title=legend_title, fontsize=8)
        fig.tight_layout()

    fig.savefig(out_path, dpi=200)
    plt.close(fig)


def plot_size_graph(
    nbytes: int,
    series,
    unit: str,
    out_path: Path,
    markers_mode: str,
    line_markers: bool,
    legend_outside: bool,
):
    """Render the CDF graph for one byte count and write it to ``out_path``.

    Args:
        nbytes: Byte count shown in the title (formatted via format_bytes).
        series: Iterable of dicts with keys ``x``, ``y``, ``linestyle``,
            ``marker``, ``file_label``, ``n``, ``p90`` and ``p99``.
        unit: Time unit shown on the x-axis label.
        out_path: Destination image path.
        markers_mode: ``"tail"`` adds P90/P99 markers; anything else adds none.
        line_markers: Whether to draw point markers along each curve.
        legend_outside: Whether to place the legend outside the axes.
    """
    size_label = format_bytes(nbytes)
    fig, ax = plt.subplots(figsize=(10, 6))

    for item in series:
        x = item["x"]
        y = item["y"]

        (line,) = ax.plot(
            x,
            y,
            linestyle=item["linestyle"],
            marker=item["marker"] if line_markers else None,
            markersize=3,
            # Cap the number of drawn markers at roughly 25 per curve.
            markevery=max(1, len(x) // 25),
            linewidth=1.8,
            label=f"{item['file_label']} (n={item['n']})",
        )

        if markers_mode == "tail":
            # scatter() draws from its own colour cycle, so reuse the curve's
            # colour explicitly to keep each marker tied to its line.
            color = line.get_color()
            tail_points = (
                ("P90", item["p90"], 0.90, "x", 70),
                ("P99", item["p99"], 0.99, "*", 100),
            )
            for text, value, level, marker, size in tail_points:
                ax.scatter(
                    [value], [level], marker=marker, s=size, color=color, zorder=5
                )
                ax.annotate(
                    text,
                    xy=(value, level),
                    xytext=(5, 5),
                    textcoords="offset points",
                    fontsize=8,
                )

    ax.set_xlabel(f"Latency ({unit})")
    ax.set_ylabel("CDF")
    ax.set_title(f"DMA Latency CDF - {size_label}")
    ax.grid(True)

    add_legend_and_save(
        fig,
        ax,
        out_path,
        "Experiment",
        legend_outside,
    )


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line argument parser."""
    parser = argparse.ArgumentParser(
        description="Create separate latency CDF graphs per byte count from times_*.csv files."
    )
    parser.add_argument(
        "inputs",
        nargs="+",
        help="Input CSV files with header: bytes,time_usec. Recommended naming: times_<label>.csv",
    )
    parser.add_argument(
        "--unit",
        default="usec",
        choices=sorted(USEC_TO_UNIT.keys()),
        help="Output time unit. Input is always time_usec. Default: usec",
    )
    parser.add_argument(
        "--out",
        default="latency_cdf.png",
        help="Output base filename. Per-size graphs are generated from this name.",
    )
    parser.add_argument(
        "--markers",
        default="none",
        choices=["none", "tail"],
        help="Marker mode. 'none' disables percentile markers; 'tail' shows P90 and P99 markers. Default: none",
    )
    parser.add_argument(
        "--legend-outside",
        action="store_true",
        help="Place legend outside the plot.",
    )
    parser.add_argument(
        "--line-markers",
        action="store_true",
        help="Show point markers on CDF lines. Default: no line markers.",
    )
    return parser


def _output_paths(out_base: Path, sizes):
    """Map each byte count to its graph path, keeping the paths distinct.

    Paths are ``<stem>_<size label><suffix>`` next to ``out_base``. When two
    byte counts share a size label (e.g. 1000 and 1400 are both "1K"), the
    exact byte count is appended so one graph cannot overwrite another.
    """
    stem = out_base.stem
    suffix = out_base.suffix or ".png"
    labels = {nbytes: format_bytes(nbytes) for nbytes in sizes}
    label_counts = Counter(labels.values())

    paths = {}
    for nbytes, label in labels.items():
        if label_counts[label] > 1:
            label = f"{label}_{nbytes}B"
        paths[nbytes] = out_base.with_name(f"{stem}_{label}{suffix}")
    return paths


def main() -> int:
    """Parse arguments, print the latency summary and write one graph per size.

    Returns:
        Process exit status: 0 on success, 1 if matplotlib is missing or an
        input file is unreadable, malformed or has no data rows.
    """
    args = _build_parser().parse_args()

    if plt is None:
        print("ERROR: matplotlib is required to render graphs", file=sys.stderr)
        return 1

    all_data = []

    for path in args.inputs:
        try:
            grouped = read_times_csv(path, args.unit)
        except (OSError, ValueError) as e:
            print(f"ERROR: {e}", file=sys.stderr)
            return 1

        if not grouped:
            print(f"ERROR: {path}: no data rows", file=sys.stderr)
            return 1

        all_data.append(
            {
                "path": path,
                "label": experiment_label(path),
                "grouped": grouped,
            }
        )

    print("Latency summary")
    print("================")
    print(f"input_files: {len(all_data)}")
    print("input_unit: usec")
    print(f"output_unit: {args.unit}")
    print()

    print(
        f"{'file_label':>18} {'bytes':>14} {'size':>8} {'n':>8} "
        f"{'min':>12} {'avg':>12} {'std':>12} "
        f"{'p50':>12} {'p90':>12} {'p99':>12} {'max':>12}"
    )

    series_by_size = defaultdict(list)

    for file_idx, item in enumerate(all_data):
        file_label = item["label"]
        grouped = item["grouped"]
        linestyle = LINESTYLES[file_idx % len(LINESTYLES)]

        for size_idx, nbytes in enumerate(sorted(grouped)):
            times = np.array(grouped[nbytes], dtype=float)
            sorted_times = np.sort(times)
            # Empirical CDF: the i-th smallest sample sits at i / n.
            cdf = np.arange(1, len(sorted_times) + 1) / len(sorted_times)

            min_v = float(np.min(times))
            avg = float(np.mean(times))
            std = float(np.std(times))
            p50 = percentile(times, 50)
            p90 = percentile(times, 90)
            p99 = percentile(times, 99)
            max_v = float(np.max(times))

            size_label = format_bytes(nbytes)

            print(
                f"{file_label:>18} {nbytes:14d} {size_label:>8} {len(times):8d} "
                f"{min_v:12.6f} {avg:12.6f} {std:12.6f} "
                f"{p50:12.6f} {p90:12.6f} {p99:12.6f} {max_v:12.6f}"
            )

            series_by_size[nbytes].append(
                {
                    "file_label": file_label,
                    "nbytes": nbytes,
                    "size_label": size_label,
                    "x": sorted_times,
                    "y": cdf,
                    "n": len(times),
                    "p90": p90,
                    "p99": p99,
                    "linestyle": linestyle,
                    # NOTE(review): the marker follows the size's position
                    # within this file, so curves sharing a graph usually get
                    # the same marker; only the linestyle varies per file.
                    "marker": MARKERS[size_idx % len(MARKERS)],
                }
            )

    sizes = sorted(series_by_size)
    out_paths = _output_paths(Path(args.out), sizes)

    print()

    for nbytes in sizes:
        out_path = out_paths[nbytes]

        plot_size_graph(
            nbytes=nbytes,
            series=series_by_size[nbytes],
            unit=args.unit,
            out_path=out_path,
            markers_mode=args.markers,
            line_markers=args.line_markers,
            legend_outside=args.legend_outside,
        )

        print(f"Wrote CDF graph: {out_path}")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
