# Factor that converts a KiB/s value into each supported output unit.
KIBPS_TO_UNIT = {
    "KiBps": 1.0,
    "MiBps": 1.0 / 1024.0,
    "GiBps": 1.0 / (1024.0 * 1024.0),
}


# Slash-style spelling of each unit key (presumably for axis/legend labels).
UNIT_DISPLAY = {
    "KiBps": "KiB/s",
    "MiBps": "MiB/s",
    "GiBps": "GiB/s",
}


def format_bytes(n) -> str:
    """Format a byte count as a short label using the largest fitting unit.

    ``n`` is an integer byte count, or a string holding either a plain
    integer or a number followed by a K/M/G/T suffix (case-insensitive).
    Suffixed strings are scaled with decimal multipliers (K = 1000,
    M = 1000**2, ...).

    Unit selection is mixed: T and G switch on at powers of 1024 and are
    printed with two decimals, while M and K switch on at powers of 1000
    and are rounded to whole numbers. Anything smaller is printed as
    plain bytes.

    Examples:
        246132K  -> 246M
        2097148K -> 1.95G

    Raises:
        ValueError: if ``n`` is an empty/blank string or is not numeric.
    """
    if isinstance(n, str):
        text = n.strip()
        if not text:
            # Indexing text[-1] on an empty string would raise IndexError;
            # report it as the same error type as any other bad number.
            raise ValueError("format_bytes: empty size string")

        # Suffixed input is interpreted with decimal-style units.
        decimal_scale = {
            "K": 1_000,
            "M": 1_000_000,
            "G": 1_000_000_000,
            "T": 1_000_000_000_000,
        }
        scale = decimal_scale.get(text[-1].upper())
        if scale is None:
            n = int(text)
        else:
            n = int(float(text[:-1]) * scale)

    n = int(n)

    # Binary thresholds for the two largest units.
    if n >= 1024 ** 4:
        return f"{n / (1024 ** 4):.2f}T"
    if n >= 1024 ** 3:
        return f"{n / (1024 ** 3):.2f}G"

    # Decimal thresholds, whole-number output, for the smaller units.
    if n >= 1_000_000:
        return f"{round(n / 1_000_000)}M"
    if n >= 1_000:
        return f"{round(n / 1_000)}K"

    return f"{n}B"


def experiment_label(path: str) -> str:
    """Derive a short experiment label from a CSV file path.

    ``bw_<label>.csv`` and ``bandwidth_<label>.csv`` yield ``<label>``;
    any other file name yields its stem (name without the last suffix).
    """
    filename = Path(path).name

    if filename.endswith(".csv"):
        # "bw_" is tested first, matching the original precedence.
        for prefix in ("bw_", "bandwidth_"):
            if filename.startswith(prefix):
                return filename[len(prefix) : -len(".csv")]

    return Path(path).stem


def percentile(values, p: float) -> float:
    """Return the ``p``-th percentile of ``values`` as a plain float."""
    samples = np.array(values, dtype=float)
    return float(np.percentile(samples, p))


def read_bw_csv(path: str, unit: str):
    """Read a ``bytes,bw_KiBps`` CSV and group bandwidths by transfer size.

    Args:
        path: CSV file whose header is exactly ``bytes,bw_KiBps``
            (whitespace around the column names is tolerated).
        unit: Key of ``KIBPS_TO_UNIT`` selecting the output unit.

    Returns:
        A ``defaultdict(list)`` mapping each ``bytes`` value (int) to the
        list of bandwidth samples for that size, converted to ``unit``,
        in file order.

    Raises:
        ValueError: if the file is empty, the header is wrong, or a row
            cannot be parsed.
        KeyError: if ``unit`` is not a key of ``KIBPS_TO_UNIT``.
    """
    grouped = defaultdict(list)

    with open(path, newline="") as f:
        reader = csv.DictReader(f)

        if reader.fieldnames is None:
            raise ValueError(f"{path}: empty CSV or missing header")

        fieldnames = [name.strip() for name in reader.fieldnames]
        if fieldnames != ["bytes", "bw_KiBps"]:
            raise ValueError(
                f"{path}: expected header 'bytes,bw_KiBps', got: {','.join(fieldnames)}"
            )

        # The header was validated on stripped names, so rows must be keyed
        # by the stripped names too; otherwise a header like
        # "bytes, bw_KiBps" passes validation and then fails on every row.
        reader.fieldnames = fieldnames

        # Loop-invariant; also rejects an unknown unit even with no rows.
        factor = KIBPS_TO_UNIT[unit]

        for line_no, row in enumerate(reader, start=2):
            try:
                nbytes = int(row["bytes"].strip())
                bw_kibps = float(row["bw_KiBps"].strip())
            except (AttributeError, TypeError, ValueError) as e:
                # AttributeError covers short rows, where the missing
                # column is filled with None.
                raise ValueError(
                    f"{path}: invalid row at line {line_no}: {row} ({e})"
                ) from e

            grouped[nbytes].append(bw_kibps * factor)

    return grouped