# Source code for zea.data.convert.camus

"""Convert the CAMUS dataset to the zea format.

.. note::

   Requires SimpleITK: ``pip install SimpleITK``.

CAMUS (Cardiac Acquisitions for Multi-structure Ultrasound Segmentation) is a
public dataset containing 2-D echocardiographic sequences from 500 patients.
Sequences are stored in NIfTI (``.nii.gz``) format and include both 2-chamber
(2CH) and 4-chamber (4CH) apical views.

Dataset splits:

* **Train** - patients 1-400
* **Validation** - patients 401-450
* **Test** - patients 451-500

.. admonition:: License

   CC BY-NC-SA 4.0 - https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode

   The CAMUS dataset is available free of charge strictly for non-commercial
   scientific research purposes only.

.. admonition:: Reference

   S\\. Leclerc, E. Smistad, J. Pedrosa, A. Ostvik, F. Cervenansky, F. Espinosa,
   T. Espeland, E. A. R. Berg, P.-M. Jodoin, T. Grenier, C. Lartizien,
   J. D'hooge, L. Lovstakken and O. Bernard.
   *Deep Learning for Segmentation Using an Open Large-Scale Dataset in
   2D Echocardiography.*
   IEEE Transactions on Medical Imaging, vol. 38, no. 9, pp. 2198-2210, 2019.
   `DOI: 10.1109/TMI.2019.2900516 <https://doi.org/10.1109/TMI.2019.2900516>`_

.. rubric:: Links

* `Original dataset <https://humanheart-project.creatis.insa-lyon.fr/database/#collection/6373703d73e9f0047faa1bc8>`_
* `Dataset on Hugging Face <https://huggingface.co/datasets/zeahub/camus>`_

.. rubric:: Usage


.. code-block:: console

   python -m zea.data.convert camus ./raw ./output --download

For testing purposes, you can also convert a reduced dataset containing only 6 half-sequence files:

.. code-block:: console

    python -m zea.data.convert camus ./raw ./output --download --reduced-dataset

"""

from __future__ import annotations

from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import numpy as np
from tqdm import tqdm

from zea import log
from zea.beamform.pixelgrid import polar_pixel_grid
from zea.data.convert.utils import (
    check_output_dir_ownership,
    download_from_girder,
    require_output_dir_ownership,
    sitk_load,
    unzip,
    upload_dataset_to_hf,
    write_dataset_card,
)
from zea.data.file import File
from zea.func.tensor import translate

# Girder collection ID for the CAMUS dataset; ``download_camus`` passes it to
# ``download_from_girder`` as ``collection_id``.
_CAMUS_COLLECTION_ID = "6373703d73e9f0047faa1bc8"

# Segmentation label names — index 0 is the explicit 'unannotated' label.
# Frames that were not annotated will have only channel 0 set to True.
# The order matches the last axis of the segmentation array built in
# ``process_camus``: ground-truth value k (k = 1..3) is written to channel k.
CAMUS_SEG_LABELS = np.array(["unannotated", "LV_endo", "LV_myo", "LA"], dtype=np.str_)

# ---------------------------------------------------------------------------
# Citation / license constants
# ---------------------------------------------------------------------------

# Plain-text citation; stored by ``process_camus`` under the ``credit`` key of
# the file metadata and embedded in ``CAMUS_DESCRIPTION`` below.
CAMUS_CITATION = (
    "S. Leclerc, E. Smistad, J. Pedrosa, A. Ostvik, F. Cervenansky, F. Espinosa, "
    "T. Espeland, E. A. R. Berg, P.-M. Jodoin, T. Grenier, C. Lartizien, "
    "J. D'hooge, L. Lovstakken and O. Bernard. "
    '"Deep Learning for Segmentation Using an Open Large-Scale Dataset in '
    '2D Echocardiography." '
    "IEEE Transactions on Medical Imaging, vol. 38, no. 9, pp. 2198-2210, 2019. "
    "https://doi.org/10.1109/TMI.2019.2900516"
)

# License name and URL; inserted into the "License" section of the dataset card.
CAMUS_LICENSE = "CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode)"

# Free-text description passed as ``description`` to ``File.create`` for every
# converted file.
CAMUS_DESCRIPTION = (
    "CAMUS (Cardiac Acquisitions for Multi-structure Ultrasound Segmentation) "
    "2D echocardiographic dataset converted to zea format. "
    f"License: {CAMUS_LICENSE}. "
    f"Citation: {CAMUS_CITATION}"
)

# ---------------------------------------------------------------------------
# HuggingFace Hub
# ---------------------------------------------------------------------------


# Default HF repo for full dataset (also the default ``repo_id`` of ``upload_camus``)
_CAMUS_HF_REPO_ID = "zeahub/camus"
# HF repo for reduced/sample dataset
_CAMUS_SAMPLE_HF_REPO_ID = "zeahub/camus-sample"

# Hardcoded list of sample files for --reduced-dataset
# Each entry is ``<split>/<patient>/<file>.hdf5`` relative to the output root.
# ``convert_camus`` splits an entry on "/" and derives the source file name by
# replacing ``.hdf5`` with ``.nii.gz``.
_CAMUS_SAMPLE_FILES = [
    "train/patient0101/patient0101_2CH_half_sequence.hdf5",
    "train/patient0101/patient0101_4CH_half_sequence.hdf5",
    "val/patient0401/patient0401_2CH_half_sequence.hdf5",
    "val/patient0401/patient0401_4CH_half_sequence.hdf5",
    "test/patient0451/patient0451_2CH_half_sequence.hdf5",
    "test/patient0451/patient0451_4CH_half_sequence.hdf5",
]


def _parse_cfg(cfg_path: Path) -> dict:
    """Read a CAMUS ``Info_*.cfg`` file and return its fields as a dict.

    The file is expected to hold one ``Key: value`` pair per line. Only the
    first colon on a line separates key from value, so a value may itself
    contain colons. Lines without any colon are skipped without warning.

    Args:
        cfg_path: Path to the cfg file.

    Returns:
        Dictionary mapping stripped field names to stripped raw string values.
        If a field name occurs more than once, the last occurrence wins.
    """
    pairs = (
        entry.partition(":")
        for entry in cfg_path.read_text().splitlines()
        if ":" in entry
    )
    return {name.strip(): raw.strip() for name, _, raw in pairs}


def process_camus(source_path, output_path, overwrite=False):
    """Convert one CAMUS NIfTI half-sequence into the zea HDF5 format.

    Stores the scan-converted B-mode sequence (``data/image``), per-pixel
    Cartesian coordinates derived from the NIfTI voxel spacing, a polar
    resampling of every frame (``data/image_polar``), the full segmentation
    sequence (``data/segmentation``) with an explicit ``"unannotated"`` label
    channel for frames that lack manual annotations, and rich clinical
    metadata parsed from the accompanying ``Info_*.cfg`` file.

    All frames of the polar sequence are sampled on one shared grid — the grid
    that ``_build_polar_image`` derives from the first frame — so the single
    ``image_polar/coordinates`` array is valid for every frame.

    Args:
        source_path (str, pathlike): Path to a ``*_half_sequence.nii.gz`` file.
        output_path (str, pathlike): Destination HDF5 file path.
        overwrite (bool, optional): Overwrite existing output file. Defaults to False.

    Raises:
        KeyError: If the cfg file lacks the ``ED``, ``ES`` or ``NbFrame`` field.
    """
    from scipy.ndimage import map_coordinates

    source_path = Path(source_path)
    output_path = Path(output_path)

    if output_path.exists():
        if overwrite:
            output_path.unlink()
        else:
            log.warning("Output file %s already exists. Skipping.", log.yellow(output_path))
            return

    # ---- derive patient / view from filename --------------------------------
    # source_path.name e.g. patient0001_2CH_half_sequence.nii.gz
    stem = source_path.name.removesuffix(".nii.gz")  # patient0001_2CH_half_sequence
    parts = stem.split("_")  # [patient0001, 2CH, half, sequence]
    patient_name = parts[0]  # patient0001
    view = parts[1]  # 2CH | 4CH
    patient_dir = source_path.parent

    # ---- parse clinical metadata -------------------------------------------
    cfg = _parse_cfg(patient_dir / f"Info_{view}.cfg")

    # ED / ES are 1-indexed in the cfg file
    ed_idx = int(cfg["ED"]) - 1
    es_idx = int(cfg["ES"]) - 1
    n_frames = int(cfg["NbFrame"])
    sex = cfg.get("Sex", "").lower()  # "f" | "m"
    age = int(cfg.get("Age", 0))
    image_quality = cfg.get("ImageQuality", "")
    ef = cfg.get("EF", "")
    frame_rate = cfg.get("FrameRate", "")

    # ---- load image sequence ------------------------------------------------
    image_seq, meta = sitk_load(source_path)  # (n_frames, H, W), uint8
    image_seq = translate(
        image_seq.astype(np.float32), (0, 255), (-60, 0)
    )  # convert to dB, float32

    # ---- build pixel coordinates -------------------------------------------
    # sitk GetSpacing() order: (x_lateral, y_depth, z_frame) in mm
    spacing = meta["spacing"]  # (lateral_mm, depth_mm, 1.0)
    x_step = float(spacing[0]) / 1000  # metres per column
    z_step = float(spacing[1]) / 1000  # metres per row

    H, W = image_seq.shape[1], image_seq.shape[2]
    # x=0 at apex (centre column), z=0 at transducer surface — matches polar_pixel_grid convention
    cols = (np.arange(W, dtype=np.float32) - W / 2) * x_step
    rows = np.arange(H, dtype=np.float32) * z_step
    xx, zz = np.meshgrid(cols, rows)  # each (H, W)
    coordinates = np.stack([xx, np.zeros_like(xx), zz], axis=-1).astype(
        np.float32
    )  # (H, W, 3): [x_lateral, y=0, z_depth]

    # ---- polar image --------------------------------------------------------
    # _build_polar_image infers the sector geometry from the content of the
    # frame it receives, so calling it once per frame could produce a slightly
    # different grid for each frame while only one coordinates array is stored.
    # Build the grid once from the first frame and resample every remaining
    # frame onto that same grid, so the stored coordinates hold for all frames.
    first_polar, polar_coords = _build_polar_image(image_seq[0], x_step, z_step, H, W)

    # Cartesian metres -> fractional pixel indices (inverse of ``cols`` / ``rows`` above)
    polar_col_idx = (polar_coords[:, :, 0] + (W / 2) * x_step) / x_step
    polar_row_idx = polar_coords[:, :, 2] / z_step

    def _resample_to_polar(frame):
        # Same interpolation settings as _build_polar_image: bilinear, with
        # out-of-image samples filled with the frame's own minimum value.
        return map_coordinates(
            frame,
            [polar_row_idx, polar_col_idx],
            order=1,
            mode="constant",
            cval=float(frame.min()),
        ).astype(np.float32)

    polar_seq = np.stack(
        [first_polar] + [_resample_to_polar(image_seq[i]) for i in range(1, n_frames)],
        axis=0,
    )  # (n_frames, n_r, n_theta)

    # ---- load segmentation --------------------------------------------------
    gt_path = patient_dir / f"{patient_name}_{view}_half_sequence_gt.nii.gz"
    gt_seq, _ = sitk_load(gt_path)  # (n_frames, H, W), uint8; labels 0-3

    # Build multi-label bool array with 4 channels:
    #   0 = unannotated (True for frames without manual labels)
    #   1 = LV_endo (label value 1 in the GT)
    #   2 = LV_myo (label value 2)
    #   3 = LA (label value 3)
    seg_values = np.zeros((n_frames, H, W, 4), dtype=np.bool_)
    annotated = np.zeros(n_frames, dtype=np.bool_)
    annotated[ed_idx] = True
    annotated[es_idx] = True

    seg_values[~annotated, :, :, 0] = True  # unannotated channel
    for label_idx, gt_val in enumerate([1, 2, 3], start=1):
        seg_values[annotated, :, :, label_idx] = gt_seq[annotated] == gt_val

    # ---- frame-level labels -------------------------------------------------
    frame_labels = np.array([""] * n_frames, dtype="<U2")
    frame_labels[ed_idx] = "ED"
    frame_labels[es_idx] = "ES"

    # ---- write HDF5 ---------------------------------------------------------
    text_report = f"EF: {ef}%  FrameRate: {frame_rate} fps  ImageQuality: {image_quality}"

    File.create(
        path=output_path,
        data={
            "image": {"values": image_seq, "coordinates": coordinates},
            "image_polar": {"values": polar_seq, "coordinates": polar_coords},
            "segmentation": {
                "values": seg_values,
                "labels": CAMUS_SEG_LABELS,
                "coordinates": coordinates,
            },
        },
        probe={"name": "GE M5S"},
        metadata={
            "subject": {
                "id": patient_name,
                "type": "human",
                "sex": sex,
                "age": np.uint8(min(age, 255)),  # clamp so the value fits in uint8
            },
            "credit": CAMUS_CITATION,
            "text_report": text_report,
            "annotations": {
                "view": np.array([view] * n_frames, dtype=np.str_),
                "label": frame_labels,
                "image_quality": image_quality,
            },
        },
        description=CAMUS_DESCRIPTION,
    )
def _build_polar_image(
    image_sc: np.ndarray,
    x_step: float,
    z_step: float,
    n_r: int,
    n_theta: int,
) -> tuple[np.ndarray, np.ndarray]:
    """Resample a single scan-converted frame onto a polar (depth × angle) grid.

    The sampling grid comes from :func:`~zea.beamform.pixelgrid.polar_pixel_grid`;
    its Cartesian positions are converted back to fractional pixel indices of
    the scan-converted frame, which is then interpolated bilinearly with
    ``scipy.ndimage.map_coordinates``.

    The transducer apex is taken to be at (x=0, z=0), i.e. the top-centre of
    the frame, in line with the x-centred Cartesian coordinates stored for
    ``image``.

    Sector half-angle and radius are estimated from the widest non-background
    row of the frame instead of from the frame dimensions, because CAMUS
    scan-converted frames are wider than the sector fan (their corners are
    background padding). When fewer than two foreground pixels are found in
    that row, the frame dimensions are used as a fallback.

    Args:
        image_sc: Scan-converted frame, shape ``(H, W)``, float32 dB.
        x_step: Lateral pixel spacing in metres.
        z_step: Axial pixel spacing in metres.
        n_r: Number of radial (depth) samples in the output.
        n_theta: Number of angular samples in the output.

    Returns:
        Tuple of:
            - ``polar_values``: ``(n_r, n_theta)`` float32, polar-resampled image.
            - ``polar_coords``: ``(n_r, n_theta, 3)`` float32, Cartesian [x, 0, z]
              positions in metres for each polar pixel (x=0 at apex centre).
    """
    from scipy.ndimage import map_coordinates

    n_rows, n_cols = image_sc.shape

    # Anything more than 0.5 above the frame minimum counts as sector content.
    background = float(image_sc.min())
    in_sector = image_sc > background + 0.5

    # The widest foreground row lies on the arc boundary of the fan (r = R_max),
    # which gives the most accurate estimate of the opening angle.
    widest = int(np.argmax(in_sector.sum(axis=1)))
    sector_cols = np.where(in_sector[widest])[0]

    if sector_cols.size < 2:
        # No usable sector outline: fall back to the full frame extent.
        half_width_m = (n_cols / 2) * x_step
        r_max = n_rows * z_step
        theta_max = float(np.arctan2(half_width_m, r_max))
    else:
        half_width_m = ((sector_cols[-1] - sector_cols[0]) / 2) * x_step
        depth_m = widest * z_step
        theta_max = float(np.arctan2(half_width_m, depth_m))
        r_max = float(np.sqrt(half_width_m**2 + depth_m**2))

    # (n_r, n_theta, 3) Cartesian [x, y, z] with x=0 at the apex
    polar_coords = polar_pixel_grid(
        polar_limits=(-theta_max, theta_max),
        zlims=(0.0, r_max),
        num_radial_pixels=n_r,
        num_polar_pixels=n_theta,
        distance_to_apex=0.0,
    ).astype(np.float32)

    # Metres -> fractional pixel indices: col = (x + W/2 * x_step) / x_step, row = z / z_step
    sample_cols = (polar_coords[:, :, 0] + (n_cols / 2) * x_step) / x_step
    sample_rows = polar_coords[:, :, 2] / z_step

    # Samples that fall outside the frame are filled with the background level.
    polar_values = map_coordinates(
        image_sc,
        [sample_rows, sample_cols],
        order=1,
        mode="constant",
        cval=background,
    ).astype(np.float32)

    return polar_values, polar_coords


# Patient-ID range per split as [first, stop): the upper bound is exclusive.
splits = {"train": [1, 401], "val": [401, 451], "test": [451, 501]}
def get_split(patient_id: int) -> str:
    """Return the name of the dataset split that contains a patient.

    Each entry of the module-level ``splits`` mapping is treated as a half-open
    ID range: the first bound is inclusive, the second exclusive.

    Args:
        patient_id: Integer ID of the patient.

    Returns:
        The split name: "train", "val", or "test".

    Raises:
        ValueError: If the patient_id does not fall into any defined split range.
    """
    for split_name in ("train", "val", "test"):
        first, stop = splits[split_name]
        if first <= patient_id < stop:
            return split_name
    raise ValueError(f"Did not find split for patient: {patient_id}")
def _process_task(task):
    """Run ``process_camus`` for one ``(source, output)`` pair.

    Intended as the unit of work handed to a worker process: the paths travel
    as plain strings, the output's parent directory is created if needed, and
    any exception from the conversion is logged and then re-raised.

    Args:
        task (tuple): ``(source_file_str, output_file_str)``
            - source_file_str: filesystem path to the source CAMUS file as a string.
            - output_file_str: filesystem path for the ZEA output file as a string.
    """
    src_str, dst_str = task
    src, dst = Path(src_str), Path(dst_str)

    # exist_ok=True keeps this safe when several workers create the same folder
    dst.parent.mkdir(parents=True, exist_ok=True)

    try:
        process_camus(src, dst, overwrite=False)
    except Exception:
        log.error("Error processing %s", log.yellow(src))
        raise
def download_camus(  # pragma: no cover
    destination: str | Path, patients: list[int] | None = None
) -> Path:
    """Fetch the CAMUS dataset from the Girder server.

    Thin wrapper around ``download_from_girder`` that fills in the CAMUS
    collection ID, the dataset name and the ``database_nifti`` top folder.

    Args:
        destination: Directory where the dataset will be downloaded.
        patients: List of patient IDs to download (1-500). If None, all patients
            are downloaded.

    Returns:
        Path to the downloaded dataset directory.
    """
    girder_request = {
        "collection_id": _CAMUS_COLLECTION_ID,
        "destination": destination,
        "dataset_name": "CAMUS",
        "patients": patients,
        "top_folder_name": "database_nifti",
    }
    return download_from_girder(**girder_request)
def convert_camus(args):
    """Convert the CAMUS dataset into zea HDF5 files across dataset splits.

    Processes files found under the CAMUS source folder (after unzipping or
    downloading if needed), assigns each patient to a train/val/test split,
    creates matching output paths, and executes per-file conversion tasks either
    serially or in parallel. Afterwards the citation file is copied, a dataset
    card is written and, if requested, the result is uploaded. These final
    steps are identical for the serial and the parallel path, so the reduced
    dataset always gets the sample card and the sample repository.

    Usage::

        python -m zea.data.convert camus <source_folder> <destination_folder>
        python -m zea.data.convert camus <source_folder> <destination_folder> --download

    Args:
        args (argparse.Namespace): An object with attributes:

            - src (str | Path): Path to the CAMUS archive or extracted folder,
              or a directory to download into when ``--download`` is set.
            - dst (str | Path): Root destination folder for ZEA HDF5 outputs;
              split subfolders will be created.
            - download (bool, optional): If True, download the dataset first
              from the Girder server.
            - reduced_dataset (bool, optional): If True, convert only the files
              listed in ``_CAMUS_SAMPLE_FILES`` and target the sample HF repo.
            - no_hyperthreading (bool, optional): If True, run tasks serially
              instead of using a process pool. In this mode a failing task is
              logged and the remaining tasks still run.
            - upload (bool, optional): If True, upload the converted dataset
              after conversion; ``args.revision`` is then required.
            - revision (str): Target Hub branch, only read when ``upload`` is set.

    Raises:
        FileNotFoundError: If the source folder does not exist and ``download``
            is not set.
    """
    camus_source_folder = Path(args.src)
    camus_output_folder = Path(args.dst)

    # Use sample repo if reduced-dataset flag is set
    is_reduced = getattr(args, "reduced_dataset", False)
    hf_repo_id = _CAMUS_SAMPLE_HF_REPO_ID if is_reduced else _CAMUS_HF_REPO_ID

    check_output_dir_ownership(camus_output_folder, hf_repo_id)

    # Optionally download the dataset
    if getattr(args, "download", False):
        camus_source_folder = download_camus(camus_source_folder)
    elif not camus_source_folder.exists():
        raise FileNotFoundError(
            f"Source folder does not exist: {camus_source_folder}. "
            "Use --download to download the CAMUS dataset automatically."
        )
    else:
        # Look for either CAMUS_public.zip or folders database_nifti, database_split
        camus_source_folder = unzip(camus_source_folder, "camus")

    # check if output folders already exist
    for split in splits:
        split_dir = camus_output_folder / split
        if split_dir.exists():
            log.warning(
                "Output folder %s already exists. Existing files will be skipped.",
                log.yellow(split_dir),
            )

    # clone folder structure of source to output using pathlib
    tasks = []
    files = []
    if is_reduced:
        # Only process the hardcoded sample files
        for rel_path in _CAMUS_SAMPLE_FILES:
            split, patient, fname = rel_path.split("/")
            # Raw CAMUS source has no split subdirectory — patient folders sit
            # directly under camus_source_folder (e.g. raw-camus/patient0101/).
            nii_fname = fname.replace(".hdf5", ".nii.gz")
            source_file = camus_source_folder / patient / nii_fname
            output_file = camus_output_folder / split / patient / fname
            output_file.parent.mkdir(parents=True, exist_ok=True)
            tasks.append((str(source_file), str(output_file)))
            files.append(source_file)
    else:
        files = sorted(camus_source_folder.glob("**/*_half_sequence.nii.gz"))
        for source_file in files:
            patient = source_file.name.removesuffix(".nii.gz").split("_")[0]
            patient_id = int(patient.removeprefix("patient"))
            split = get_split(patient_id)

            output_file = camus_output_folder / split / source_file.relative_to(camus_source_folder)
            # Strip both ".gz" and ".nii" before appending the new extension
            output_file = output_file.with_suffix("").with_suffix(".hdf5")
            output_file.parent.mkdir(parents=True, exist_ok=True)

            tasks.append((str(source_file), str(output_file)))

    if not tasks:
        log.info("No files found to process.")
        return

    if getattr(args, "no_hyperthreading", False):
        log.info("no_hyperthreading is True — running tasks serially (no ProcessPoolExecutor)")
        n_failed = 0
        for t in tqdm(tasks, desc="Processing files (serial)"):
            # Deliberately best-effort: one bad file must not stop the rest.
            try:
                _process_task(t)
            except Exception as e:
                n_failed += 1
                log.error("Task processing failed: %s", e)
        if n_failed:
            log.warning("%d of %d tasks failed; see the errors above.", n_failed, len(tasks))
    else:
        # Submit tasks to the process pool and track progress
        with ProcessPoolExecutor() as exe:
            for _ in tqdm(exe.map(_process_task, tasks), total=len(tasks), desc="Processing files"):
                pass

    log.info(
        "Conversion complete. %d files written to %s",
        len(tasks),
        log.yellow(camus_output_folder),
    )
    _copy_license(files, camus_output_folder)

    # Write special dataset card if reduced
    if is_reduced:
        write_dataset_card(camus_output_folder, _make_camus_sample_dataset_card())
    else:
        write_dataset_card(camus_output_folder, _CAMUS_DATASET_CARD)

    if getattr(args, "upload", False):
        upload_camus(camus_output_folder, revision=args.revision, repo_id=hf_repo_id)
def _copy_license(files: list[Path], output_folder: Path) -> None:
    """Copy ``MANDATORY_CITATION.md`` into *output_folder*.

    The parent directories of *files* are checked in order and the first
    ``MANDATORY_CITATION.md`` found is copied. A warning is logged when none
    of the directories contains the file.
    """
    import shutil

    citation_name = "MANDATORY_CITATION.md"
    candidates = (source.parent / citation_name for source in files)
    found = next((path for path in candidates if path.exists()), None)

    if found is None:
        log.warning("MANDATORY_CITATION.md not found in any patient directory.")
        return

    shutil.copy2(found, output_folder / citation_name)
    log.info("Copied %s to %s", found.name, log.yellow(output_folder))
def upload_camus(  # pragma: no cover
    output_folder: str | Path, revision: str, repo_id: str = _CAMUS_HF_REPO_ID
) -> None:
    """Push the converted CAMUS dataset to a revision branch on the HuggingFace Hub.

    Only for zea maintainers with push access to the repository. Upload to
    ``main`` is blocked; merge the revision branch into ``main`` manually
    after verifying the upload.

    Args:
        output_folder: Root folder containing the train/val/test splits.
        revision: Target branch name on the Hub (must not be ``"main"``).
        repo_id: Hub repository to upload to. Defaults to the full-dataset
            repository.
    """
    # Ownership is checked against the same repo that is about to receive the upload.
    require_output_dir_ownership(output_folder, repo_id)

    message = f"Upload CAMUS dataset (zea format) to {revision}"
    upload_dataset_to_hf(
        folder=output_folder,
        repo_id=repo_id,
        revision=revision,
        commit_message=message,
    )
# Dataset card (README) for the full dataset. ``_make_camus_sample_dataset_card``
# derives the sample card from this text via exact-substring replacement, so
# the anchors it searches for must stay in sync with the wording below.
_CAMUS_DATASET_CARD = (
    """\
---
license: cc-by-nc-sa-4.0
zea_repo_id: zeahub/camus
task_categories:
  - image-segmentation
tags:
  - ultrasound
  - echocardiography
  - 2d
  - cardiac
  - medical
pretty_name: "CAMUS: Cardiac Acquisitions for Multi-structure Ultrasound Segmentation"
size_categories:
  - 1K<n<10K
---

# CAMUS - 2-D Echocardiographic Ultrasound Dataset

This dataset is a **zea-format** (HDF5) conversion of the
[CAMUS](https://humanheart-project.creatis.insa-lyon.fr/database/#collection/6373703d73e9f0047faa1bc8)
dataset for multi-structure segmentation in 2-D echocardiography.

| Property | Value |
|---|---|
| **Modality** | 2-D transthoracic echocardiography |
| **Patients** | 500 |
| **Views** | 2-chamber (2CH) and 4-chamber (4CH) apical |
| **Splits** | train (1-400), val (401-450), test (451-500) |

## Conversion

This dataset was downloaded, converted to zea format, and uploaded using the
[zea](https://github.com/tue-bmd/zea) data converter:

```bash
python -m zea.data.convert camus <src> <dst> --download
```

## Dataset structure

```
train/
  patient0001/
    patient0001_2CH_half_sequence.hdf5
    patient0001_4CH_half_sequence.hdf5
  ...
val/
  patient0401/
    ...
test/
  patient0451/
    ...
```

Each HDF5 file follows the [zea data format](https://github.com/tue-bmd/zea) and contains:

- `data/image/values` — scan-converted B-mode sequence in dB, shape `(n_frames, H, W)`,
  float32; x=0 at apex centre
- `data/image/coordinates` — per-pixel Cartesian positions in metres, shape `(H, W, 3)`
  [x, y=0, z]
- `data/image_polar/values` — polar-resampled B-mode sequence,
  shape `(n_frames, n_r, n_theta)`, float32
- `data/image_polar/coordinates` — Cartesian [x, 0, z] positions of polar pixels in metres,
  shape `(n_r, n_theta, 3)`
- `data/segmentation/values` — multi-label bool segmentation, shape `(n_frames, H, W, 4)`
- `data/segmentation/labels` — `["unannotated", "LV_endo", "LV_myo", "LA"]`;
  unannotated frames have only the first channel set
- `data/segmentation/coordinates` — same grid as `image/coordinates`
- `metadata/subject` — patient ID, sex, age
- `metadata/credit` — full citation string
- `metadata/text_report` — ejection fraction, frame rate, image quality
- `metadata/annotations/view` — `"2CH"` or `"4CH"` repeated for all frames
- `metadata/annotations/label` — `"ED"` / `"ES"` for the corresponding frames,
  `""` otherwise
- `metadata/annotations/image_quality` — `"Good"` / `"Medium"` / `"Poor"`

## License

"""
    + CAMUS_LICENSE
    + """

The CAMUS dataset is available free of charge strictly for
**non-commercial scientific research purposes only**.

## Citation

If you use this dataset, please cite:

```bibtex
@article{leclerc2019deep,
  title   = {Deep Learning for Segmentation Using an Open Large-Scale Dataset
             in 2D Echocardiography},
  author  = {Leclerc, Sarah and Smistad, Erik and Pedrosa, Joao and
             Ostvik, Andreas and Cervenansky, Frederic and Espinosa, Florian and
             Espeland, Torvald and Berg, Erik Andreas Rye and
             Jodoin, Pierre-Marc and Grenier, Thomas and Lartizien, Carole and
             D'hooge, Jan and Lovstakken, Lasse and Bernard, Olivier},
  journal = {IEEE Transactions on Medical Imaging},
  volume  = {38},
  number  = {9},
  pages   = {2198--2210},
  year    = {2019},
  doi     = {10.1109/TMI.2019.2900516}
}
```

## Links

- **Original dataset**: <https://humanheart-project.creatis.insa-lyon.fr/database/#collection/6373703d73e9f0047faa1bc8>
- **zea toolkit**: <https://github.com/tue-bmd/zea>
"""
)


def _make_camus_sample_dataset_card() -> str:
    """Build the dataset card for the reduced sample subset from the full card.

    Derives the sample card from ``_CAMUS_DATASET_CARD`` by updating the YAML
    frontmatter fields that differ, switching the documented conversion command
    to the ``--reduced-dataset`` variant, and inserting a notice that this is a
    sample subset.

    Every substitution is an exact-substring ``str.replace``; an anchor that is
    no longer present in the full card is left unchanged without any error.

    Returns:
        The dataset card string for the sample subset.
    """
    card = _CAMUS_DATASET_CARD
    card = card.replace(
        "zea_repo_id: zeahub/camus",
        "zea_repo_id: zeahub/camus-sample",
    )
    card = card.replace(
        'pretty_name: "CAMUS: Cardiac Acquisitions for Multi-structure Ultrasound Segmentation"',
        'pretty_name: "CAMUS Sample: Cardiac Acquisitions for Multi-structure '
        'Ultrasound Segmentation (Sample)"',
    )
    # "n<1K" is the smallest size bucket the Hub accepts; "n<10" is not a valid value.
    card = card.replace(
        "size_categories:\n  - 1K<n<10K",
        "size_categories:\n  - n<1K",
    )
    card = card.replace(
        "# CAMUS - 2-D Echocardiographic Ultrasound Dataset",
        "# CAMUS Sample - 2-D Echocardiographic Ultrasound Dataset",
    )
    # The sample subset is produced with the extra --reduced-dataset flag.
    card = card.replace(
        "python -m zea.data.convert camus <src> <dst> --download",
        "python -m zea.data.convert camus <src> <dst> --download --reduced-dataset",
    )
    sample_notice = (
        "\n> **This is a sample subset** of the full CAMUS dataset, provided for "
        "demonstration and testing purposes. It contains 6 files (1 patient per split). "
        "For the full dataset (500 patients), see: "
        "[zeahub/camus](https://huggingface.co/datasets/zeahub/camus).\n"
    )
    # Insert the notice between the title and the introductory paragraph.
    card = card.replace(
        "\n\nThis dataset is a **zea-format**",
        sample_notice + "\nThis dataset is a **zea-format**",
    )
    return card