"""
Dataset discovery module.
This module provides a class-based approach to discovering datasets using
regex patterns. Filtering and exclusion are handled through regex matching,
and discovered folder names are transformed into NeuroBlueprint-compliant
names.
"""
import logging
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Union
@dataclass
class DatasetInfo:
"""Container for dataset information."""
original_name: str
transformed_name: str
    tiff_files: Dict[int, List[str]]  # session ID -> list of file names
    subject_metadata: str
    session_metadata: Dict[int, str]  # session ID -> metadata string
class DatasetDiscoverer:
"""
A class for discovering and organizing datasets with TIFF files.
This class handles both NeuroBlueprint format and custom format datasets,
providing methods to discover datasets, extract metadata, and organize
TIFF files by sessions.
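    Examples
    --------
    A minimal usage sketch; the base path and the resulting dataset names
    are hypothetical:
    >>> discoverer = DatasetDiscoverer("/data/raw", pattern=r"mouse_.*")
    >>> discoverer.discover()  # doctest: +SKIP
    >>> discoverer.transformed_datasets  # doctest: +SKIP
    ['sub-001_id-mouse_01', 'sub-002_id-mouse_02']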
"""
def __init__(
self,
base_path: Union[str, Path],
pattern: str = ".*",
exclude_datasets: Optional[List[str]] = None,
exclude_sessions: Optional[List[str]] = None,
tiff_patterns: Optional[List[str]] = None,
neuroblueprint_format: bool = False,
):
"""
Initialize the dataset discoverer.
Parameters
----------
base_path : str or Path
Base path to search for datasets.
pattern : str, optional
Regex pattern to match dataset names, defaults to ".*"
(all directories). Only used when neuroblueprint_format is False.
exclude_datasets : List[str], optional
List of regex patterns for dataset folder names to exclude.
Applied to both NeuroBlueprint and custom discovery.
exclude_sessions : List[str], optional
List of regex patterns for session folder names to exclude.
Applied when scanning session folders for NeuroBlueprint or
custom session folders.
tiff_patterns : List[str], optional
List of glob patterns for TIFF files. Each pattern corresponds to a
session. Defaults to ["*.tif"] for a single session.
neuroblueprint_format : bool, optional
If True, validates and uses NeuroBlueprint format
(sub-XXX_key-value/ses-XXX_key-value/)
with automatic metadata extraction. If False or if validation
fails, transforms folder names to NeuroBlueprint compliant
format. Defaults to False.
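        Examples
        --------
        A hypothetical configuration with one TIFF pattern per session:
        >>> discoverer = DatasetDiscoverer(
        ...     "/data/raw",
        ...     exclude_datasets=["test_.*"],
        ...     tiff_patterns=["*_ses1.tif", "*_ses2.tif"],
        ... )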
"""
self.base_path = Path(base_path)
self.pattern = pattern
# Patterns to exclude dataset folder names (regex)
self.exclude_datasets = exclude_datasets or []
# Patterns to exclude session folder names (regex)
self.exclude_sessions = exclude_sessions or []
self.tiff_patterns = tiff_patterns or ["*.tif"]
self.neuroblueprint_format = neuroblueprint_format
# Will be populated by discover()
self.datasets: List[DatasetInfo] = []
self._all_tiff_files: List[str] = []
@property
def original_datasets(self) -> List[str]:
"""Get list of original dataset names."""
return [ds.original_name for ds in self.datasets]
@property
def transformed_datasets(self) -> List[str]:
"""Get list of transformed dataset names."""
return [ds.transformed_name for ds in self.datasets]
@property
def tiff_files(self) -> Dict[str, Dict[int, List[str]]]:
"""Get TIFF files organized by original dataset name and session."""
return {ds.original_name: ds.tiff_files for ds in self.datasets}
@property
def tiff_files_flat(self) -> List[str]:
"""Get flat list of all TIFF files."""
return self._all_tiff_files.copy()
@property
def subject_metadata(self) -> Dict[str, str]:
"""Get subject metadata by original dataset name."""
return {ds.original_name: ds.subject_metadata for ds in self.datasets}
@property
def session_metadata(self) -> Dict[str, Dict[int, str]]:
"""Get session metadata by original dataset name and session."""
return {ds.original_name: ds.session_metadata for ds in self.datasets}
def get_session_name(self, dataset_idx: int, session_idx: int) -> str:
"""
Get session name for given dataset and session indices.
Parameters
----------
dataset_idx : int
Index of the dataset in the discovered datasets list
        session_idx : int
            Session ID key within the dataset (for folder-based sessions
            this is the ID parsed from the folder name, not a positional
            index)
Returns
-------
str
            Formatted session name like "ses-001_date-20250225", or
            "ses-003" when the session has no metadata
"""
        if not 0 <= dataset_idx < len(self.datasets):
            raise IndexError(
                f"Dataset index {dataset_idx} out of range "
                f"(0-{len(self.datasets) - 1})"
            )
dataset = self.datasets[dataset_idx]
session_meta = dataset.session_metadata.get(session_idx, "")
# Format: ses-{session_idx}_{metadata}
# session_idx here is actually the original session ID from the
# folder name
if session_meta:
return f"ses-{session_idx:03d}_{session_meta}"
else:
return f"ses-{session_idx:03d}"
@staticmethod
def _extract_session_id_from_folder_name(folder_name: str) -> str:
"""
Extract session ID from neuroblueprint session folder name.
Parameters
----------
folder_name : str
Session folder name like 'ses-003_date-20250301_protocol-baseline'
Returns
-------
str
Session ID like '003'
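        Examples
        --------
        A doctest with a hypothetical folder name:
        >>> DatasetDiscoverer._extract_session_id_from_folder_name(
        ...     "ses-003_date-20250301_protocol-baseline"
        ... )
        '003'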
"""
# Extract the ID part after 'ses-' and
# before first '_' or end of string
match = re.match(r"ses-([^_]+)", folder_name)
if match:
return match.group(1)
return "0" # fallback
@staticmethod
def _infer_metadata_keys_from_folder_names(
folder_names: List[str],
) -> Dict[str, str]:
"""
Automatically infer metadata keys and patterns from actual folder
names.
Parameters
----------
folder_names : list
List of folder names to analyze for metadata patterns
Returns
-------
dict
Dictionary of inferred metadata keys and regex patterns
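        Examples
        --------
        A doctest with a hypothetical folder name:
        >>> DatasetDiscoverer._infer_metadata_keys_from_folder_names(
        ...     ["ses-001_date-20250301_protocol-baseline"]
        ... )
        {'date': 'date-([^_]+)', 'protocol': 'protocol-([^_]+)'}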
"""
metadata_patterns = {}
for folder_name in folder_names:
# Split by underscore and look for key-value patterns
parts = folder_name.split("_")
for part in parts:
# Look for patterns like "key-value"
if "-" in part:
key, value = part.split("-", 1)
# Skip 'sub', 'ses', and 'session' as they are structural,
# not metadata
if key not in ["sub", "ses", "session"]:
# Create a flexible regex pattern for this key
metadata_patterns[key] = f"{key}-([^_]+)"
return metadata_patterns
@staticmethod
def _is_neuroblueprint_format(
folder_name: str, expected_prefix: str
) -> bool:
"""
Check if a folder name follows NeuroBlueprint format.
Parameters
----------
folder_name : str
Name of the folder to check
expected_prefix : str
Expected prefix ("sub" or "ses")
Returns
-------
bool
True if folder follows NeuroBlueprint format, False otherwise
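        Examples
        --------
        Doctests with hypothetical folder names:
        >>> DatasetDiscoverer._is_neuroblueprint_format(
        ...     "sub-001_id-mouseA", "sub"
        ... )
        True
        >>> DatasetDiscoverer._is_neuroblueprint_format("mouseA", "sub")
        False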
"""
# Check if it starts with the expected prefix followed by a dash
if not folder_name.startswith(f"{expected_prefix}-"):
return False
# Split by underscores and check each part
parts = folder_name.split("_")
# First part should be prefix-identifier
first_part = parts[0]
if not re.match(rf"{expected_prefix}-[a-zA-Z0-9]+", first_part):
return False
# Remaining parts should be key-value pairs
for part in parts[1:]:
if not re.match(r"[a-zA-Z][a-zA-Z0-9]*-[a-zA-Z0-9]+", part):
return False
return True
@staticmethod
def _extract_metadata_from_name(
folder_name: str, metadata_extraction: Optional[Dict[str, str]] = None
) -> str:
"""
Extract metadata from folder name and format as key-value pairs.
Parameters
----------
folder_name : str
Name of the folder to extract metadata from
metadata_extraction : dict, optional
Dictionary of metadata keys and regex patterns.
If None, will infer from folder name.
Returns
-------
str
Formatted metadata string like "date-20250225_protocol-training"
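        Examples
        --------
        A doctest with a hypothetical session folder name:
        >>> DatasetDiscoverer._extract_metadata_from_name(
        ...     "ses-003_date-20250301_protocol-baseline"
        ... )
        'date-20250301_protocol-baseline'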
"""
# If no metadata extraction patterns provided, infer from folder name
if not metadata_extraction:
metadata_extraction = (
DatasetDiscoverer._infer_metadata_keys_from_folder_names(
[folder_name]
)
)
if not metadata_extraction:
return ""
metadata_pairs = []
for key, pattern in metadata_extraction.items():
match = re.search(pattern, folder_name)
if match:
value = match.group(1)
metadata_pairs.append(f"{key}-{value}")
return "_".join(metadata_pairs)
def discover(self) -> None:
"""
Discover datasets and their TIFF files in the directory.
This method populates the datasets list and all related metadata.
After calling this method, you can access the results through the
class properties.
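        Examples
        --------
        A hypothetical discovery run:
        >>> discoverer = DatasetDiscoverer("/data/raw")
        >>> discoverer.discover()  # doctest: +SKIP
        >>> discoverer.tiff_files  # doctest: +SKIP
        {'mouse_01': {1: ['image_001.tif', 'image_002.tif']}}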
"""
# Clear any existing data
self.datasets.clear()
self._all_tiff_files.clear()
# Discover and transform dataset names
original_datasets, transformed_datasets = (
self._discover_dataset_folders()
)
# Process each dataset to extract TIFF files and metadata
for orig_name, trans_name in zip(
original_datasets, transformed_datasets
):
dataset_info = self._process_dataset(orig_name, trans_name)
if dataset_info:
self.datasets.append(dataset_info)
def _discover_dataset_folders(self) -> tuple[List[str], List[str]]:
"""
Discover dataset folders and return original and transformed names.
Returns
-------
tuple[List[str], List[str]]
Tuple of (original_datasets, transformed_datasets)
"""
original_datasets: List[str] = []
transformed_datasets: List[str] = []
if self.neuroblueprint_format:
original_datasets, transformed_datasets = (
self._discover_neuroblueprint_datasets()
)
# If no valid NeuroBlueprint folders found,
# fallback to custom format
if not original_datasets:
logging.info(
"No valid NeuroBlueprint format folders found in "
f"{self.base_path}. "
"Falling back to custom format processing."
)
self.neuroblueprint_format = False
if not self.neuroblueprint_format:
original_datasets, transformed_datasets = (
self._discover_custom_datasets()
)
        # Sort by original name so discovery order is deterministic
sorted_indices = sorted(
range(len(original_datasets)), key=lambda i: original_datasets[i]
)
original_datasets = [original_datasets[i] for i in sorted_indices]
transformed_datasets = [
transformed_datasets[i] for i in sorted_indices
]
return original_datasets, transformed_datasets
    def _discover_neuroblueprint_datasets(
        self,
    ) -> tuple[List[str], List[str]]:
"""
Discover datasets in NeuroBlueprint format.
Returns
-------
tuple[List[str], List[str]]
Tuple of (original_datasets, transformed_datasets)
"""
# For NeuroBlueprint format, look for sub-XXX
# directories and validate format
all_sub_folders = [
d.name
for d in self.base_path.iterdir()
if d.is_dir() and d.name.startswith("sub-")
]
candidate_datasets = [
folder
for folder in all_sub_folders
if self._is_neuroblueprint_format(folder, "sub")
]
# Apply dataset exclusions if any
if self.exclude_datasets:
candidate_datasets = [
ds
for ds in candidate_datasets
if not any(re.match(pat, ds) for pat in self.exclude_datasets)
]
# Names are already compliant, so original and
# transformed are identical
return candidate_datasets, candidate_datasets
def _discover_custom_datasets(self) -> tuple[List[str], List[str]]:
"""
Discover datasets in custom format and transform to
NeuroBlueprint compliant names.
Returns
-------
tuple[List[str], List[str]]
Tuple of (original_datasets, transformed_datasets)
"""
# For custom format, use pattern matching
candidate_datasets = [
d.name
for d in self.base_path.iterdir()
if d.is_dir() and re.match(self.pattern, d.name)
]
# Apply dataset exclusions if any
if self.exclude_datasets:
candidate_datasets = [
ds
for ds in candidate_datasets
if not any(re.match(pat, ds) for pat in self.exclude_datasets)
]
original_datasets = candidate_datasets.copy()
# No substitutions: use the discovered dataset names directly
working_datasets = candidate_datasets.copy()
# Transform to NeuroBlueprint compliant format
transformed_datasets = []
for i, ds in enumerate(working_datasets):
# Check if the dataset name already contains key-value pairs
# (indicated by underscore-hyphen patterns like "key-value")
if "_" in ds and "-" in ds:
# Already has key-value structure, append directly
transformed_name = f"sub-{i+1:03d}_{ds}"
else:
# Use counter as subject ID and original name as metadata
# with id- prefix
transformed_name = f"sub-{i+1:03d}_id-{ds}"
transformed_datasets.append(transformed_name)
return original_datasets, transformed_datasets
def _process_dataset(
self, orig_name: str, trans_name: str
) -> Optional[DatasetInfo]:
"""
Process a single dataset to extract TIFF files and metadata.
Parameters
----------
orig_name : str
Original dataset folder name
trans_name : str
Transformed NeuroBlueprint compliant name
Returns
-------
Optional[DatasetInfo]
DatasetInfo object if dataset has TIFF files, None otherwise
"""
dataset_path = self.base_path / orig_name
        # Check that at least one TIFF file matching the configured
        # patterns exists in the dataset; rglob returns a lazy generator,
        # which is always truthy, so it must be consumed explicitly
        has_tiff_files = any(
            next(iter(dataset_path.rglob(pattern)), None) is not None
            for pattern in self.tiff_patterns
        )
if not has_tiff_files:
logging.info(
f"No tiff files found in {dataset_path} "
f"matching patterns {self.tiff_patterns}"
)
return None
# Extract subject metadata
subject_meta, inferred_metadata = self._extract_subject_metadata(
orig_name, dataset_path
)
# Extract TIFF files and session metadata
tiff_files_by_session, session_meta_by_session = (
self._extract_tiff_files_and_metadata(
dataset_path, inferred_metadata
)
)
# Create DatasetInfo object
return DatasetInfo(
original_name=orig_name,
transformed_name=trans_name,
tiff_files=tiff_files_by_session,
subject_metadata=subject_meta,
session_metadata=session_meta_by_session,
)
def _extract_subject_metadata(
self, orig_name: str, dataset_path: Path
) -> tuple[str, Dict[str, str]]:
"""
Extract subject metadata from dataset folder name.
Parameters
----------
orig_name : str
Original dataset folder name
dataset_path : Path
Path to the dataset folder
Returns
-------
tuple[str, Dict[str, str]]
Tuple of (subject_metadata, inferred_metadata_patterns)
"""
if self.neuroblueprint_format:
# Auto-infer metadata patterns from all folder names in dataset
all_folder_names = [orig_name]
if dataset_path.is_dir():
session_folders = [
d.name
for d in dataset_path.iterdir()
if d.is_dir()
and self._is_neuroblueprint_format(d.name, "ses")
and not any(
re.match(pat, d.name) for pat in self.exclude_sessions
)
]
all_folder_names.extend(session_folders)
inferred_metadata = self._infer_metadata_keys_from_folder_names(
all_folder_names
)
subject_meta = self._extract_metadata_from_name(
orig_name, inferred_metadata
)
return subject_meta, inferred_metadata
else:
# For custom format, no metadata extraction
return "", {}
def _extract_tiff_files_and_metadata(
self, dataset_path: Path, inferred_metadata: Dict[str, str]
) -> tuple[Dict[int, List[str]], Dict[int, str]]:
"""
Extract TIFF files and session metadata for a dataset.
Parameters
----------
dataset_path : Path
Path to the dataset folder
inferred_metadata : Dict[str, str]
Inferred metadata patterns for extraction
Returns
-------
tuple[Dict[int, List[str]], Dict[int, str]]
Tuple of (tiff_files_by_session, session_metadata_by_session)
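        Examples
        --------
        A hypothetical return value for a dataset with one session:
        ({1: ["image_001.tif"]}, {1: "date-20250301"})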
"""
tiff_files_by_session: Dict[int, List[str]] = {}
session_meta_by_session: Dict[int, str] = {}
# Always check for NeuroBlueprint session folders first,
# regardless of global neuroblueprint_format setting
has_neuroblueprint_sessions = any(
d.is_dir() and self._is_neuroblueprint_format(d.name, "ses")
for d in dataset_path.iterdir()
)
if has_neuroblueprint_sessions:
# Use NeuroBlueprint session processing
tiff_files_by_session, session_meta_by_session = (
self._extract_neuroblueprint_files(
dataset_path, inferred_metadata
)
)
logging.debug(
f"Found NeuroBlueprint sessions in {dataset_path}, "
f"extracted {len(tiff_files_by_session)} sessions"
)
else:
# Fall back to custom format processing
tiff_files_by_session, session_meta_by_session = (
self._extract_custom_files(dataset_path)
)
            logging.debug(
                f"No NeuroBlueprint sessions in {dataset_path}, "
                "using custom format processing"
            )
return tiff_files_by_session, session_meta_by_session
def _extract_neuroblueprint_files(
self, dataset_path: Path, inferred_metadata: Dict[str, str]
) -> tuple[Dict[int, List[str]], Dict[int, str]]:
"""
Extract TIFF files from NeuroBlueprint format dataset.
Parameters
----------
dataset_path : Path
Path to the dataset folder
inferred_metadata : Dict[str, str]
Inferred metadata patterns for extraction
Returns
-------
tuple[Dict[int, List[str]], Dict[int, str]]
Tuple of (tiff_files_by_session, session_metadata_by_session)
"""
tiff_files_by_session = {}
session_meta_by_session = {}
# For NeuroBlueprint format, look for ses-XXX folders that are valid
session_folders = sorted(
[
d
for d in dataset_path.iterdir()
if d.is_dir()
and self._is_neuroblueprint_format(d.name, "ses")
and not any(
re.match(pat, d.name) for pat in self.exclude_sessions
)
]
)
logging.debug(
f"Found session folders in {dataset_path}: "
f"{[s.name for s in session_folders]}"
)
# Map each session folder to its actual session ID and extract files
for session_folder in session_folders:
# Extract the actual session ID from folder name
session_id = self._extract_session_id_from_folder_name(
session_folder.name
)
# Try each TIFF pattern to find files in this session
for tiff_pattern in self.tiff_patterns:
files_in_session = sorted(
[
f.name
for f in session_folder.rglob(tiff_pattern)
if f.is_file()
]
)
if files_in_session:
# Extract metadata from session folder name
session_meta = self._extract_metadata_from_name(
session_folder.name, inferred_metadata
)
                    # Use the actual session ID as key, not an enumerate
                    # index; non-numeric IDs (which the format check
                    # permits) fall back to a sequential index
                    if session_id.isdigit():
                        session_key = int(session_id)
                    else:
                        session_key = len(tiff_files_by_session) + 1
                    tiff_files_by_session[session_key] = files_in_session
                    session_meta_by_session[session_key] = session_meta
self._all_tiff_files.extend(files_in_session)
logging.debug(
f"Session {session_id} matched "
f"pattern {tiff_pattern} "
f"in {session_folder.name} with "
f"metadata: {session_meta}"
)
break
if not tiff_files_by_session:
logging.info(
f"No files found for patterns {self.tiff_patterns} in "
f"session folders of {dataset_path}"
)
return tiff_files_by_session, session_meta_by_session
def _extract_custom_files(
self, dataset_path: Path
) -> tuple[Dict[int, List[str]], Dict[int, str]]:
"""
Extract TIFF files from custom format dataset.
Parameters
----------
dataset_path : Path
Path to the dataset folder
Returns
-------
tuple[Dict[int, List[str]], Dict[int, str]]
Tuple of (tiff_files_by_session, session_metadata_by_session)
"""
tiff_files_by_session = {}
session_meta_by_session = {}
# Check if this dataset contains NeuroBlueprint session folders
session_folders = [
d
for d in dataset_path.iterdir()
if d.is_dir()
and self._is_neuroblueprint_format(d.name, "ses")
and not any(re.match(pat, d.name) for pat in self.exclude_sessions)
]
if session_folders:
# Hybrid mode: custom subject folder with NeuroBlueprint sessions
logging.info(
f"Detected NeuroBlueprint session folders in custom "
f"dataset {dataset_path.name}: "
f"{[s.name for s in session_folders]}"
)
# Infer metadata patterns from session folder names
inferred_metadata = self._infer_metadata_keys_from_folder_names(
[s.name for s in session_folders]
)
# Process each session folder similar to NeuroBlueprint mode
for session_folder in sorted(session_folders):
session_id = self._extract_session_id_from_folder_name(
session_folder.name
)
# Try each TIFF pattern to find files in this session
for tiff_pattern in self.tiff_patterns:
files_in_session = sorted(
[
f.name
for f in session_folder.rglob(tiff_pattern)
if f.is_file()
]
)
if files_in_session:
# Extract metadata from session folder name
session_meta = self._extract_metadata_from_name(
session_folder.name, inferred_metadata
)
                        # Use the actual session ID as key; non-numeric
                        # IDs fall back to a sequential index
                        if session_id.isdigit():
                            session_key = int(session_id)
                        else:
                            session_key = len(tiff_files_by_session) + 1
                        tiff_files_by_session[session_key] = files_in_session
                        session_meta_by_session[session_key] = session_meta
self._all_tiff_files.extend(files_in_session)
logging.debug(
f"Session {session_id} matched pattern "
f"{tiff_pattern} "
f"in {session_folder.name} with metadata: "
f"{session_meta}"
)
break
else:
# Check for custom session folders (not NeuroBlueprint but still
# organized)
custom_session_folders = [
d
for d in dataset_path.iterdir()
if d.is_dir()
and not any(
re.match(pat, d.name) for pat in self.exclude_sessions
)
]
if custom_session_folders:
# Custom session folders exist - extract metadata from them
logging.info(
f"Detected custom session folders in "
f"dataset {dataset_path.name}: "
f"{[s.name for s in custom_session_folders]}"
)
# Infer metadata patterns from session folder names
inferred_metadata = (
self._infer_metadata_keys_from_folder_names(
[s.name for s in custom_session_folders]
)
)
# Process each session folder
for session_folder in sorted(custom_session_folders):
# Extract session ID from folder name (look for
# session-XXX pattern)
session_id_match = re.search(
r"session-(\d+)", session_folder.name
)
if session_id_match:
custom_session_id = int(session_id_match.group(1))
else:
                        # No session-XXX pattern in the name: number
                        # sessions sequentially in sorted-folder order,
                        # starting from 1
custom_session_id = len(tiff_files_by_session) + 1
# Try each TIFF pattern to find files in this session
for tiff_pattern in self.tiff_patterns:
files_in_session = sorted(
[
f.name
for f in session_folder.rglob(tiff_pattern)
if f.is_file()
]
)
if files_in_session:
# Extract metadata from session folder name
session_meta = self._extract_metadata_from_name(
session_folder.name, inferred_metadata
)
# If no metadata could be inferred from a custom
# session folder (e.g. plain alphanumeric names
# like 'novelEnv07'), preserve the raw folder name
# as id-<name> so downstream code can see the
# original identifier.
if not session_meta:
session_meta = f"id-{session_folder.name}"
tiff_files_by_session[custom_session_id] = (
files_in_session
)
session_meta_by_session[custom_session_id] = (
session_meta
)
self._all_tiff_files.extend(files_in_session)
logging.debug(
f"Session {custom_session_id} matched pattern "
f"{tiff_pattern} in {session_folder.name} "
f"with metadata: {session_meta}"
)
break
else:
# Pure custom format: search directly in dataset folder
for session_idx, tiff_pattern in enumerate(
self.tiff_patterns, start=1
):
files_found = sorted(
[
f.name
for f in dataset_path.rglob(tiff_pattern)
if f.is_file()
]
)
tiff_files_by_session[session_idx] = files_found
if files_found:
self._all_tiff_files.extend(files_found)
# No session metadata for pure custom format
session_meta_by_session[session_idx] = ""
return tiff_files_by_session, session_meta_by_session