Source code for photon_mosaic.dataset_discovery
"""
Dataset discovery module.
This module provides functions to discover datasets using regex patterns.
All filtering and transformations are handled through regex substitutions.
"""
import logging
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
[docs]
def discover_datasets(
    base_path: Union[str, Path],
    pattern: str = ".*",
    exclude_patterns: Optional[List[str]] = None,
    substitutions: Optional[List[Dict[str, str]]] = None,
    tiff_patterns: Optional[List[str]] = None,
) -> Tuple[List[str], List[str], Dict[str, Dict[int, List[str]]], List[str]]:
    """
    Discover datasets and their TIFF files in a directory using regex patterns.

    Parameters
    ----------
    base_path : str or Path
        Base path to search for datasets. Only its immediate
        sub-directories are considered as datasets.
    pattern : str, optional
        Regex pattern matched (with ``re.match``, i.e. anchored at the start)
        against directory names, defaults to ".*" (all directories).
    exclude_patterns : List[str], optional
        List of regex patterns (``re.match`` semantics) for datasets to
        exclude.
    substitutions : List[Dict[str, str]], optional
        List of regex substitution pairs to transform dataset names.
        Each dict should have 'pattern' and 'repl' keys for re.sub().
        Substitutions are applied in order, each on the previous result.
    tiff_patterns : List[str], optional
        List of glob patterns for TIFF files. Each pattern corresponds to a
        session. Defaults to ["*.tif"] for a single session.

    Returns
    -------
    Tuple[List[str], List[str], Dict[str, Dict[int, List[str]]], List[str]]
        - List of original dataset names (sorted)
        - List of transformed dataset names (sorted)
        - Dictionary mapping original dataset names to their TIFF file names
          by session (session index as key)
        - List of all TIFF file names found across all datasets

    Raises
    ------
    KeyError
        If a substitution dict lacks the 'pattern' or 'repl' key.

    Notes
    -----
    - Datasets without any "*.tif" entry (searched recursively) are
      excluded from every returned collection
    - Both original and transformed dataset lists are sorted alphabetically;
      they are sorted independently, so positions only line up when the
      substitutions preserve the alphabetical order
    - Sessions are numbered starting from 0 based on the order in
      tiff_patterns
    - Empty sessions (no files found) are included with empty lists
    - Only file names (not paths) are recorded for the TIFF files
    """
    # Resolve the default here rather than in the signature so that no
    # mutable list object is shared between calls.
    if tiff_patterns is None:
        tiff_patterns = ["*.tif"]

    # Path() accepts both str and Path, so no isinstance check is needed.
    base_path_obj = Path(base_path)

    # Compile the regexes once instead of once per directory.
    include_re = re.compile(pattern)
    exclude_res = [re.compile(p) for p in (exclude_patterns or [])]

    # Find all directories matching the pattern and not excluded.
    candidates = []
    for entry in base_path_obj.iterdir():
        if not entry.is_dir() or not include_re.match(entry.name):
            continue
        if any(ex.match(entry.name) for ex in exclude_res):
            continue
        candidates.append(entry.name)
    candidates.sort()

    # Compute the transformed name of every candidate up front and keep it
    # paired with its original name, so that dropping a dataset later always
    # drops the matching transformed name as well.
    transformed_by_original: Dict[str, str] = {}
    for name in candidates:
        new_name = name
        for sub in substitutions or []:
            new_name = re.sub(sub["pattern"], sub["repl"], new_name)
        transformed_by_original[name] = new_name

    # Discover TIFF files for each dataset. The result lists are built fresh
    # (rather than popping from the list being iterated) so no dataset is
    # skipped when a neighbour is dropped.
    original_datasets: List[str] = []
    datasets: List[str] = []
    tiff_files: Dict[str, Dict[int, List[str]]] = {}
    tiff_files_flat: List[str] = []

    for dataset in candidates:
        dataset_path = base_path_obj / dataset

        # check if there is at least one tiff in the dataset
        if not any(dataset_path.rglob("*.tif")):
            logging.info(f"No tiff files found in {dataset_path}")
            continue

        original_datasets.append(dataset)
        datasets.append(transformed_by_original[dataset])

        # Initialize the dataset entry with all sessions
        sessions: Dict[int, List[str]] = {}
        for session, tiff_pattern in enumerate(tiff_patterns):
            logging.debug(
                f"Searching for tiff files in {dataset_path} with pattern "
                f"{tiff_pattern}"
            )
            files_found = sorted(
                f.name
                for f in dataset_path.rglob(tiff_pattern)
                if f.is_file()
            )
            if not files_found:
                logging.info(
                    f"No files found for pattern {tiff_pattern} in "
                    f"{dataset_path}"
                )
            # Empty sessions are kept (as empty lists) so that session
            # indices always match the positions in tiff_patterns.
            sessions[session] = files_found
            tiff_files_flat.extend(files_found)
        tiff_files[dataset] = sessions

    # original_datasets is already sorted (candidates were sorted); the
    # transformed names are sorted independently, as documented.
    datasets.sort()
    return original_datasets, datasets, tiff_files, tiff_files_flat