Data is available only upon formal request and subject to approval.
Approved users receive a secure institute account and work with the data exclusively in our Trusted Research Environment (TRE) via remote desktop.
Request data (Email to us)
For internal purposes we have a Python Shiny dashboard with a Kaplan-Meier survival curve. For validation purposes we want an epidemiologist to reproduce the results.
r_dob, t_date, pat_surv, fu_date, dat_death
import datetime
from api.client import fetch_redcap_data
import pandas as pd
import numpy as np
def _empty_km_result(age_stats=None):
    """Build the placeholder result returned when no curve can be computed."""
    if age_stats is None:
        age_stats = {'min': None, 'max': None, 'mean': None, 'median': None, 'n': 0}
    return {
        'time': [0],
        'survival_probability': [1.0],
        'events': 0,
        'censored': 0,
        'total': 0,
        'median_survival': None,
        'age_stats': age_stats,
    }


def _keep_last_instance(df, record_label):
    """Keep, per et_nr, only the row with the highest redcap_repeat_instance."""
    if 'redcap_repeat_instance' not in df.columns:
        return df
    # Non-numeric / missing instance numbers are treated as instance 1.
    df['redcap_repeat_instance'] = pd.to_numeric(
        df['redcap_repeat_instance'], errors='coerce'
    ).fillna(1)
    df = df.loc[df.groupby('et_nr')['redcap_repeat_instance'].idxmax()]
    print(f"📊 After filtering to last instance: {len(df)} {record_label} records")
    return df


def _survival_time_and_event(row):
    """Map one merged record to (days since transplant, event flag), or None to skip it."""
    t_date = row["t_date"]
    pat_surv = row["pat_surv"]
    dat_death = row["dat_death"]
    fu_date = row["fu_date"]
    # pat_surv == 1 with a death date is the only combination counted as an event.
    if pat_surv == 1 and pd.notna(dat_death):
        return (dat_death - t_date).days, 1
    # pat_surv == 0 is censored at the follow-up date.
    if pat_surv == 0 and pd.notna(fu_date):
        return (fu_date - t_date).days, 0
    # pat_surv == 2 is censored at follow-up, or at day 0 when no follow-up date exists.
    if pat_surv == 2:
        if pd.notna(fu_date):
            return (fu_date - t_date).days, 0
        return 0, 0
    return None


def _kaplan_meier_steps(survival_times, events):
    """Compute the product-limit step function for non-empty time/event arrays."""
    time_points = [0]
    survival_prob = [1.0]
    n_at_risk = len(survival_times)
    # np.unique returns the distinct times already sorted ascending.
    for t in np.unique(survival_times):
        at_t = survival_times == t
        n_events_at_t = int(np.sum(at_t & (events == 1)))
        n_censored_at_t = int(np.sum(at_t & (events == 0)))
        if n_events_at_t > 0:
            if n_at_risk <= 0:
                print(f"⚠️ Warning: n_at_risk became {n_at_risk} at time {t}, stopping calculation")
                break
            # Subjects censored at t are still at risk for the events at t.
            survival_prob.append(survival_prob[-1] * (1 - n_events_at_t / n_at_risk))
            time_points.append(float(t))
        n_at_risk -= n_events_at_t + n_censored_at_t
    # Extend the last step to the longest observed time so the plot spans all follow-up.
    max_time = float(np.max(survival_times))
    if time_points[-1] < max_time:
        time_points.append(max_time)
        survival_prob.append(survival_prob[-1])
    return time_points, survival_prob


def _median_survival(time_points, survival_prob):
    """Return the first time at which survival is <= 0.5, or None if never reached."""
    for t, prob in zip(time_points, survival_prob):
        if prob <= 0.5:
            return t
    return None


def get_kaplan_meier_data(start_year=2020, age_start=None, age_end=None):
    """
    Calculate a Kaplan-Meier survival curve from REDCap data.

    Transplant and outcome records are fetched separately, each reduced to the
    row with the highest ``redcap_repeat_instance`` per ``et_nr``, and then
    inner-joined on ``et_nr``.

    NOTE(review): the two tables are reduced to their last instance
    independently and joined on ``et_nr`` only, so if the highest instance
    numbers differ, a transplant is paired with the outcome of another
    instance. Records whose resulting survival time is negative are dropped.

    Parameters:
    -----------
    start_year : int
        Include only transplants from this year onwards (default: 2020)
    age_start : int or None
        Minimum age at transplant (inclusive); compared against the age in
        years rounded to one decimal
    age_end : int or None
        Maximum age at transplant (inclusive); compared against the age in
        years rounded to one decimal

    Returns:
    --------
    dict with keys:
        - time: list of time points (days), starting at 0
        - survival_probability: list of survival probabilities
        - events: number of death events
        - censored: number of censored patients
        - total: total number of patients in the curve
        - median_survival: first time (days) at which the survival
          probability is <= 0.5, or None if that level is not reached
        - age_stats: dict with min/max/mean/median age and n; computed after
          the year/age filters but before records without a usable
          ``pat_surv`` or survival time are dropped, so ``n`` can exceed
          ``total``

    Any exception is caught, printed with its traceback, and answered with
    the empty placeholder result (total 0, time [0], probability [1.0]).
    """
    try:
        print(f"🔍 Fetching Kaplan-Meier data for transplants from {start_year}+")
        if age_start is not None or age_end is not None:
            print(f"🔍 Age filter: {age_start or 0} to {age_end or 100} years")

        # Fetch transplant and outcome data
        transplant_df = fetch_redcap_data(
            fields=["et_nr", "t_date", "r_dob"],
            forms=["et_transplant"],
            events=["transplant_arm_2"]
        )
        outcomes_df = fetch_redcap_data(
            fields=["et_nr", "pat_surv", "dat_death", "fu_date"],
            forms=["transplant_outcomes"],
            events=["transplant_arm_2"]
        )
        print(f"📊 Fetched {len(transplant_df)} transplant records")
        print(f"📊 Fetched {len(outcomes_df)} outcome records")

        # Filter for last instance per patient
        transplant_df = _keep_last_instance(transplant_df, "transplant")
        outcomes_df = _keep_last_instance(outcomes_df, "outcome")

        # Merge dataframes
        df = transplant_df.merge(outcomes_df, on="et_nr", how="inner", suffixes=('_tx', '_outcome'))
        print(f"📊 Merged data: {len(df)} records")

        # Clean dates; unparseable values become NaT
        for date_column in ("t_date", "dat_death", "fu_date", "r_dob"):
            df[date_column] = pd.to_datetime(df[date_column], errors="coerce")

        # Transplant date and date of birth are both required for the age calculation
        df = df.dropna(subset=["t_date", "r_dob"])
        print(f"📊 After removing records without t_date/r_dob: {len(df)} records")

        # Age at transplant in years, rounded to one decimal
        df["age_at_tx"] = ((df["t_date"] - df["r_dob"]).dt.days / 365.25).round(1)
        print(f"📊 Age range in data: {df['age_at_tx'].min():.1f} - {df['age_at_tx'].max():.1f} years")

        # Filter by year
        df = df[df["t_date"].dt.year >= start_year]
        print(f"📊 After year filter ({start_year}+): {len(df)} records")

        # Age filter (both bounds inclusive)
        if age_start is not None:
            df = df[df["age_at_tx"] >= age_start]
            print(f"📊 After age_start filter (>={age_start}): {len(df)} records")
        if age_end is not None:
            df = df[df["age_at_tx"] <= age_end]
            print(f"📊 After age_end filter (<={age_end}): {len(df)} records")

        # Validate we still have data
        if df.empty:
            print("⚠️ No patients match the age criteria")
            return _empty_km_result()

        # Age statistics for display (taken before the pat_surv validity filter below)
        age_stats = {
            'min': float(df['age_at_tx'].min()),
            'max': float(df['age_at_tx'].max()),
            'mean': float(df['age_at_tx'].mean()),
            'median': float(df['age_at_tx'].median()),
            'n': len(df)
        }

        df["pat_surv"] = pd.to_numeric(df["pat_surv"], errors="coerce")
        df = df[df["pat_surv"].notna()]
        print(f"📊 After filtering for valid pat_surv: {len(df)} records")

        survival_times = []
        events = []
        for _, row in df.iterrows():
            outcome = _survival_time_and_event(row)
            if outcome is None:
                continue
            survival_time, event = outcome
            # Negative durations (outcome date before transplant) are discarded.
            if survival_time >= 0:
                survival_times.append(survival_time)
                events.append(event)

        survival_times = np.array(survival_times)
        events = np.array(events)
        n_patients = len(survival_times)
        n_events = int(np.sum(events))
        n_censored = int(n_patients - n_events)
        print(f"📊 Final cohort: {n_patients} patients ({n_events} events, {n_censored} censored)")
        print(f"📊 Age stats: mean={age_stats['mean']:.1f}, median={age_stats['median']:.1f}")

        if n_patients == 0:
            return _empty_km_result(age_stats)

        if n_events == 0:
            # Without events the curve is a flat line at 1.0 up to the longest follow-up.
            max_time = float(np.max(survival_times))
            return {
                'time': [0, max_time],
                'survival_probability': [1.0, 1.0],
                'events': 0,
                'censored': n_censored,
                'total': n_patients,
                'median_survival': None,
                'age_stats': age_stats
            }

        # KM calculation
        time_points, survival_prob = _kaplan_meier_steps(survival_times, events)
        median_survival = _median_survival(time_points, survival_prob)

        print(f"✅ Kaplan-Meier curve calculated: {len(time_points)} time points")
        # Compare against None so that a median of 0 days is not reported as "Not reached".
        print(f"📊 Median survival: {median_survival if median_survival is not None else 'Not reached'} days")
        return {
            'time': time_points,
            'survival_probability': survival_prob,
            'events': n_events,
            'censored': n_censored,
            'total': n_patients,
            'median_survival': median_survival,
            'age_stats': age_stats
        }
    except Exception as e:
        # Best-effort boundary for the dashboard: report the failure and return an empty curve.
        print(f"❌ Error calculating Kaplan-Meier curve: {e}")
        import traceback
        traceback.print_exc()
        return _empty_km_result()
def get_kaplan_meier_by_age_groups(start_year=2020):
    """
    Calculate Kaplan-Meier curves for multiple age groups simultaneously.

    Each group is computed by a separate call to ``get_kaplan_meier_data``
    (so the underlying data is fetched once per group).

    Age groups (age in completed years at transplant):
        - 0-15 years
        - 16-34 years
        - 35-49 years
        - 50-64 years
        - 65+ years (evaluated with an upper bound of 100)

    Parameters:
    -----------
    start_year : int
        Include only transplants from this year onwards (default: 2020)

    Returns:
    --------
    dict with keys:
        - 'age_groups': list of dicts, one per group that has at least one
          patient, each containing:
            - 'label': str (e.g., "0-15 [N=3]")
            - 'age_min': int
            - 'age_max': int (or None for 65+)
            - 'n': int (number of patients)
            - 'time': list of time points
            - 'survival_probability': list
            - 'events': int
            - 'censored': int
            - 'median_survival': median survival time or None
            - 'color': str (hex color for plotting)
        - 'total_groups': number of entries in 'age_groups'

    On any exception the error is printed with its traceback and
    ``{'age_groups': [], 'total_groups': 0}`` is returned.
    """
    try:
        print(f"🔍 Calculating multi-cohort KM for age groups from {start_year}+")

        # Define age groups
        age_groups_config = [
            {'min': 0, 'max': 15, 'label': '0-15', 'color': '#FF6B6B'},    # Red
            {'min': 16, 'max': 34, 'label': '16-34', 'color': '#4ECDC4'},  # Teal
            {'min': 35, 'max': 49, 'label': '35-49', 'color': '#45B7D1'},  # Blue
            {'min': 50, 'max': 64, 'label': '50-64', 'color': '#FFA07A'},  # Orange
            {'min': 65, 'max': None, 'label': '≥65', 'color': '#95E1D3'},  # Light green
        ]

        # get_kaplan_meier_data compares inclusive bounds against an age rounded
        # to one decimal. Passing the integer group maximum would therefore drop
        # everyone aged e.g. 15.1-15.9 from every group. Use a bound halfway
        # between the last representable age of this group (x.9) and the next
        # group's minimum so the groups tile the age axis without gaps.
        age_rounding_step = 0.1

        results = []
        for group in age_groups_config:
            age_min = group['min']
            age_max = group['max']

            if age_max is not None:
                inclusive_upper = age_max + 1 - age_rounding_step / 2
            else:
                # Open-ended group: keep the existing cap of 100 years.
                inclusive_upper = 100

            # Call existing KM function for this age group
            km_data = get_kaplan_meier_data(
                start_year=start_year,
                age_start=age_min,
                age_end=inclusive_upper
            )

            # Create label with count
            n_patients = km_data['total']
            label = f"{group['label']} [N={n_patients}]"

            # Only include groups with patients
            if n_patients > 0:
                results.append({
                    'label': label,
                    'age_min': age_min,
                    'age_max': age_max,
                    'n': n_patients,
                    'time': km_data['time'],
                    'survival_probability': km_data['survival_probability'],
                    'events': km_data['events'],
                    'censored': km_data['censored'],
                    'median_survival': km_data['median_survival'],
                    'color': group['color']
                })
                print(f"✅ {label}: {km_data['events']} events, {km_data['censored']} censored")
            else:
                print(f"⚠️ {group['label']}: No patients in this age group")

        print(f"✅ Multi-cohort KM calculated for {len(results)} age groups")
        return {
            'age_groups': results,
            'total_groups': len(results)
        }
    except Exception as e:
        # Best-effort boundary for the dashboard: report the failure and return no groups.
        print(f"❌ Error calculating multi-cohort KM: {e}")
        import traceback
        traceback.print_exc()
        return {
            'age_groups': [],
            'total_groups': 0
        }
| Version | Language | Type | Relation | Author | Date |
|---|---|---|---|---|---|
| Global v1 (Python v1) selected | Python | Single Script | Initial Implementation | mmueller | 2026-01-15 |