Kaplan-Meier Validation

Dataset Info
Published on
2026-01-15
Variables
Data Access
Data is available only upon formal request and subject to approval.
Approved users receive a secure institute account and work with the data exclusively in our Trusted Research Environment (TRE) via remote desktop.
Request data (Email to us)
Reuse & Usage Terms
Data is not downloadable (TRE access only).
Approved users receive a personal institute account.
Tools available: RStudio, Jupyter, Python, Stata, etc.
Data resides in your TRE home directory.
Re-use/publication per Data Use Agreement (DUA).
No redistribution of the data.
Description
For internal purposes we have a Python Shiny dashboard with a Kaplan-Meier survival curve. For validation purposes we want an epidemiologist to reproduce the results.
Available Variables (5)
Event: Transplant
ET Transplant
r_dob
t_date
Transplant outcomes
pat_surv
fu_date
dat_death
Analysis Code
Viewing: v1 Python Single Script
Viewing version: v1 (Python)
Created by mmueller · 2026-01-15 18:00
📄 Script (v1)
import datetime
from api.client import fetch_redcap_data
import pandas as pd
import numpy as np

def _empty_km_result(age_stats=None):
    """Build the neutral result: flat curve at 1.0 with all counts at zero."""
    if age_stats is None:
        age_stats = {'min': None, 'max': None, 'mean': None, 'median': None, 'n': 0}
    return {
        'time': [0],
        'survival_probability': [1.0],
        'events': 0,
        'censored': 0,
        'total': 0,
        'median_survival': None,
        'age_stats': age_stats
    }


def _keep_last_instance(df):
    """Return, per et_nr, the row with the highest redcap_repeat_instance."""
    # Work on a copy so the frame handed in by the caller/API is not mutated.
    df = df.copy()
    # Non-numeric or missing instance numbers are treated as instance 1.
    df['redcap_repeat_instance'] = pd.to_numeric(
        df['redcap_repeat_instance'], errors='coerce'
    ).fillna(1)
    return df.loc[df.groupby('et_nr')['redcap_repeat_instance'].idxmax()]


def _km_curve(survival_times, events):
    """Product-limit estimate; returns (time_points, survival_prob, median)."""
    time_points = [0]
    survival_prob = [1.0]
    n_at_risk = len(survival_times)

    # np.unique already returns the distinct times in ascending order.
    for t in np.unique(survival_times):
        at_t = survival_times == t
        deaths_at_t = int(np.sum(at_t & (events == 1)))

        # The curve only steps down at times with at least one death.
        if deaths_at_t > 0:
            if n_at_risk <= 0:
                print(f"⚠️ Warning: n_at_risk became {n_at_risk} at time {t}, stopping calculation")
                break
            survival_prob.append(float(survival_prob[-1] * (1 - deaths_at_t / n_at_risk)))
            time_points.append(float(t))

        # Deaths and censored patients at t both leave the risk set afterwards.
        n_at_risk -= int(np.sum(at_t))

    # Extend the last step horizontally to the longest observed follow-up.
    max_time = float(np.max(survival_times))
    if time_points[-1] < max_time:
        time_points.append(max_time)
        survival_prob.append(survival_prob[-1])

    # Median survival = first time point at which S(t) drops to 0.5 or below.
    median = next(
        (t for t, prob in zip(time_points, survival_prob) if prob <= 0.5),
        None,
    )
    return time_points, survival_prob, median


def get_kaplan_meier_data(start_year=2020, age_start=None, age_end=None, *,
                          transplant_df=None, outcomes_df=None):
    """
    Calculate a Kaplan-Meier survival curve from REDCap transplant data.
    Uses only the LAST repeat instance per patient (highest
    redcap_repeat_instance per et_nr), separately for the transplant and the
    outcome records, which are then inner-joined on et_nr.

    Parameters:
    -----------
    start_year : int
        Include only transplants from this year onwards (default: 2020)
    age_start : int or None
        Minimum age at transplant (inclusive)
    age_end : int or None
        Maximum age at transplant (inclusive)
    transplant_df : pandas.DataFrame or None (keyword-only)
        Pre-loaded transplant records with columns et_nr, t_date, r_dob.
        When None (default) the records are fetched via fetch_redcap_data.
    outcomes_df : pandas.DataFrame or None (keyword-only)
        Pre-loaded outcome records with columns et_nr, pat_surv, dat_death,
        fu_date. When None (default) they are fetched via fetch_redcap_data.

    Age at transplant is (t_date - r_dob) in days / 365.25, rounded to one
    decimal; age_start / age_end are compared against that rounded value.

    Returns:
    --------
    dict with keys:
        - time: list of time points (days since transplant)
        - survival_probability: list of survival probabilities
        - events: number of death events
        - censored: number of censored patients
        - total: total number of patients in the curve
        - median_survival: first time point with S(t) <= 0.5 (or None if the
          curve never reaches 0.5)
        - age_stats: dict with min/max/mean/median age and n, computed on the
          cohort after the year/age filters but before records with an
          unusable survival status are dropped (so n may exceed total)

    Never raises: on any error the traceback is printed and the neutral
    result (flat curve, zero counts) is returned.
    """
    try:
        print(f"🔍 Fetching Kaplan-Meier data for transplants from {start_year}+")
        if age_start is not None or age_end is not None:
            print(f"🔍 Age filter: {age_start or 0} to {age_end or 100} years")

        # Fetch only what the caller did not supply.
        if transplant_df is None:
            transplant_df = fetch_redcap_data(
                fields=["et_nr", "t_date", "r_dob"],
                forms=["et_transplant"],
                events=["transplant_arm_2"]
            )
        if outcomes_df is None:
            outcomes_df = fetch_redcap_data(
                fields=["et_nr", "pat_surv", "dat_death", "fu_date"],
                forms=["transplant_outcomes"],
                events=["transplant_arm_2"]
            )

        print(f"📊 Fetched {len(transplant_df)} transplant records")
        print(f"📊 Fetched {len(outcomes_df)} outcome records")

        # Filter for last instance per patient.
        # NOTE(review): both frames are reduced independently and then joined on
        # et_nr only, so the last transplant may be paired with an outcome that
        # belongs to a different instance - confirm against the REDCap design.
        if 'redcap_repeat_instance' in transplant_df.columns:
            transplant_df = _keep_last_instance(transplant_df)
            print(f"📊 After filtering to last instance: {len(transplant_df)} transplant records")

        if 'redcap_repeat_instance' in outcomes_df.columns:
            outcomes_df = _keep_last_instance(outcomes_df)
            print(f"📊 After filtering to last instance: {len(outcomes_df)} outcome records")

        # Merge dataframes
        df = transplant_df.merge(outcomes_df, on="et_nr", how="inner", suffixes=('_tx', '_outcome'))
        print(f"📊 Merged data: {len(df)} records")

        # Clean dates: unparseable values become NaT
        for date_col in ("t_date", "dat_death", "fu_date", "r_dob"):
            df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

        # Transplant date and date of birth are both required for the age filter
        df = df.dropna(subset=["t_date", "r_dob"])
        print(f"📊 After removing records without t_date/r_dob: {len(df)} records")

        # Age at transplant in years, one decimal
        df["age_at_tx"] = ((df["t_date"] - df["r_dob"]).dt.days / 365.25).round(1)
        print(f"📊 Age range in data: {df['age_at_tx'].min():.1f} - {df['age_at_tx'].max():.1f} years")

        # Filter by year
        df = df[df["t_date"].dt.year >= start_year]
        print(f"📊 After year filter ({start_year}+): {len(df)} records")

        # Age filter (both bounds inclusive)
        if age_start is not None:
            df = df[df["age_at_tx"] >= age_start]
            print(f"📊 After age_start filter (>={age_start}): {len(df)} records")

        if age_end is not None:
            df = df[df["age_at_tx"] <= age_end]
            print(f"📊 After age_end filter (<={age_end}): {len(df)} records")

        # Validate we still have data
        if df.empty:
            print("⚠️ No patients match the age criteria")
            return _empty_km_result()

        # Calculate age statistics for display
        age_stats = {
            'min': float(df['age_at_tx'].min()),
            'max': float(df['age_at_tx'].max()),
            'mean': float(df['age_at_tx'].mean()),
            'median': float(df['age_at_tx'].median()),
            'n': len(df)
        }

        # assign() returns a new frame, avoiding a write into a filtered slice
        df = df.assign(pat_surv=pd.to_numeric(df["pat_surv"], errors="coerce"))
        df = df[df["pat_surv"].notna()]
        print(f"📊 After filtering for valid pat_surv: {len(df)} records")

        survival_times = []
        events = []

        # pat_surv == 1 is counted as a death event, 0 and 2 as censored.
        # NOTE(review): 2 presumably means "lost to follow-up / unknown" - confirm
        # against the data dictionary.
        for t_date, status, dat_death, fu_date in zip(
            df["t_date"], df["pat_surv"], df["dat_death"], df["fu_date"]
        ):
            if status == 1 and pd.notna(dat_death):
                survival_time = (dat_death - t_date).days
                event = 1
            elif status == 0 and pd.notna(fu_date):
                survival_time = (fu_date - t_date).days
                event = 0
            elif status == 2:
                # Without a follow-up date the patient is censored at day 0
                survival_time = (fu_date - t_date).days if pd.notna(fu_date) else 0
                event = 0
            else:
                # Death without a death date, survivor without follow-up date,
                # or an unknown status code: the record cannot be placed in time
                continue

            # Dates before the transplant are treated as data errors and skipped
            if survival_time >= 0:
                survival_times.append(survival_time)
                events.append(event)

        survival_times = np.array(survival_times)
        events = np.array(events)

        n_patients = len(survival_times)
        n_events = int(np.sum(events))
        n_censored = int(n_patients - n_events)

        print(f"📊 Final cohort: {n_patients} patients ({n_events} events, {n_censored} censored)")
        print(f"📊 Age stats: mean={age_stats['mean']:.1f}, median={age_stats['median']:.1f}")

        if n_patients == 0:
            return _empty_km_result(age_stats)

        if n_events == 0:
            # No deaths: flat curve at 1.0 up to the longest follow-up
            max_time = float(np.max(survival_times))
            return {
                'time': [0, max_time],
                'survival_probability': [1.0, 1.0],
                'events': 0,
                'censored': n_censored,
                'total': n_patients,
                'median_survival': None,
                'age_stats': age_stats
            }

        time_points, survival_prob, median_survival = _km_curve(survival_times, events)

        print(f"✅ Kaplan-Meier curve calculated: {len(time_points)} time points")
        # Compare against None explicitly: a median of 0 days is a valid value
        print(f"📊 Median survival: {median_survival if median_survival is not None else 'Not reached'} days")

        return {
            'time': time_points,
            'survival_probability': survival_prob,
            'events': n_events,
            'censored': n_censored,
            'total': n_patients,
            'median_survival': median_survival,
            'age_stats': age_stats
        }

    except Exception as e:
        # Top-level boundary: callers expect a result dict, never an exception
        print(f"❌ Error calculating Kaplan-Meier curve: {e}")
        import traceback
        traceback.print_exc()
        return _empty_km_result()

def get_kaplan_meier_by_age_groups(start_year=2020):
    """
    Calculate Kaplan-Meier curves for multiple age groups.

    get_kaplan_meier_data is called once per group. Age groups (age at
    transplant, in completed years):
    - 0-15 years (Pediatric)
    - 16-34 years (Young Adult)
    - 35-49 years (Middle Age)
    - 50-64 years (Senior)
    - 65+ years (Elderly)

    A bounded group covers every age from its 'min' up to, but not including,
    the next whole year after its 'max' (e.g. 0-15 covers 0 <= age < 16), so
    fractional ages such as 15.4 are not lost between two groups. The last
    group has no upper limit.

    Parameters:
    -----------
    start_year : int
        Include only transplants from this year onwards (default: 2020)

    Returns:
    --------
    dict with keys:
        'age_groups': list of dicts (groups without patients are omitted),
        each containing:
            - 'label': str (e.g., "0-15 [N=3]")
            - 'age_min': int
            - 'age_max': int (or None for 65+)
            - 'n': int (number of patients)
            - 'time': list of time points
            - 'survival_probability': list
            - 'events': int
            - 'censored': int
            - 'median_survival': median survival in days (or None)
            - 'color': str (hex color for plotting)
        'total_groups': int, number of entries in 'age_groups'

    Never raises: on any error the traceback is printed and an empty result
    ({'age_groups': [], 'total_groups': 0}) is returned.
    """
    try:
        print(f"🔍 Calculating multi-cohort KM for age groups from {start_year}+")

        # Define age groups
        age_groups_config = [
            {'min': 0, 'max': 15, 'label': '0-15', 'color': '#FF6B6B'},      # Red
            {'min': 16, 'max': 34, 'label': '16-34', 'color': '#4ECDC4'},    # Teal
            {'min': 35, 'max': 49, 'label': '35-49', 'color': '#45B7D1'},    # Blue
            {'min': 50, 'max': 64, 'label': '50-64', 'color': '#FFA07A'},    # Orange
            {'min': 65, 'max': None, 'label': '≥65', 'color': '#95E1D3'},    # Light green
        ]

        results = []

        for group in age_groups_config:
            age_min = group['min']
            age_max = group['max']

            if age_max is None:
                # Open-ended group: no upper age limit at all
                age_end = None
            else:
                # The age filter is inclusive, so use the largest float below
                # age_max + 1. This keeps e.g. 15.9 in the 0-15 group while
                # 16.0 still belongs to the next group only.
                age_end = float(np.nextafter(float(age_max + 1), float(age_max)))

            # Call existing KM function for this age group
            km_data = get_kaplan_meier_data(
                start_year=start_year,
                age_start=age_min,
                age_end=age_end
            )

            # Create label with count
            n_patients = km_data['total']
            label = f"{group['label']} [N={n_patients}]"

            # Only include groups with patients
            if n_patients > 0:
                results.append({
                    'label': label,
                    'age_min': age_min,
                    'age_max': age_max,
                    'n': n_patients,
                    'time': km_data['time'],
                    'survival_probability': km_data['survival_probability'],
                    'events': km_data['events'],
                    'censored': km_data['censored'],
                    'median_survival': km_data['median_survival'],
                    'color': group['color']
                })

                print(f"✅ {label}: {km_data['events']} events, {km_data['censored']} censored")
            else:
                print(f"⚠️ {group['label']}: No patients in this age group")

        print(f"✅ Multi-cohort KM calculated for {len(results)} age groups")

        return {
            'age_groups': results,
            'total_groups': len(results)
        }

    except Exception as e:
        # Top-level boundary: callers expect a result dict, never an exception
        print(f"❌ Error calculating multi-cohort KM: {e}")
        import traceback
        traceback.print_exc()
        return {
            'age_groups': [],
            'total_groups': 0
        }
Version Timeline (by language)
PYTHON
Version History (detailed)
Version	Language	Type	Relation	Author	Date
Global v1 (Python v1) selected	Python	Single Script	Initial Implementation	mmueller	2026-01-15
Contact
Marcel Müller
Email
Publisher
Project
Liver Transplant