import pandas as pd
import numpy as np


# Seed NumPy's global random state so every run reproduces the same dataset.
np.random.seed(64)


# Study design: number of participants in each arm, and the study days 1..30.
N_PARTICIPANTS_PER_GROUP = 20
DAYS = [*range(1, 31)]


def clip_yes_prob(prob, ceiling):
    """Bound a 'Yes' probability between a fixed floor of 0.05 and ``ceiling``.

    The floor is applied first and the ceiling last, so if ``ceiling`` is
    below 0.05 the ceiling is what gets returned.
    """
    floored = prob if prob > 0.05 else 0.05
    return floored if floored < ceiling else ceiling


def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit adherence and happiness for the intervention arm.

    One row is produced per participant per day in ``DAYS``. All randomness
    comes from NumPy's global random state, and values are drawn in a fixed
    order, so the output depends on the seed and on what was drawn before
    this call.

    Args:
        start_participant_id: ID given to the first participant; the
            remaining ``N_PARTICIPANTS_PER_GROUP - 1`` follow consecutively.

    Returns:
        A list of rows of the form
        ``[participant_id, 'Intervention', day, calendar, clean, ontime,
        happiness]`` where the three habit entries are 'Yes'/'No' and
        happiness is an integer from 1 to 10.
    """
    # One entry per habit, in draw order (calendar, cleanliness, punctuality):
    # (mean offset from the personal tendency, bonus when the habit was done
    #  the previous day, upper bound on the daily 'Yes' probability)
    habit_specs = (
        (0.05, 0.15, 0.95),
        (-0.02, 0.15, 0.90),
        (0.02, 0.12, 0.93),
    )

    records = []
    for index in range(N_PARTICIPANTS_PER_GROUP):
        pid = start_participant_id + index

        # Persistent per-person organisation tendency, kept within [0.1, 0.95].
        tendency = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)

        # Per-habit ease: the tendency plus a habit-specific random offset.
        eases = [
            tendency + np.random.normal(mean_offset, 0.08)
            for mean_offset, _, _ in habit_specs
        ]

        # Happiness starts at, and is pulled back towards, a personal baseline.
        baseline_mood = np.random.normal(5.5, 1.2)
        mood = baseline_mood

        # Nobody has done any habit before day 1.
        yesterday = ['No', 'No', 'No']

        for day in DAYS:
            # Days with day % 7 in {0, 6} are treated as weekend days, on
            # which adherence is scaled down.
            if day % 7 in (0, 6):
                week_difficulty = 0.75
            else:
                week_difficulty = 1.0

            # Days 1-6 are hardest, days 7-19 easiest, then a slight drop.
            if day >= 20:
                time_factor = 0.98
            elif day >= 7:
                time_factor = 1.1
            else:
                time_factor = 0.85

            # Daily 'Yes' probability per habit, with the momentum bonus added
            # when that same habit was done on the previous day.
            yes_probs = [
                clip_yes_prob(
                    ease * week_difficulty * time_factor
                    + (bonus if done_before == 'Yes' else 0),
                    cap,
                )
                for ease, (_, bonus, cap), done_before
                in zip(eases, habit_specs, yesterday)
            ]
            today = [
                np.random.choice(['Yes', 'No'], p=[p_yes, 1 - p_yes])
                for p_yes in yes_probs
            ]

            # Each habit completed today adds to today's happiness.
            n_done = sum(outcome == 'Yes' for outcome in today)
            if n_done > 0:
                boost = n_done * 1.2
            else:
                boost = 0

            # Blend yesterday's happiness, the baseline, the habit boost and
            # fresh noise, then keep the result on the 1-10 scale.
            noise = np.random.normal(0, 1.3)
            mood = np.clip(
                mood * 0.4 + baseline_mood * 0.4 + boost * 0.9 + noise,
                1, 10
            )

            records.append(
                [pid, 'Intervention', day, *today, int(np.round(mood))]
            )

            yesterday = today

    return records


def generate_control_group(start_participant_id):
    """Simulate daily records for the control (non-tracking) arm.

    Control participants still perform the three habits at a low,
    person-specific rate, but every habit column is written as 'No' because
    they are not tracking. The hidden habits only give a small lift to that
    day's happiness.

    One row is produced per participant per day in ``DAYS``. All randomness
    comes from NumPy's global random state, and values are drawn in a fixed
    order, so the output depends on the seed and on what was drawn before
    this call.

    Args:
        start_participant_id: ID given to the first participant; the
            remaining ``N_PARTICIPANTS_PER_GROUP - 1`` follow consecutively.

    Returns:
        A list of rows of the form
        ``[participant_id, 'Control', day, 'No', 'No', 'No', happiness]``
        where happiness is an integer from 1 to 10.
    """
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset

        # Even without tracking, some people are naturally more organized.
        # The mean (0.3) is lower than the intervention group's tendency.
        natural_org = np.random.normal(0.3, 0.15)
        natural_org = np.clip(natural_org, 0.05, 0.7)

        # Personal happiness baseline, slightly lower than in the
        # intervention group.
        person_happiness_baseline = np.random.normal(5.0, 1.3)
        current_happiness = person_happiness_baseline

        for day in DAYS:
            # Days with day % 7 in {0, 6} are treated as weekend days; the
            # penalty is milder here than in the intervention group.
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9

            # Very weak habituation: grows linearly from 1.001 on day 1.
            time_factor = 1.0 + (day / 100) * 0.1

            # Probability of each habit happening today. Each is computed once
            # and reused for both the 'Yes' and the complementary 'No' weight.
            calendar_prob = clip_yes_prob(
                natural_org * 0.8 * week_factor * time_factor, 0.4
            )
            clean_prob = clip_yes_prob(
                natural_org * 0.75 * week_factor * time_factor, 0.35
            )
            ontime_prob = clip_yes_prob(
                natural_org * 0.85 * week_factor * time_factor, 0.45
            )

            # Untracked habits: they happen but are never reported. The draw
            # order (calendar, clean, on time) must stay fixed to keep the
            # seeded output reproducible.
            calendar_untracked = np.random.choice(
                ['Yes', 'No'], p=[calendar_prob, 1 - calendar_prob]
            )
            clean_untracked = np.random.choice(
                ['Yes', 'No'], p=[clean_prob, 1 - clean_prob]
            )
            ontime_untracked = np.random.choice(
                ['Yes', 'No'], p=[ontime_prob, 1 - ontime_prob]
            )

            # The hidden habits still nudge happiness, but with a smaller
            # per-habit effect than in the intervention group.
            untracked_count = sum(
                x == 'Yes'
                for x in [calendar_untracked, clean_untracked, ontime_untracked]
            )
            subtle_boost = untracked_count * 0.5

            # Happiness is an even blend of yesterday's value and the
            # baseline, plus the small boost and noisier day-to-day variation,
            # kept on the 1-10 scale.
            happiness_noise = np.random.normal(0, 1.6)
            current_happiness = np.clip(
                current_happiness * 0.5 +
                person_happiness_baseline * 0.5 +
                subtle_boost +
                happiness_noise,
                1, 10
            )
            happiness = int(np.round(current_happiness))

            rows.append([
                participant_id,
                'Control',
                day,
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                happiness,
            ])

    return rows


# Assemble the combined dataset. The intervention arm is generated first
# (IDs start at 1) and the control arm second (IDs continue after the last
# intervention participant); this order also fixes how the seeded random
# stream is consumed.
data = generate_intervention_group(start_participant_id=1) + generate_control_group(
    start_participant_id=N_PARTICIPANTS_PER_GROUP + 1
)

# One row per participant per day; column order matches the row layout
# produced by the two generator functions.
df = pd.DataFrame(
    data,
    columns=[
        'Participant_ID', 'Group', 'Day',
        'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence',
        'Happiness',
    ],
)

# Write the dataset to disk (without the index column) and preview it.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows