"""Generate a synthetic 30-day "organization habits vs. happiness" dataset.

Two groups of participants are simulated:

* ``Intervention`` - participants track three daily habits (calendar use,
  cleanliness, punctuality).  Completing habits raises that day's happiness
  directly and also builds a capped, cumulative "habit strength" bonus.
* ``Control`` - participants do not track habits.  Every adherence column is
  reported as ``'No'`` and happiness is baseline + a tiny untracked-habit
  boost + noise.

Running this file seeds NumPy's global random generator, builds the combined
table, writes it to ``organization_happiness_study_data.csv`` and prints the
first ten rows.
"""

import pandas as pd
import numpy as np

np.random.seed(43)  # ensures you get exactly the same data every time

N_PARTICIPANTS_PER_GROUP = 40
DAYS = list(range(1, 31))

OUTPUT_CSV = 'organization_happiness_study_data.csv'
COLUMNS = [
    'Participant_ID',
    'Group',
    'Day',
    'Calendar_Adherence',
    'Cleanliness_Adherence',
    'Punctuality_Adherence',
    'Happiness',
]

# Change in habit strength, indexed by the number of habits completed that
# day (0..3): missing everything decays it, completing more boosts it more.
_HABIT_STRENGTH_DELTA = (-0.2, 0.15, 0.35, 0.6)

# (weight on natural organization, probability ceiling) for the control
# group's untracked calendar, cleanliness and punctuality habits, in the
# order they are drawn.
_CONTROL_HABITS = ((0.8, 0.4), (0.75, 0.35), (0.85, 0.45))


def clip_yes_prob(prob, ceiling):
    """Return *prob* limited to the closed interval [0.05, *ceiling*].

    The 0.05 floor keeps every habit possible on every day; *ceiling* keeps
    it from ever being certain.
    """
    return min(ceiling, max(0.05, prob))


def _is_weekend(day):
    """Return True for the days this simulation treats as a weekend.

    Day ``d`` counts as Sunday when ``d % 7 == 0`` and as Saturday when
    ``d % 7 == 6``.
    """
    return (day % 7) in (0, 6)


def _draw_yes_no(yes_prob):
    """Draw ``'Yes'`` with probability *yes_prob*, otherwise ``'No'``.

    Uses NumPy's global random state, so results depend on the active seed.
    """
    return np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob])


def generate_intervention_group(start_participant_id=1):
    """Simulate the habit-tracking group.

    Args:
        start_participant_id: ID given to the first participant; later
            participants get consecutive IDs.

    Returns:
        A list with one row per participant per day, each row being
        ``[participant_id, 'Intervention', day, calendar, clean, ontime,
        happiness]`` where the three habit fields are ``'Yes'``/``'No'`` and
        ``happiness`` is an int in 1..10.
    """
    rows = []

    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset

        # Each person has their own persistent organization tendency.
        org_bias = np.random.normal(0.65, 0.18)
        org_bias = np.clip(org_bias, 0.1, 0.95)

        # Personal baselines for each habit (people are naturally
        # better/worse at specific habits).
        calendar_ease = org_bias + np.random.normal(0.05, 0.08)
        clean_ease = org_bias + np.random.normal(-0.02, 0.08)
        ontime_ease = org_bias + np.random.normal(0.02, 0.08)

        # Per-person starting happiness, drawn from normal(mean 4.0, sd 1.0).
        person_happiness_baseline = np.random.normal(4.0, 1.0)
        # Cumulative measure of consistent habit completion.
        habit_strength = 0.0

        # Previous day's habits, used for the momentum bonus below.
        prev_calendar, prev_clean, prev_ontime = 'No', 'No', 'No'

        for day in DAYS:
            # Weekends are harder: adherence is scaled down.
            week_difficulty = 0.75 if _is_weekend(day) else 1.0

            # Habit formation/fatigue: early days harder, then easier,
            # slight decline late in the study.
            if day < 7:
                time_factor = 0.85  # getting started is harder
            elif day < 20:
                time_factor = 1.1  # momentum builds
            else:
                time_factor = 0.98  # slight fatigue

            # Momentum: doing a habit yesterday makes it likelier today.
            calendar_prob = clip_yes_prob(
                calendar_ease * week_difficulty * time_factor
                + (0.15 if prev_calendar == 'Yes' else 0),
                0.95,
            )
            clean_prob = clip_yes_prob(
                clean_ease * week_difficulty * time_factor
                + (0.15 if prev_clean == 'Yes' else 0),
                0.90,
            )
            ontime_prob = clip_yes_prob(
                ontime_ease * week_difficulty * time_factor
                + (0.12 if prev_ontime == 'Yes' else 0),
                0.93,
            )

            # Draw order (calendar, clean, on-time) is part of the seeded
            # random stream; reordering would change the generated data.
            calendar = _draw_yes_no(calendar_prob)
            clean = _draw_yes_no(clean_prob)
            ontime = _draw_yes_no(ontime_prob)

            # Number of habits completed today (0..3).
            adherence_count = int(
                sum(x == 'Yes' for x in (calendar, clean, ontime))
            )

            # Habit strength accumulates with completion, decays when all
            # habits are missed, and is kept within [0, 5].
            habit_strength += _HABIT_STRENGTH_DELTA[adherence_count]
            habit_strength = np.clip(habit_strength, 0, 5)

            study_progress = day / 30.0  # 0.033 to 1.0
            daily_noise = np.random.normal(0, 0.35)

            # Immediate dose-response bonus for today's habits: 0 to 1.8.
            daily_habit_bonus = adherence_count * 0.6

            # Cumulative bonus grows as the study progresses; at most
            # 5 * (0.4 + 0.2) = 3.0 on the final day.
            cumulative_bonus = habit_strength * (0.4 + study_progress * 0.2)

            happiness_value = (
                person_happiness_baseline  # per-person starting point
                + daily_habit_bonus        # today's habits (0-1.8)
                + cumulative_bonus         # accumulated habit strength (0-3.0)
                + daily_noise              # day-to-day variability
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            rows.append([
                participant_id,
                'Intervention',
                day,
                calendar,
                clean,
                ontime,
                happiness,
            ])

            # Carry today's outcomes into tomorrow's momentum bonus.
            prev_calendar, prev_clean, prev_ontime = calendar, clean, ontime

    return rows


def generate_control_group(start_participant_id):
    """Simulate the non-tracking control group.

    Args:
        start_participant_id: ID given to the first participant; later
            participants get consecutive IDs.

    Returns:
        A list with one row per participant per day, each row being
        ``[participant_id, 'Control', day, 'No', 'No', 'No', happiness]``
        where ``happiness`` is an int in 1..10.  The adherence fields are
        always ``'No'`` because this group does not track habits.
    """
    rows = []

    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset

        # Even without tracking, some people are naturally more organized.
        # The mean is lower than the intervention group's org_bias.
        natural_org = np.random.normal(0.3, 0.15)
        natural_org = np.clip(natural_org, 0.05, 0.7)

        # Control happiness is centered around ~5 (mean 5.1, sd 0.9).
        person_happiness_baseline = np.random.normal(5.1, 0.9)

        for day in DAYS:
            # Weekend dip is milder here than in the intervention group.
            week_factor = 0.9 if _is_weekend(day) else 1.0

            # Tiny habituation over time, but weak.
            time_factor = 1.0 + (day / 100) * 0.1

            # Untracked habits happen but are never reported.  Each
            # probability is computed once; the multiplication order is
            # kept fixed so the floating-point values stay reproducible.
            untracked_count = 0
            for weight, ceiling in _CONTROL_HABITS:
                yes_prob = clip_yes_prob(
                    natural_org * weight * week_factor * time_factor,
                    ceiling,
                )
                if _draw_yes_no(yes_prob) == 'Yes':
                    untracked_count += 1

            # Tiny effect since the habits are unaware/untracked.
            subtle_boost = untracked_count * 0.1

            # Day-to-day variability but no systematic growth: without
            # tracking there is no cumulative benefit term.
            daily_noise = np.random.normal(0, 1.0)

            happiness_value = (
                person_happiness_baseline  # control-specific baseline
                + subtle_boost             # minimal benefit from occasional habits
                + daily_noise              # higher variability, no trend
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            rows.append([
                participant_id,
                'Control',
                day,
                'No',  # reported as "No" - not tracking
                'No',  # reported as "No" - not tracking
                'No',  # reported as "No" - not tracking
                happiness,
            ])

    return rows


# Intervention participants are generated first so the seeded random stream
# is consumed in a fixed order; control IDs continue after them.
data = []
data.extend(generate_intervention_group(start_participant_id=1))
data.extend(generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1))

df = pd.DataFrame(data, columns=COLUMNS)

# Save the combined dataset
df.to_csv(OUTPUT_CSV, index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # shows first 10 rows