#!/usr/bin/env python3
"""Print a verification report for the organization/happiness study dataset.

Reads ``organization_happiness_study_data.csv`` and prints per-group summary
statistics followed by a few pattern checks on the intervention group:
happiness by number of habits completed, weekday vs. weekend adherence,
adherence across three windows of the study, and the day-to-day happiness
correlation.
"""
import pandas as pd
import numpy as np  # not used below; kept because the original script imports it

# Input file produced by the data generator.
DATA_FILE = 'organization_happiness_study_data.csv'

# The three Yes/No adherence columns that make up the "habit count".
ADHERENCE_COLUMNS = [
    'Calendar_Adherence',
    'Cleanliness_Adherence',
    'Punctuality_Adherence',
]


def _yes_rate(frame: pd.DataFrame, column: str) -> float:
    """Return the fraction of rows in *frame* whose *column* equals 'Yes'."""
    return (frame[column] == 'Yes').mean()


def _habit_count(frame: pd.DataFrame) -> pd.Series:
    """Return, for each row, how many adherence columns equal 'Yes' (0-3)."""
    return frame[ADHERENCE_COLUMNS].eq('Yes').sum(axis=1)


def _print_group_header(group: pd.DataFrame) -> None:
    """Print the participant count and happiness mean/std shared by both groups."""
    print(f"Participants: {group['Participant_ID'].nunique()}")
    print(f"Mean Happiness: {group['Happiness'].mean():.2f}")
    print(f"Happiness Std Dev: {group['Happiness'].std():.2f}")


def _print_natural_patterns(intervention: pd.DataFrame) -> None:
    """Print the habit/happiness pattern checks for the intervention rows.

    *intervention* is not modified; derived columns are added to a new frame.
    """
    print("\n--- Natural Data Patterns ---")

    # `assign` returns a new frame, so the caller's data is never written to
    # (the original assigned columns into a filtered slice of the full frame).
    # NOTE(review): `Day % 7` in {0, 6} is treated as the weekend, which
    # presumes Day 1 falls on a Monday -- confirm against the generator.
    enriched = intervention.assign(
        Habits_Count=_habit_count(intervention),
        DayOfWeek=intervention['Day'] % 7,
    )

    # Happiness broken down by how many habits were completed that day.
    print("Habit completion rates by number completed:")
    for count in [0, 1, 2, 3]:
        subset = enriched[enriched['Habits_Count'] == count]
        happiness = subset['Happiness'].mean()  # NaN when no rows match
        print(f" {count} habits: Happiness = {happiness:.2f} (n={len(subset)})")

    # Weekend effect
    is_weekend = enriched['DayOfWeek'].isin([0, 6])
    weekend = enriched[is_weekend]
    weekday = enriched[~is_weekend]
    print("\nWeekend vs Weekday Adherence:")
    print(f" Weekday avg habits: {weekday['Habits_Count'].mean():.2f}")
    print(f" Weekend avg habits: {weekend['Habits_Count'].mean():.2f}")

    # Habit formation over time
    day = enriched['Day']
    first_week = enriched[day <= 7]
    mid_month = enriched[(day > 14) & (day <= 21)]
    last_week = enriched[day > 23]
    print("\nHabit Formation Over Time:")
    print(f" Days 1-7 (Starting): Avg habits = {first_week['Habits_Count'].mean():.2f}")
    print(f" Days 15-21 (Momentum): Avg habits = {mid_month['Habits_Count'].mean():.2f}")
    print(f" Days 24-30 (Late): Avg habits = {last_week['Habits_Count'].mean():.2f}")

    # Lag-1 correlation: each row is paired with the same participant's
    # previous row in Day order; each participant's first row has no
    # predecessor and is dropped.
    print("\nHappiness Persistence (day-to-day correlation):")
    ordered = enriched.sort_values(['Participant_ID', 'Day'])
    ordered = ordered.assign(
        Happiness_prev=ordered.groupby('Participant_ID')['Happiness'].shift(1)
    )
    valid = ordered[ordered['Happiness_prev'].notna()]
    corr = valid[['Happiness', 'Happiness_prev']].corr().iloc[0, 1]
    print(f" Correlation between today and yesterday's happiness: {corr:.3f}")


def report(df: pd.DataFrame) -> None:
    """Print the full verification report for *df* to standard output.

    *df* must provide the columns ``Group``, ``Participant_ID``, ``Day``,
    ``Happiness`` and the three adherence columns. It is not modified.
    """
    print("=" * 70)
    print("DATA GENERATION IMPROVEMENTS VERIFICATION")
    print("=" * 70)

    print(f"\n✓ Dataset shape: {df.shape}")
    print(f"✓ Total rows: {len(df)} (20 participants × 30 days × 2 groups = 1200 expected)")

    print("\n--- Intervention Group Statistics ---")
    # Explicit copy: downstream code derives columns from this subset and
    # must never write through to (or warn about) the full frame.
    intervention = df[df['Group'] == 'Intervention'].copy()
    _print_group_header(intervention)
    print(f"Calendar Adherence Rate: {_yes_rate(intervention, 'Calendar_Adherence'):.1%}")
    print(f"Cleanliness Adherence Rate: {_yes_rate(intervention, 'Cleanliness_Adherence'):.1%}")
    print(f"Punctuality Adherence Rate: {_yes_rate(intervention, 'Punctuality_Adherence'):.1%}")

    print("\n--- Control Group Statistics ---")
    control = df[df['Group'] == 'Control']
    _print_group_header(control)
    print(f"Reported Calendar: {_yes_rate(control, 'Calendar_Adherence'):.1%} (should be ~0%)")
    print(f"Reported Cleanliness: {_yes_rate(control, 'Cleanliness_Adherence'):.1%} (should be ~0%)")
    print(f"Reported Punctuality: {_yes_rate(control, 'Punctuality_Adherence'):.1%} (should be ~0%)")

    _print_natural_patterns(intervention)

    print("\n✓ Data generation complete with natural patterns!")
    print("\nKey improvements:")
    print(" • Habit momentum: doing it yesterday makes it more likely today")
    print(" • Weekly patterns: lower adherence weekends vs weekdays")
    print(" • Habit formation: initial difficulty, momentum building, slight fatigue")
    print(" • Individual variation: each person has unique habit profiles")
    print(" • Happiness persistence: today's mood influenced by yesterday's")
    print(" • Control group realism: still report 'No' but data shows natural variation")


def main() -> None:
    """Load the study CSV and print the verification report."""
    report(pd.read_csv(DATA_FILE))


if __name__ == "__main__":
    main()