import argparse import os from pathlib import Path import logging import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy import stats import statsmodels.api as sm import statsmodels.formula.api as smf logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') def load_data(path): df = pd.read_csv(path) logging.info("Loaded %d rows from %s", len(df), path) return df def prepare_data(df): # Ensure required columns exist required = {'Participant_ID', 'Happiness', 'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence'} missing = required - set(df.columns) if missing: raise KeyError(f"Missing required columns: {missing}") if 'Group' not in df.columns: df['Group'] = 'Intervention' df['Group'] = df['Group'].astype(str).str.strip().str.title() # Normalize adherence to boolean (Yes/No or True/False) for col in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']: df[col] = df[col].astype(str).str.strip().str.lower().map({'yes': True, 'no': False, 'true': True, 'false': False}) # Count habits per row df['Habits_Count'] = ( df[['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']].fillna(False).astype(int).sum(axis=1) ) # Coerce Happiness to numeric and drop rows without Happiness df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce') before = len(df) df = df.dropna(subset=['Happiness']) logging.info('Dropped %d rows without numeric Happiness', before - len(df)) return df def descriptive_stats(df): print('Dataset shape:', df.shape) print('\nOverall summary:') print(df['Happiness'].describe()) if 'Group' in df.columns: print('\nRows by group:') print(df['Group'].value_counts()) print('\nAverage happiness by group:') print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3)) print('\nAverage happiness by number of habits completed:') print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3)) print('\nMedian happiness by habits:') print(df.groupby('Habits_Count')['Happiness'].median()) # Correlations print('\nPearson correlation between Habits_Count and Happiness:') print(df[['Habits_Count', 'Happiness']].corr().round(3)) print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):') habit_df = df[df['Group'] == 'Intervention'] if 'Group' in df.columns else df for habit in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']: mask = ~habit_df[habit].isna() if mask.sum() == 0: print(f'{habit:22} (no data)') continue r, p = stats.pointbiserialr(habit_df.loc[mask, habit].astype(int), habit_df.loc[mask, 'Happiness']) print(f"{habit:22} r = {r:.3f} p = {p:.4f}") def cohen_d(x, y): # Cohen's d for two independent samples nx, ny = len(x), len(y) dof = nx + ny - 2 pooled_sd = np.sqrt(((nx - 1) * x.std(ddof=1) ** 2 + (ny - 1) * y.std(ddof=1) ** 2) / dof) return (x.mean() - y.mean()) / pooled_sd def run_ols(df): if 'Group' in df.columns: model = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit() print('\nOLS regression: Happiness ~ Habits_Count + Group') else: X = sm.add_constant(df['Habits_Count']) y = df['Happiness'] model = sm.OLS(y, X).fit() print('\nSimple OLS regression: Happiness ~ Habits_Count') print(model.summary()) return model def run_mixedlm(df): # Random intercept for Participant_ID try: md = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID']) mdf = md.fit(reml=False) print('\nMixed-effects model (random intercept by Participant_ID):') print(mdf.summary()) return mdf except Exception as e: logging.warning('MixedLM failed: %s', e) return None def make_plots(df, outdir, show_plots=False): outdir = Path(outdir) outdir.mkdir(parents=True, exist_ok=True) sns.set_theme(style='whitegrid', context='talk') def finish_plot(filename): plt.tight_layout() plt.savefig(outdir / filename, dpi=200, bbox_inches='tight') if show_plots: plt.show() plt.close() # 1) PRIMARY OUTCOME: Mean happiness by group with error bars and value labels if 'Group' in df.columns: plt.figure(figsize=(8, 6)) summary = df.groupby('Group')['Happiness'].agg(['mean', 'std', 'count']).reindex(['Control', 'Intervention']) ci95 = 1.96 * (summary['std'] / np.sqrt(summary['count'])) bars = plt.bar( np.arange(len(summary)), summary['mean'].values, yerr=ci95.values, color=['#A9B2C3', '#4E79A7'], capsize=8, edgecolor='black', linewidth=1.2, alpha=0.9 ) plt.xticks(np.arange(len(summary)), ['Control Group\n(No habits tracked)', 'Intervention Group\n(Daily habits tracked)']) plt.title('Effect of Tracked Organization Habits on Happiness', pad=15, fontsize=14, fontweight='bold') plt.ylabel('Mean Daily Happiness Score (1-10)', fontsize=12) plt.ylim(1, 10) for bar in bars: yval = bar.get_height() plt.text(bar.get_x() + bar.get_width()/2, yval - 0.8, f'{yval:.1f}', ha='center', va='center', color='white', fontweight='bold', fontsize=11) finish_plot('01_primary_outcome_group_comparison.png') # 2) DISTRIBUTIONS: Show overlap and variability in happiness scores if 'Group' in df.columns: plt.figure(figsize=(9, 6)) order = ['Control', 'Intervention'] sns.violinplot( data=df, x='Group', y='Happiness', order=order, inner='quartile', palette={'Control': '#E0E0E0', 'Intervention': '#B3CDE3'}, cut=0 ) sns.stripplot( data=df, x='Group', y='Happiness', order=order, color='black', alpha=0.12, jitter=0.25, size=3 ) plt.title('Distribution of Happiness Reports Over 30 Days', pad=15, fontsize=14, fontweight='bold') plt.xlabel('Study Group', fontsize=12) plt.ylabel('Happiness Score', fontsize=12) plt.ylim(1, 10) finish_plot('02_happiness_distribution_by_group.png') # 3) LONGITUDINAL: Daily happiness trend across 30 days if 'Group' in df.columns and 'Day' in df.columns: plt.figure(figsize=(10, 6)) daily_mean = df.groupby(['Group', 'Day'])['Happiness'].mean().reset_index() sns.lineplot( data=daily_mean, x='Day', y='Happiness', hue='Group', hue_order=['Control', 'Intervention'], palette={'Control': '#7F7F7F', 'Intervention': '#D62728'}, marker='o', linewidth=2.5, markersize=6 ) plt.title('Longitudinal Daily Happiness Throughout the Study', pad=15, fontsize=14, fontweight='bold') plt.xlabel('Day of Study (1-30)', fontsize=12) plt.ylabel('Average Happiness', fontsize=12) plt.ylim(1, 10) plt.xticks(range(1, 31, 2)) plt.legend(title='', frameon=True, facecolor='white', fontsize=10) finish_plot('03_longitudinal_trends.png') # 4) DOSE-RESPONSE: In intervention group, does MORE habits = MORE happiness? intervention_df = df[df['Group'] == 'Intervention'] if 'Group' in df.columns else df plt.figure(figsize=(9, 6)) sns.boxplot( data=intervention_df, x='Habits_Count', y='Happiness', color='#9ECAE1', width=0.6, fliersize=0 ) sns.stripplot( data=intervention_df, x='Habits_Count', y='Happiness', color='#2B5B84', alpha=0.3, jitter=0.2, size=4 ) plt.title('Dose-Response: Happiness by Number of Habits Completed', pad=15, fontsize=14, fontweight='bold') plt.xlabel('Number of Requested Habits Completed That Day\n(Calendar + Clean Room + Punctual)', fontsize=11) plt.ylabel('Happiness Score', fontsize=12) plt.ylim(1, 10) finish_plot('04_habit_dose_response.png') # 5) HABIT COMPLETION RATES: Which habits were easiest to maintain? habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence'] adherence_rates = intervention_df[habit_cols].mean().sort_values(ascending=False).reset_index() adherence_rates.columns = ['Habit', 'Rate'] adherence_rates['Habit'] = adherence_rates['Habit'].str.replace('_Adherence', '', regex=False) plt.figure(figsize=(8, 6)) bars = sns.barplot(data=adherence_rates, x='Habit', y='Rate', color='#E76F51') plt.title('Which Habits Were Easiest to Keep?', pad=15, fontsize=14, fontweight='bold') plt.xlabel('', fontsize=12) plt.ylabel('Percentage of Days Completed', fontsize=12) plt.ylim(0, 1.05) plt.gca().yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1.0)) for bar in bars.patches: plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02, f"{bar.get_height()*100:.0f}%", ha='center', va='bottom', fontweight='bold', fontsize=10) finish_plot('05_habit_completion_rates.png') # 6) INDIVIDUAL VARIATION: Participant-level averages show broad effect if 'Group' in df.columns: plt.figure(figsize=(12, 6)) participant_avg = df.groupby(['Group', 'Participant_ID'])['Happiness'].mean().reset_index() participant_avg = participant_avg.sort_values(['Group', 'Happiness']) participant_avg['Order_Index'] = range(len(participant_avg)) for group, color in zip(['Control', 'Intervention'], ['#BDBDBD', '#4E79A7']): group_data = participant_avg[participant_avg['Group'] == group] plt.bar(group_data['Order_Index'], group_data['Happiness'], color=color, label=group, alpha=0.85, width=0.8) plt.axhline(df[df['Group']=='Control']['Happiness'].mean(), color='#7F7F7F', linestyle='--', linewidth=2, label='Control Mean') plt.axhline(df[df['Group']=='Intervention']['Happiness'].mean(), color='#2B5B84', linestyle='--', linewidth=2, label='Intervention Mean') plt.title('Individual Average Happiness Across Study Participants', pad=15, fontsize=14, fontweight='bold') plt.xlabel('Individual Participants (Sorted by Happiness Level)', fontsize=12) plt.ylabel('Average Happiness Score', fontsize=12) plt.xticks([]) plt.ylim(1, 10) plt.legend(frameon=True, facecolor='white', fontsize=10, loc='upper left') finish_plot('06_individual_participant_avgs.png') logging.info('Saved study plots to %s', outdir) def main(args): df = load_data(args.data) df = prepare_data(df) descriptive_stats(df) # Effect sizes group0 = df[df['Habits_Count'] == 0]['Happiness'] group3 = df[df['Habits_Count'] == 3]['Happiness'] if len(group0) > 1 and len(group3) > 1: d = cohen_d(group3, group0) print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}") if 'Group' in df.columns: control = df[df['Group'] == 'Control']['Happiness'] intervention = df[df['Group'] == 'Intervention']['Happiness'] if len(control) > 1 and len(intervention) > 1: d_group = cohen_d(intervention, control) print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}") # Models run_ols(df) run_mixedlm(df) # Plots make_plots(df, args.outdir, show_plots=args.show) if __name__ == '__main__': parser = argparse.ArgumentParser(description='Improved data analysis for organization_happiness_study_data.csv') parser.add_argument('--data', type=str, default='organization_happiness_study_data.csv', help='CSV data path') parser.add_argument('--outdir', type=str, default='plots', help='Directory to save plots') parser.add_argument('--show', action='store_true', help='Show plots interactively') args = parser.parse_args() main(args)