284 lines
No EOL
12 KiB
Python
284 lines
No EOL
12 KiB
Python
import argparse
|
|
import os
|
|
from pathlib import Path
|
|
import logging
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
from scipy import stats
|
|
import statsmodels.api as sm
|
|
import statsmodels.formula.api as smf
|
|
|
|
|
|
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
|
|
|
|
|
def load_data(path):
|
|
df = pd.read_csv(path)
|
|
logging.info("Loaded %d rows from %s", len(df), path)
|
|
return df
|
|
|
|
|
|
def prepare_data(df):
|
|
# Ensure required columns exist
|
|
required = {'Participant_ID', 'Happiness', 'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence'}
|
|
missing = required - set(df.columns)
|
|
if missing:
|
|
raise KeyError(f"Missing required columns: {missing}")
|
|
|
|
if 'Group' not in df.columns:
|
|
df['Group'] = 'Intervention'
|
|
df['Group'] = df['Group'].astype(str).str.strip().str.title()
|
|
|
|
# Normalize adherence to boolean (Yes/No or True/False)
|
|
for col in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']:
|
|
df[col] = df[col].astype(str).str.strip().str.lower().map({'yes': True, 'no': False, 'true': True, 'false': False})
|
|
|
|
# Count habits per row
|
|
df['Habits_Count'] = (
|
|
df[['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']].fillna(False).astype(int).sum(axis=1)
|
|
)
|
|
|
|
# Coerce Happiness to numeric and drop rows without Happiness
|
|
df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
|
|
before = len(df)
|
|
df = df.dropna(subset=['Happiness'])
|
|
logging.info('Dropped %d rows without numeric Happiness', before - len(df))
|
|
|
|
return df
|
|
|
|
|
|
def descriptive_stats(df):
|
|
print('Dataset shape:', df.shape)
|
|
print('\nOverall summary:')
|
|
print(df['Happiness'].describe())
|
|
|
|
if 'Group' in df.columns:
|
|
print('\nRows by group:')
|
|
print(df['Group'].value_counts())
|
|
|
|
print('\nAverage happiness by group:')
|
|
print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))
|
|
|
|
print('\nAverage happiness by number of habits completed:')
|
|
print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))
|
|
|
|
print('\nMedian happiness by habits:')
|
|
print(df.groupby('Habits_Count')['Happiness'].median())
|
|
|
|
# Correlations
|
|
print('\nPearson correlation between Habits_Count and Happiness:')
|
|
print(df[['Habits_Count', 'Happiness']].corr().round(3))
|
|
|
|
print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
|
|
habit_df = df[df['Group'] == 'Intervention'] if 'Group' in df.columns else df
|
|
for habit in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']:
|
|
mask = ~habit_df[habit].isna()
|
|
if mask.sum() == 0:
|
|
print(f'{habit:22} (no data)')
|
|
continue
|
|
r, p = stats.pointbiserialr(habit_df.loc[mask, habit].astype(int), habit_df.loc[mask, 'Happiness'])
|
|
print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
|
|
|
|
|
|
def cohen_d(x, y):
|
|
# Cohen's d for two independent samples
|
|
nx, ny = len(x), len(y)
|
|
dof = nx + ny - 2
|
|
pooled_sd = np.sqrt(((nx - 1) * x.std(ddof=1) ** 2 + (ny - 1) * y.std(ddof=1) ** 2) / dof)
|
|
return (x.mean() - y.mean()) / pooled_sd
|
|
|
|
|
|
def run_ols(df):
|
|
if 'Group' in df.columns:
|
|
model = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
|
|
print('\nOLS regression: Happiness ~ Habits_Count + Group')
|
|
else:
|
|
X = sm.add_constant(df['Habits_Count'])
|
|
y = df['Happiness']
|
|
model = sm.OLS(y, X).fit()
|
|
print('\nSimple OLS regression: Happiness ~ Habits_Count')
|
|
print(model.summary())
|
|
return model
|
|
|
|
|
|
def run_mixedlm(df):
|
|
# Random intercept for Participant_ID
|
|
try:
|
|
md = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
|
|
mdf = md.fit(reml=False)
|
|
print('\nMixed-effects model (random intercept by Participant_ID):')
|
|
print(mdf.summary())
|
|
return mdf
|
|
except Exception as e:
|
|
logging.warning('MixedLM failed: %s', e)
|
|
return None
|
|
|
|
|
|
def make_plots(df, outdir, show_plots=False):
|
|
outdir = Path(outdir)
|
|
outdir.mkdir(parents=True, exist_ok=True)
|
|
sns.set_theme(style='whitegrid', context='talk')
|
|
|
|
def finish_plot(filename):
|
|
plt.tight_layout()
|
|
plt.savefig(outdir / filename, dpi=200, bbox_inches='tight')
|
|
if show_plots:
|
|
plt.show()
|
|
plt.close()
|
|
|
|
# 1) PRIMARY OUTCOME: Mean happiness by group with error bars and value labels
|
|
if 'Group' in df.columns:
|
|
plt.figure(figsize=(8, 6))
|
|
summary = df.groupby('Group')['Happiness'].agg(['mean', 'std', 'count']).reindex(['Control', 'Intervention'])
|
|
ci95 = 1.96 * (summary['std'] / np.sqrt(summary['count']))
|
|
bars = plt.bar(
|
|
np.arange(len(summary)),
|
|
summary['mean'].values,
|
|
yerr=ci95.values,
|
|
color=['#A9B2C3', '#4E79A7'],
|
|
capsize=8,
|
|
edgecolor='black',
|
|
linewidth=1.2,
|
|
alpha=0.9
|
|
)
|
|
plt.xticks(np.arange(len(summary)), ['Control Group\n(No habits tracked)', 'Intervention Group\n(Daily habits tracked)'])
|
|
plt.title('Effect of Tracked Organization Habits on Happiness', pad=15, fontsize=14, fontweight='bold')
|
|
plt.ylabel('Mean Daily Happiness Score (1-10)', fontsize=12)
|
|
plt.ylim(1, 10)
|
|
for bar in bars:
|
|
yval = bar.get_height()
|
|
plt.text(bar.get_x() + bar.get_width()/2, yval - 0.8, f'{yval:.1f}', ha='center', va='center', color='white', fontweight='bold', fontsize=11)
|
|
finish_plot('01_primary_outcome_group_comparison.png')
|
|
|
|
# 2) DISTRIBUTIONS: Show overlap and variability in happiness scores
|
|
if 'Group' in df.columns:
|
|
plt.figure(figsize=(9, 6))
|
|
order = ['Control', 'Intervention']
|
|
sns.violinplot(
|
|
data=df, x='Group', y='Happiness', order=order,
|
|
inner='quartile', palette={'Control': '#E0E0E0', 'Intervention': '#B3CDE3'}, cut=0
|
|
)
|
|
sns.stripplot(
|
|
data=df, x='Group', y='Happiness', order=order,
|
|
color='black', alpha=0.12, jitter=0.25, size=3
|
|
)
|
|
plt.title('Distribution of Happiness Reports Over 30 Days', pad=15, fontsize=14, fontweight='bold')
|
|
plt.xlabel('Study Group', fontsize=12)
|
|
plt.ylabel('Happiness Score', fontsize=12)
|
|
plt.ylim(1, 10)
|
|
finish_plot('02_happiness_distribution_by_group.png')
|
|
|
|
# 3) LONGITUDINAL: Daily happiness trend across 30 days
|
|
if 'Group' in df.columns and 'Day' in df.columns:
|
|
plt.figure(figsize=(10, 6))
|
|
daily_mean = df.groupby(['Group', 'Day'])['Happiness'].mean().reset_index()
|
|
sns.lineplot(
|
|
data=daily_mean, x='Day', y='Happiness', hue='Group',
|
|
hue_order=['Control', 'Intervention'],
|
|
palette={'Control': '#7F7F7F', 'Intervention': '#D62728'},
|
|
marker='o', linewidth=2.5, markersize=6
|
|
)
|
|
plt.title('Longitudinal Daily Happiness Throughout the Study', pad=15, fontsize=14, fontweight='bold')
|
|
plt.xlabel('Day of Study (1-30)', fontsize=12)
|
|
plt.ylabel('Average Happiness', fontsize=12)
|
|
plt.ylim(1, 10)
|
|
plt.xticks(range(1, 31, 2))
|
|
plt.legend(title='', frameon=True, facecolor='white', fontsize=10)
|
|
finish_plot('03_longitudinal_trends.png')
|
|
|
|
# 4) DOSE-RESPONSE: In intervention group, does MORE habits = MORE happiness?
|
|
intervention_df = df[df['Group'] == 'Intervention'] if 'Group' in df.columns else df
|
|
plt.figure(figsize=(9, 6))
|
|
sns.boxplot(
|
|
data=intervention_df, x='Habits_Count', y='Happiness',
|
|
color='#9ECAE1', width=0.6, fliersize=0
|
|
)
|
|
sns.stripplot(
|
|
data=intervention_df, x='Habits_Count', y='Happiness',
|
|
color='#2B5B84', alpha=0.3, jitter=0.2, size=4
|
|
)
|
|
plt.title('Dose-Response: Happiness by Number of Habits Completed', pad=15, fontsize=14, fontweight='bold')
|
|
plt.xlabel('Number of Requested Habits Completed That Day\n(Calendar + Clean Room + Punctual)', fontsize=11)
|
|
plt.ylabel('Happiness Score', fontsize=12)
|
|
plt.ylim(1, 10)
|
|
finish_plot('04_habit_dose_response.png')
|
|
|
|
# 5) HABIT COMPLETION RATES: Which habits were easiest to maintain?
|
|
habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
|
|
adherence_rates = intervention_df[habit_cols].mean().sort_values(ascending=False).reset_index()
|
|
adherence_rates.columns = ['Habit', 'Rate']
|
|
adherence_rates['Habit'] = adherence_rates['Habit'].str.replace('_Adherence', '', regex=False)
|
|
plt.figure(figsize=(8, 6))
|
|
bars = sns.barplot(data=adherence_rates, x='Habit', y='Rate', color='#E76F51')
|
|
plt.title('Which Habits Were Easiest to Keep?', pad=15, fontsize=14, fontweight='bold')
|
|
plt.xlabel('', fontsize=12)
|
|
plt.ylabel('Percentage of Days Completed', fontsize=12)
|
|
plt.ylim(0, 1.05)
|
|
plt.gca().yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1.0))
|
|
for bar in bars.patches:
|
|
plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
|
|
f"{bar.get_height()*100:.0f}%", ha='center', va='bottom', fontweight='bold', fontsize=10)
|
|
finish_plot('05_habit_completion_rates.png')
|
|
|
|
# 6) INDIVIDUAL VARIATION: Participant-level averages show broad effect
|
|
if 'Group' in df.columns:
|
|
plt.figure(figsize=(12, 6))
|
|
participant_avg = df.groupby(['Group', 'Participant_ID'])['Happiness'].mean().reset_index()
|
|
participant_avg = participant_avg.sort_values(['Group', 'Happiness'])
|
|
participant_avg['Order_Index'] = range(len(participant_avg))
|
|
|
|
for group, color in zip(['Control', 'Intervention'], ['#BDBDBD', '#4E79A7']):
|
|
group_data = participant_avg[participant_avg['Group'] == group]
|
|
plt.bar(group_data['Order_Index'], group_data['Happiness'], color=color, label=group, alpha=0.85, width=0.8)
|
|
|
|
plt.axhline(df[df['Group']=='Control']['Happiness'].mean(), color='#7F7F7F', linestyle='--', linewidth=2, label='Control Mean')
|
|
plt.axhline(df[df['Group']=='Intervention']['Happiness'].mean(), color='#2B5B84', linestyle='--', linewidth=2, label='Intervention Mean')
|
|
plt.title('Individual Average Happiness Across Study Participants', pad=15, fontsize=14, fontweight='bold')
|
|
plt.xlabel('Individual Participants (Sorted by Happiness Level)', fontsize=12)
|
|
plt.ylabel('Average Happiness Score', fontsize=12)
|
|
plt.xticks([])
|
|
plt.ylim(1, 10)
|
|
plt.legend(frameon=True, facecolor='white', fontsize=10, loc='upper left')
|
|
finish_plot('06_individual_participant_avgs.png')
|
|
|
|
logging.info('Saved study plots to %s', outdir)
|
|
|
|
|
|
def main(args):
|
|
df = load_data(args.data)
|
|
df = prepare_data(df)
|
|
|
|
descriptive_stats(df)
|
|
|
|
# Effect sizes
|
|
group0 = df[df['Habits_Count'] == 0]['Happiness']
|
|
group3 = df[df['Habits_Count'] == 3]['Happiness']
|
|
if len(group0) > 1 and len(group3) > 1:
|
|
d = cohen_d(group3, group0)
|
|
print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")
|
|
|
|
if 'Group' in df.columns:
|
|
control = df[df['Group'] == 'Control']['Happiness']
|
|
intervention = df[df['Group'] == 'Intervention']['Happiness']
|
|
if len(control) > 1 and len(intervention) > 1:
|
|
d_group = cohen_d(intervention, control)
|
|
print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")
|
|
|
|
# Models
|
|
run_ols(df)
|
|
run_mixedlm(df)
|
|
|
|
# Plots
|
|
make_plots(df, args.outdir, show_plots=args.show)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
parser = argparse.ArgumentParser(description='Improved data analysis for organization_happiness_study_data.csv')
|
|
parser.add_argument('--data', type=str, default='organization_happiness_study_data.csv', help='CSV data path')
|
|
parser.add_argument('--outdir', type=str, default='plots', help='Directory to save plots')
|
|
parser.add_argument('--show', action='store_true', help='Show plots interactively')
|
|
args = parser.parse_args()
|
|
main(args) |