import argparse import os from pathlib import Path import logging import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy import stats import statsmodels.api as sm import statsmodels.formula.api as smf logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') def load_data(path): df = pd.read_csv(path) logging.info("Loaded %d rows from %s", len(df), path) return df def prepare_data(df): # Ensure required columns exist required = {'Participant_ID', 'Happiness', 'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence'} missing = required - set(df.columns) if missing: raise KeyError(f"Missing required columns: {missing}") # Normalize adherence to boolean (Yes/No or True/False) for col in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']: df[col] = df[col].astype(str).str.strip().str.lower().map({'yes': True, 'no': False, 'true': True, 'false': False}) # Count habits per row df['Habits_Count'] = ( df[['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']].fillna(False).astype(int).sum(axis=1) ) # Coerce Happiness to numeric and drop rows without Happiness df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce') before = len(df) df = df.dropna(subset=['Happiness']) logging.info('Dropped %d rows without numeric Happiness', before - len(df)) return df def descriptive_stats(df): print('Dataset shape:', df.shape) print('\nOverall summary:') print(df['Happiness'].describe()) print('\nAverage happiness by number of habits completed:') print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3)) print('\nMedian happiness by habits:') print(df.groupby('Habits_Count')['Happiness'].median()) # Correlations print('\nPearson correlation between Habits_Count and Happiness:') print(df[['Habits_Count', 'Happiness']].corr().round(3)) print('\nPoint-biserial correlation (each habit vs happiness):') for habit in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']: mask = ~df[habit].isna() if mask.sum() == 0: print(f'{habit:22} (no data)') continue r, p = stats.pointbiserialr(df.loc[mask, habit].astype(int), df.loc[mask, 'Happiness']) print(f"{habit:22} r = {r:.3f} p = {p:.4f}") def cohen_d(x, y): # Cohen's d for two independent samples nx, ny = len(x), len(y) dof = nx + ny - 2 pooled_sd = np.sqrt(((nx - 1) * x.std(ddof=1) ** 2 + (ny - 1) * y.std(ddof=1) ** 2) / dof) return (x.mean() - y.mean()) / pooled_sd def run_ols(df): X = sm.add_constant(df['Habits_Count']) y = df['Happiness'] model = sm.OLS(y, X).fit() print('\nSimple OLS regression: Happiness ~ Habits_Count') print(model.summary()) return model def run_mixedlm(df): # Random intercept for Participant_ID try: md = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID']) mdf = md.fit(reml=False) print('\nMixed-effects model (random intercept by Participant_ID):') print(mdf.summary()) return mdf except Exception as e: logging.warning('MixedLM failed: %s', e) return None def make_plots(df, outdir, show_plots=False): outdir = Path(outdir) outdir.mkdir(parents=True, exist_ok=True) sns.set_style('whitegrid') # Boxplot by Habits_Count plt.figure(figsize=(9, 6)) sns.boxplot(x='Habits_Count', y='Happiness', data=df, palette='viridis') plt.title('Daily Happiness by Number of Habits Completed') plt.xlabel('Number of habits followed (0–3)') plt.ylabel('Happiness (1–10)') f1 = outdir / 'happiness_by_habits_box.png' plt.tight_layout() plt.savefig(f1) if show_plots: plt.show() plt.close() # Violin / jitter + regression plt.figure(figsize=(9, 6)) sns.violinplot(x='Habits_Count', y='Happiness', data=df, inner=None, palette='muted') sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15) plt.title('Happiness distribution by Habits Completed') f2 = outdir / 'happiness_by_habits_violin.png' plt.tight_layout() plt.savefig(f2) if show_plots: plt.show() plt.close() # Participant average bar participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values() plt.figure(figsize=(12, 5)) sns.barplot(x=participant_avg.index.astype(str), y=participant_avg.values, palette='coolwarm') plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6) plt.xticks(rotation=45) plt.title('Average Happiness per Participant (sorted)') f3 = outdir / 'participant_avg_happiness.png' plt.tight_layout() plt.savefig(f3) if show_plots: plt.show() plt.close() # Scatter with linear fit plt.figure(figsize=(9, 6)) sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4}) plt.title('Happiness vs Number of Habits Completed (with linear fit)') f4 = outdir / 'happiness_vs_habits_regression.png' plt.tight_layout() plt.savefig(f4) if show_plots: plt.show() plt.close() logging.info('Saved plots to %s', outdir) def main(args): df = load_data(args.data) df = prepare_data(df) descriptive_stats(df) # Effect size example: compare 0 vs 3 group0 = df[df['Habits_Count'] == 0]['Happiness'] group3 = df[df['Habits_Count'] == 3]['Happiness'] if len(group0) > 1 and len(group3) > 1: d = cohen_d(group3, group0) print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}") # Models run_ols(df) run_mixedlm(df) # Plots make_plots(df, args.outdir, show_plots=args.show) if __name__ == '__main__': parser = argparse.ArgumentParser(description='Improved data analysis for organization_happiness_study_data.csv') parser.add_argument('--data', type=str, default='organization_happiness_study_data.csv', help='CSV data path') parser.add_argument('--outdir', type=str, default='plots', help='Directory to save plots') parser.add_argument('--show', action='store_true', help='Show plots interactively') args = parser.parse_args() main(args)