Update Fedora state: 2026-04-29 11:50

2026-04-29 11:50:42 +08:00 · 2026-04-29 11:50:42 +08:00 · 10f0d5de1d
commit 10f0d5de1d
parent 42ca768584
338 changed files with 18983 additions and 32 deletions
--- a/dot_config/private_Code/User/History/7da6e0fb/MtI5.py
+++ b/dot_config/private_Code/User/History/7da6e0fb/MtI5.py
@ -0,0 +1,272 @@
+import argparse
+import os
+from pathlib import Path
+import logging
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from scipy import stats
+import statsmodels.api as sm
+import statsmodels.formula.api as smf
+
+
+logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+
+
+def load_data(path):
+    df = pd.read_csv(path)
+    logging.info("Loaded %d rows from %s", len(df), path)
+    return df
+
+
+def prepare_data(df):
+    # Ensure required columns exist
+    required = {'Participant_ID', 'Happiness', 'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence'}
+    missing = required - set(df.columns)
+    if missing:
+        raise KeyError(f"Missing required columns: {missing}")
+
+    if 'Group' not in df.columns:
+        df['Group'] = 'Intervention'
+    df['Group'] = df['Group'].astype(str).str.strip().str.title()
+
+    # Normalize adherence to boolean (Yes/No or True/False)
+    for col in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']:
+        df[col] = df[col].astype(str).str.strip().str.lower().map({'yes': True, 'no': False, 'true': True, 'false': False})
+
+    # Count habits per row
+    df['Habits_Count'] = (
+        df[['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']].fillna(False).astype(int).sum(axis=1)
+    )
+
+    # Coerce Happiness to numeric and drop rows without Happiness
+    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
+    before = len(df)
+    df = df.dropna(subset=['Happiness'])
+    logging.info('Dropped %d rows without numeric Happiness', before - len(df))
+
+    return df
+
+
+def descriptive_stats(df):
+    print('Dataset shape:', df.shape)
+    print('\nOverall summary:')
+    print(df['Happiness'].describe())
+
+    if 'Group' in df.columns:
+        print('\nRows by group:')
+        print(df['Group'].value_counts())
+
+        print('\nAverage happiness by group:')
+        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))
+
+    print('\nAverage happiness by number of habits completed:')
+    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))
+
+    print('\nMedian happiness by habits:')
+    print(df.groupby('Habits_Count')['Happiness'].median())
+
+    # Correlations
+    print('\nPearson correlation between Habits_Count and Happiness:')
+    print(df[['Habits_Count', 'Happiness']].corr().round(3))
+
+    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
+    habit_df = df[df['Group'] == 'Intervention'] if 'Group' in df.columns else df
+    for habit in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']:
+        mask = ~habit_df[habit].isna()
+        if mask.sum() == 0:
+            print(f'{habit:22}  (no data)')
+            continue
+        r, p = stats.pointbiserialr(habit_df.loc[mask, habit].astype(int), habit_df.loc[mask, 'Happiness'])
+        print(f"{habit:22}  r = {r:.3f}   p = {p:.4f}")
+
+
+def cohen_d(x, y):
+    # Cohen's d for two independent samples
+    nx, ny = len(x), len(y)
+    dof = nx + ny - 2
+    pooled_sd = np.sqrt(((nx - 1) * x.std(ddof=1) ** 2 + (ny - 1) * y.std(ddof=1) ** 2) / dof)
+    return (x.mean() - y.mean()) / pooled_sd
+
+
+def run_ols(df):
+    if 'Group' in df.columns:
+        model = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
+        print('\nOLS regression: Happiness ~ Habits_Count + Group')
+    else:
+        X = sm.add_constant(df['Habits_Count'])
+        y = df['Happiness']
+        model = sm.OLS(y, X).fit()
+        print('\nSimple OLS regression: Happiness ~ Habits_Count')
+    print(model.summary())
+    return model
+
+
+def run_mixedlm(df):
+    # Random intercept for Participant_ID
+    try:
+        md = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
+        mdf = md.fit(reml=False)
+        print('\nMixed-effects model (random intercept by Participant_ID):')
+        print(mdf.summary())
+        return mdf
+    except Exception as e:
+        logging.warning('MixedLM failed: %s', e)
+        return None
+
+
+def make_plots(df, outdir, show_plots=False):
+    outdir = Path(outdir)
+    outdir.mkdir(parents=True, exist_ok=True)
+    sns.set_theme(style='whitegrid', context='talk')
+
+    def finish_plot(filename):
+        plt.tight_layout()
+        plt.savefig(outdir / filename, dpi=200, bbox_inches='tight')
+        if show_plots:
+            plt.show()
+        plt.close()
+
+    # 1) Mean happiness by group with error bars
+    if 'Group' in df.columns:
+        plt.figure(figsize=(8, 6))
+        order = ['Control', 'Intervention']
+        sns.barplot(
+            data=df,
+            x='Group',
+            y='Happiness',
+            order=order,
+            estimator='mean',
+            errorbar=('ci', 95),
+            palette=['#7A7A7A', '#2A9D8F'],
+        )
+        plt.title('Average Happiness by Group')
+        plt.xlabel('Study group')
+        plt.ylabel('Mean happiness score')
+        plt.ylim(0, 10)
+        finish_plot('01_mean_happiness_by_group.png')
+
+    # 2) Distribution of happiness by group
+    if 'Group' in df.columns:
+        plt.figure(figsize=(9, 6))
+        order = ['Control', 'Intervention']
+        sns.boxplot(data=df, x='Group', y='Happiness', order=order, palette=['#B0B0B0', '#73C6B6'])
+        sns.stripplot(data=df, x='Group', y='Happiness', order=order, color='black', alpha=0.18, jitter=0.22, size=2)
+        plt.title('Happiness Distribution by Group')
+        plt.xlabel('Study group')
+        plt.ylabel('Happiness score')
+        plt.ylim(0, 10)
+        finish_plot('02_happiness_distribution_by_group.png')
+
+    # 3) Daily happiness trend by group
+    if 'Group' in df.columns and 'Day' in df.columns:
+        daily = df.groupby(['Group', 'Day'], as_index=False)['Happiness'].mean()
+        plt.figure(figsize=(10, 6))
+        sns.lineplot(data=daily, x='Day', y='Happiness', hue='Group', hue_order=['Control', 'Intervention'], marker='o')
+        plt.title('Mean Daily Happiness Across the Study')
+        plt.xlabel('Day of study')
+        plt.ylabel('Average happiness')
+        plt.ylim(0, 10)
+        plt.xticks(range(1, 31, 2))
+        finish_plot('03_daily_happiness_trend.png')
+
+    # 4) Happiness by number of habits in intervention group only
+    intervention_df = df[df['Group'] == 'Intervention'] if 'Group' in df.columns else df
+    plt.figure(figsize=(9, 6))
+    sns.boxplot(data=intervention_df, x='Habits_Count', y='Happiness', color='#4C72B0')
+    sns.stripplot(data=intervention_df, x='Habits_Count', y='Happiness', color='black', alpha=0.20, jitter=0.18, size=2)
+    plt.title('Intervention Group: Happiness by Number of Habits Completed')
+    plt.xlabel('Habits completed that day')
+    plt.ylabel('Happiness score')
+    plt.ylim(0, 10)
+    finish_plot('04_happiness_by_habits_intervention.png')
+
+    # 5) Mean happiness by habits count in intervention group
+    habits_mean = intervention_df.groupby('Habits_Count', as_index=False)['Happiness'].mean()
+    plt.figure(figsize=(8, 6))
+    sns.lineplot(data=habits_mean, x='Habits_Count', y='Happiness', marker='o', color='#1F77B4')
+    plt.title('Intervention Group: Mean Happiness vs Habits Completed')
+    plt.xlabel('Number of habits completed')
+    plt.ylabel('Mean happiness')
+    plt.xticks([0, 1, 2, 3])
+    plt.ylim(0, 10)
+    finish_plot('05_mean_happiness_by_habits.png')
+
+    # 6) Habit adherence rates in the intervention group
+    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
+    adherence_rates = (
+        intervention_df[habit_cols]
+        .mean()
+        .sort_values(ascending=False)
+        .reset_index()
+        .rename(columns={'index': 'Habit', 0: 'Rate'})
+    )
+    adherence_rates['Habit'] = adherence_rates['Habit'].str.replace('_Adherence', '', regex=False)
+    plt.figure(figsize=(9, 6))
+    sns.barplot(data=adherence_rates, x='Habit', y=0, color='#E76F51')
+    plt.title('Intervention Group: Habit Completion Rate')
+    plt.xlabel('Habit')
+    plt.ylabel('Proportion completed')
+    plt.ylim(0, 1)
+    plt.gca().yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1.0))
+    finish_plot('06_habit_completion_rate.png')
+
+    # 7) Participant averages, grouped by study group
+    if 'Group' in df.columns:
+        participant_avg = df.groupby(['Group', 'Participant_ID'])['Happiness'].mean().reset_index()
+        plt.figure(figsize=(12, 6))
+        sns.barplot(
+            data=participant_avg,
+            x='Participant_ID',
+            y='Happiness',
+            hue='Group',
+            dodge=True,
+            palette=['#7A7A7A', '#2A9D8F'],
+        )
+        plt.title('Average Happiness per Participant')
+        plt.xlabel('Participant ID')
+        plt.ylabel('Mean happiness')
+        plt.ylim(0, 10)
+        plt.xticks(rotation=45)
+        finish_plot('07_participant_average_happiness.png')
+
+    logging.info('Saved plots to %s', outdir)
+
+
+def main(args):
+    df = load_data(args.data)
+    df = prepare_data(df)
+
+    descriptive_stats(df)
+
+    # Effect sizes
+    group0 = df[df['Habits_Count'] == 0]['Happiness']
+    group3 = df[df['Habits_Count'] == 3]['Happiness']
+    if len(group0) > 1 and len(group3) > 1:
+        d = cohen_d(group3, group0)
+        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")
+
+    if 'Group' in df.columns:
+        control = df[df['Group'] == 'Control']['Happiness']
+        intervention = df[df['Group'] == 'Intervention']['Happiness']
+        if len(control) > 1 and len(intervention) > 1:
+            d_group = cohen_d(intervention, control)
+            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")
+
+    # Models
+    run_ols(df)
+    run_mixedlm(df)
+
+    # Plots
+    make_plots(df, args.outdir, show_plots=args.show)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Improved data analysis for organization_happiness_study_data.csv')
+    parser.add_argument('--data', type=str, default='organization_happiness_study_data.csv', help='CSV data path')
+    parser.add_argument('--outdir', type=str, default='plots', help='Directory to save plots')
+    parser.add_argument('--show', action='store_true', help='Show plots interactively')
+    args = parser.parse_args()
+    main(args)