dotfiles/dot_config/private_Code/User/History/7da6e0fb/SA9R.py
2026-04-29 11:50:42 +08:00

189 lines
No EOL
6.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import argparse
import os
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def load_data(path):
df = pd.read_csv(path)
logging.info("Loaded %d rows from %s", len(df), path)
return df
def prepare_data(df):
# Ensure required columns exist
required = {'Participant_ID', 'Happiness', 'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence'}
missing = required - set(df.columns)
if missing:
raise KeyError(f"Missing required columns: {missing}")
# Normalize adherence to boolean (Yes/No or True/False)
for col in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']:
df[col] = df[col].astype(str).str.strip().str.lower().map({'yes': True, 'no': False, 'true': True, 'false': False})
# Count habits per row
df['Habits_Count'] = (
df[['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']].fillna(False).astype(int).sum(axis=1)
)
# Coerce Happiness to numeric and drop rows without Happiness
df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
before = len(df)
df = df.dropna(subset=['Happiness'])
logging.info('Dropped %d rows without numeric Happiness', before - len(df))
return df
def descriptive_stats(df):
print('Dataset shape:', df.shape)
print('\nOverall summary:')
print(df['Happiness'].describe())
print('\nAverage happiness by number of habits completed:')
print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))
print('\nMedian happiness by habits:')
print(df.groupby('Habits_Count')['Happiness'].median())
# Correlations
print('\nPearson correlation between Habits_Count and Happiness:')
print(df[['Habits_Count', 'Happiness']].corr().round(3))
print('\nPoint-biserial correlation (each habit vs happiness):')
for habit in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']:
mask = ~df[habit].isna()
if mask.sum() == 0:
print(f'{habit:22} (no data)')
continue
r, p = stats.pointbiserialr(df.loc[mask, habit].astype(int), df.loc[mask, 'Happiness'])
print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
def cohen_d(x, y):
# Cohen's d for two independent samples
nx, ny = len(x), len(y)
dof = nx + ny - 2
pooled_sd = np.sqrt(((nx - 1) * x.std(ddof=1) ** 2 + (ny - 1) * y.std(ddof=1) ** 2) / dof)
return (x.mean() - y.mean()) / pooled_sd
def run_ols(df):
X = sm.add_constant(df['Habits_Count'])
y = df['Happiness']
model = sm.OLS(y, X).fit()
print('\nSimple OLS regression: Happiness ~ Habits_Count')
print(model.summary())
return model
def run_mixedlm(df):
# Random intercept for Participant_ID
try:
md = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
mdf = md.fit(reml=False)
print('\nMixed-effects model (random intercept by Participant_ID):')
print(mdf.summary())
return mdf
except Exception as e:
logging.warning('MixedLM failed: %s', e)
return None
def make_plots(df, outdir, show_plots=False):
outdir = Path(outdir)
outdir.mkdir(parents=True, exist_ok=True)
sns.set_style('whitegrid')
# Boxplot by Habits_Count
plt.figure(figsize=(9, 6))
sns.boxplot(x='Habits_Count', y='Happiness', data=df, palette='viridis')
plt.title('Daily Happiness by Number of Habits Completed')
plt.xlabel('Number of habits followed (03)')
plt.ylabel('Happiness (110)')
f1 = outdir / 'happiness_by_habits_box.png'
plt.tight_layout()
plt.savefig(f1)
if show_plots:
plt.show()
plt.close()
# Violin / jitter + regression
plt.figure(figsize=(9, 6))
sns.violinplot(x='Habits_Count', y='Happiness', data=df, inner=None, palette='muted')
sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
plt.title('Happiness distribution by Habits Completed')
f2 = outdir / 'happiness_by_habits_violin.png'
plt.tight_layout()
plt.savefig(f2)
if show_plots:
plt.show()
plt.close()
# Participant average bar
participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
plt.figure(figsize=(12, 5))
sns.barplot(x=participant_avg.index.astype(str), y=participant_avg.values, palette='coolwarm')
plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
plt.xticks(rotation=45)
plt.title('Average Happiness per Participant (sorted)')
f3 = outdir / 'participant_avg_happiness.png'
plt.tight_layout()
plt.savefig(f3)
if show_plots:
plt.show()
plt.close()
# Scatter with linear fit
plt.figure(figsize=(9, 6))
sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
plt.title('Happiness vs Number of Habits Completed (with linear fit)')
f4 = outdir / 'happiness_vs_habits_regression.png'
plt.tight_layout()
plt.savefig(f4)
if show_plots:
plt.show()
plt.close()
logging.info('Saved plots to %s', outdir)
def main(args):
df = load_data(args.data)
df = prepare_data(df)
descriptive_stats(df)
# Effect size example: compare 0 vs 3
group0 = df[df['Habits_Count'] == 0]['Happiness']
group3 = df[df['Habits_Count'] == 3]['Happiness']
if len(group0) > 1 and len(group3) > 1:
d = cohen_d(group3, group0)
print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")
# Models
run_ols(df)
run_mixedlm(df)
# Plots
make_plots(df, args.outdir, show_plots=args.show)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Improved data analysis for organization_happiness_study_data.csv')
parser.add_argument('--data', type=str, default='organization_happiness_study_data.csv', help='CSV data path')
parser.add_argument('--outdir', type=str, default='plots', help='Directory to save plots')
parser.add_argument('--show', action='store_true', help='Show plots interactively')
args = parser.parse_args()
main(args)