Update Fedora state: 2026-04-29 11:50
This commit is contained in:
parent
42ca768584
commit
10f0d5de1d
338 changed files with 18983 additions and 32 deletions
227
dot_config/private_Code/User/History/7da6e0fb/yfjL.py
Normal file
227
dot_config/private_Code/User/History/7da6e0fb/yfjL.py
Normal file
|
|
@ -0,0 +1,227 @@
|
|||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.formula.api as smf
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
|
||||
|
||||
def load_data(path):
|
||||
df = pd.read_csv(path)
|
||||
logging.info("Loaded %d rows from %s", len(df), path)
|
||||
return df
|
||||
|
||||
|
||||
def prepare_data(df):
|
||||
# Ensure required columns exist
|
||||
required = {'Participant_ID', 'Happiness', 'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence'}
|
||||
missing = required - set(df.columns)
|
||||
if missing:
|
||||
raise KeyError(f"Missing required columns: {missing}")
|
||||
|
||||
if 'Group' not in df.columns:
|
||||
df['Group'] = 'Intervention'
|
||||
df['Group'] = df['Group'].astype(str).str.strip().str.title()
|
||||
|
||||
# Normalize adherence to boolean (Yes/No or True/False)
|
||||
for col in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']:
|
||||
df[col] = df[col].astype(str).str.strip().str.lower().map({'yes': True, 'no': False, 'true': True, 'false': False})
|
||||
|
||||
# Count habits per row
|
||||
df['Habits_Count'] = (
|
||||
df[['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']].fillna(False).astype(int).sum(axis=1)
|
||||
)
|
||||
|
||||
# Coerce Happiness to numeric and drop rows without Happiness
|
||||
df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
|
||||
before = len(df)
|
||||
df = df.dropna(subset=['Happiness'])
|
||||
logging.info('Dropped %d rows without numeric Happiness', before - len(df))
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def descriptive_stats(df):
|
||||
print('Dataset shape:', df.shape)
|
||||
print('\nOverall summary:')
|
||||
print(df['Happiness'].describe())
|
||||
|
||||
if 'Group' in df.columns:
|
||||
print('\nRows by group:')
|
||||
print(df['Group'].value_counts())
|
||||
|
||||
print('\nAverage happiness by group:')
|
||||
print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))
|
||||
|
||||
print('\nAverage happiness by number of habits completed:')
|
||||
print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))
|
||||
|
||||
print('\nMedian happiness by habits:')
|
||||
print(df.groupby('Habits_Count')['Happiness'].median())
|
||||
|
||||
# Correlations
|
||||
print('\nPearson correlation between Habits_Count and Happiness:')
|
||||
print(df[['Habits_Count', 'Happiness']].corr().round(3))
|
||||
|
||||
print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
|
||||
habit_df = df[df['Group'] == 'Intervention'] if 'Group' in df.columns else df
|
||||
for habit in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']:
|
||||
mask = ~habit_df[habit].isna()
|
||||
if mask.sum() == 0:
|
||||
print(f'{habit:22} (no data)')
|
||||
continue
|
||||
r, p = stats.pointbiserialr(habit_df.loc[mask, habit].astype(int), habit_df.loc[mask, 'Happiness'])
|
||||
print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
|
||||
|
||||
|
||||
def cohen_d(x, y):
|
||||
# Cohen's d for two independent samples
|
||||
nx, ny = len(x), len(y)
|
||||
dof = nx + ny - 2
|
||||
pooled_sd = np.sqrt(((nx - 1) * x.std(ddof=1) ** 2 + (ny - 1) * y.std(ddof=1) ** 2) / dof)
|
||||
return (x.mean() - y.mean()) / pooled_sd
|
||||
|
||||
|
||||
def run_ols(df):
|
||||
if 'Group' in df.columns:
|
||||
model = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
|
||||
print('\nOLS regression: Happiness ~ Habits_Count + Group')
|
||||
else:
|
||||
X = sm.add_constant(df['Habits_Count'])
|
||||
y = df['Happiness']
|
||||
model = sm.OLS(y, X).fit()
|
||||
print('\nSimple OLS regression: Happiness ~ Habits_Count')
|
||||
print(model.summary())
|
||||
return model
|
||||
|
||||
|
||||
def run_mixedlm(df):
|
||||
# Random intercept for Participant_ID
|
||||
try:
|
||||
md = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
|
||||
mdf = md.fit(reml=False)
|
||||
print('\nMixed-effects model (random intercept by Participant_ID):')
|
||||
print(mdf.summary())
|
||||
return mdf
|
||||
except Exception as e:
|
||||
logging.warning('MixedLM failed: %s', e)
|
||||
return None
|
||||
|
||||
|
||||
def make_plots(df, outdir, show_plots=False):
|
||||
outdir = Path(outdir)
|
||||
outdir.mkdir(parents=True, exist_ok=True)
|
||||
sns.set_style('whitegrid')
|
||||
|
||||
# Boxplot by Habits_Count
|
||||
plt.figure(figsize=(9, 6))
|
||||
sns.boxplot(data=df, x='Habits_Count', y='Happiness', color='#4C72B0')
|
||||
plt.title('Daily Happiness by Number of Habits Completed')
|
||||
plt.xlabel('Number of habits followed (0–3)')
|
||||
plt.ylabel('Happiness (1–10)')
|
||||
f1 = outdir / 'happiness_by_habits_box.png'
|
||||
plt.tight_layout()
|
||||
plt.savefig(f1)
|
||||
if show_plots:
|
||||
plt.show()
|
||||
plt.close()
|
||||
|
||||
# Violin / jitter + regression
|
||||
plt.figure(figsize=(9, 6))
|
||||
sns.violinplot(data=df, x='Habits_Count', y='Happiness', inner=None, color='#55A868')
|
||||
sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
|
||||
plt.title('Happiness distribution by Habits Completed')
|
||||
f2 = outdir / 'happiness_by_habits_violin.png'
|
||||
plt.tight_layout()
|
||||
plt.savefig(f2)
|
||||
if show_plots:
|
||||
plt.show()
|
||||
plt.close()
|
||||
|
||||
# Participant average bar
|
||||
participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
|
||||
plt.figure(figsize=(12, 5))
|
||||
sns.barplot(x=range(len(participant_avg)), y=participant_avg.values, color='#C44E52')
|
||||
plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
|
||||
plt.xticks(range(len(participant_avg)), participant_avg.index, rotation=45)
|
||||
plt.title('Average Happiness per Participant (sorted)')
|
||||
f3 = outdir / 'participant_avg_happiness.png'
|
||||
plt.tight_layout()
|
||||
plt.savefig(f3)
|
||||
if show_plots:
|
||||
plt.show()
|
||||
plt.close()
|
||||
|
||||
if 'Group' in df.columns:
|
||||
plt.figure(figsize=(7, 5))
|
||||
sns.barplot(data=df, x='Group', y='Happiness', estimator='mean', errorbar='sd', color='#8172B2')
|
||||
plt.title('Mean Happiness by Group')
|
||||
plt.ylabel('Average happiness')
|
||||
f_group = outdir / 'happiness_by_group.png'
|
||||
plt.tight_layout()
|
||||
plt.savefig(f_group)
|
||||
if show_plots:
|
||||
plt.show()
|
||||
plt.close()
|
||||
|
||||
# Scatter with linear fit
|
||||
plt.figure(figsize=(9, 6))
|
||||
if 'Group' in df.columns:
|
||||
sns.scatterplot(data=df, x='Habits_Count', y='Happiness', hue='Group', alpha=0.35)
|
||||
else:
|
||||
sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
|
||||
plt.title('Happiness vs Number of Habits Completed (with linear fit)')
|
||||
f4 = outdir / 'happiness_vs_habits_regression.png'
|
||||
plt.tight_layout()
|
||||
plt.savefig(f4)
|
||||
if show_plots:
|
||||
plt.show()
|
||||
plt.close()
|
||||
|
||||
logging.info('Saved plots to %s', outdir)
|
||||
|
||||
|
||||
def main(args):
|
||||
df = load_data(args.data)
|
||||
df = prepare_data(df)
|
||||
|
||||
descriptive_stats(df)
|
||||
|
||||
# Effect sizes
|
||||
group0 = df[df['Habits_Count'] == 0]['Happiness']
|
||||
group3 = df[df['Habits_Count'] == 3]['Happiness']
|
||||
if len(group0) > 1 and len(group3) > 1:
|
||||
d = cohen_d(group3, group0)
|
||||
print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")
|
||||
|
||||
if 'Group' in df.columns:
|
||||
control = df[df['Group'] == 'Control']['Happiness']
|
||||
intervention = df[df['Group'] == 'Intervention']['Happiness']
|
||||
if len(control) > 1 and len(intervention) > 1:
|
||||
d_group = cohen_d(intervention, control)
|
||||
print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")
|
||||
|
||||
# Models
|
||||
run_ols(df)
|
||||
run_mixedlm(df)
|
||||
|
||||
# Plots
|
||||
make_plots(df, args.outdir, show_plots=args.show)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Improved data analysis for organization_happiness_study_data.csv')
|
||||
parser.add_argument('--data', type=str, default='organization_happiness_study_data.csv', help='CSV data path')
|
||||
parser.add_argument('--outdir', type=str, default='plots', help='Directory to save plots')
|
||||
parser.add_argument('--show', action='store_true', help='Show plots interactively')
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
Loading…
Add table
Add a link
Reference in a new issue