Update Fedora state: 2026-04-29 11:50
This commit is contained in:
parent
42ca768584
commit
10f0d5de1d
338 changed files with 18983 additions and 32 deletions
284
dot_config/private_Code/User/History/7da6e0fb/9KVj.py
Normal file
284
dot_config/private_Code/User/History/7da6e0fb/9KVj.py
Normal file
|
|
@ -0,0 +1,284 @@
|
|||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.formula.api as smf
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
|
||||
|
||||
def load_data(path):
    """Read the CSV at *path* into a DataFrame and log how many rows it has.

    Parameters
    ----------
    path : str or path-like or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The parsed table, unmodified.
    """
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
|
||||
|
||||
|
||||
def prepare_data(df):
    """Validate and normalise the raw study table without touching the input.

    Steps performed on a copy of *df*:

    * add a ``Group`` column (all ``'Intervention'``) when it is absent, then
      strip and title-case the group labels;
    * map the three ``*_Adherence`` columns from yes/no/true/false text
      (case-insensitive, surrounding whitespace ignored) to booleans; any
      other value becomes missing;
    * add ``Habits_Count``, the per-row number of habits marked True
      (missing values count as not completed);
    * coerce ``Happiness`` to numeric and drop rows where that fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']

    # Ensure required columns exist
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Work on a copy: the original version wrote new columns into the
    # caller's frame and then returned a different (filtered) object.
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_values = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_values)

    # Count habits per row; comparing with True treats missing as "not done"
    # without going through fillna() on object columns.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))

    return df
|
||||
|
||||
|
||||
def descriptive_stats(df):
    """Print descriptive summaries of happiness to stdout.

    Reads the ``Happiness`` and ``Habits_Count`` columns and the three
    ``*_Adherence`` columns; ``Group`` is used when present. Printed, in
    order: dataset shape, overall happiness summary, per-group counts and
    means, mean/median happiness per habit count, the Pearson correlation
    between habit count and happiness, and a point-biserial correlation of
    each habit with happiness (restricted to the ``'Intervention'`` group
    when a ``Group`` column exists).

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared study data.

    Returns
    -------
    None
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())

        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in habit_cols:
        mask = habit_df[habit].notna()
        n_obs = int(mask.sum())
        if n_obs == 0:
            print(f'{habit:22} (no data)')
            continue
        if n_obs < 2:
            # A correlation needs at least two observations; scipy raises
            # on shorter input, so report it instead of crashing.
            print(f'{habit:22} (insufficient data)')
            continue
        r, p = stats.pointbiserialr(habit_df.loc[mask, habit].astype(int), habit_df.loc[mask, 'Happiness'])
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
|
||||
|
||||
|
||||
def cohen_d(x, y):
    """Return Cohen's d for two independent samples, using the pooled SD.

    Parameters
    ----------
    x, y : array-like of numbers
        The two samples (list, tuple, NumPy array or pandas Series). NaN
        entries are removed first, so the sample sizes used as pooling
        weights always match the values that enter the means and standard
        deviations.

    Returns
    -------
    float
        ``(mean(x) - mean(y)) / pooled_sd``. ``nan`` is returned when either
        sample has fewer than two usable observations. A zero pooled SD
        yields ``inf``/``nan`` as plain floating-point division would.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # The sample variance (ddof=1) is undefined for fewer than 2 values.
        return float('nan')

    dof = nx + ny - 2
    pooled_var = ((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof
    pooled_sd = np.sqrt(pooled_var)
    # Keep the division result for zero variance but silence the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float((x.mean() - y.mean()) / pooled_sd)
|
||||
|
||||
|
||||
def run_ols(df):
    """Fit an OLS model of happiness, print its summary and return the fit.

    With a ``Group`` column the model is
    ``Happiness ~ Habits_Count + C(Group)``; without one it is a simple
    regression of ``Happiness`` on ``Habits_Count`` plus a constant.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with ``Happiness`` and ``Habits_Count`` columns.

    Returns
    -------
    The fitted statsmodels results object.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        fitted = sm.OLS(df['Happiness'], design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'

    print(heading)
    print(fitted.summary())
    return fitted
|
||||
|
||||
|
||||
def run_mixedlm(df):
    """Fit ``Happiness ~ Habits_Count`` with a random intercept per participant.

    The model is fitted by maximum likelihood (``reml=False``) and its
    summary is printed. This is best-effort: any exception raised while
    building, fitting or printing is logged as a warning and ``None`` is
    returned instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with ``Happiness``, ``Habits_Count`` and ``Participant_ID``.

    Returns
    -------
    The fitted statsmodels results object, or ``None`` on failure.
    """
    try:
        # Random intercept for Participant_ID
        mixed_model = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        fitted = mixed_model.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(fitted.summary())
    except Exception as exc:
        logging.warning('MixedLM failed: %s', exc)
        return None
    return fitted
|
||||
|
||||
|
||||
def make_plots(df, outdir, show_plots=False):
    """Render the study figures and save them as PNG files in *outdir*.

    Figures written (those marked * only when a ``Group`` column exists):

    1. * mean happiness per group with 95% CI error bars
    2. * violin + strip plot of happiness per group
    3. * daily mean happiness per group (also needs a ``Day`` column)
    4.   happiness by number of habits completed (intervention rows only
         when ``Group`` exists)
    5.   completion rate of each habit
    6. * per-participant average happiness

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data with ``Happiness``, ``Habits_Count``,
        ``Participant_ID`` and the three ``*_Adherence`` columns.
    outdir : str or Path
        Output directory; created (with parents) if missing.
    show_plots : bool, default False
        Also display each figure with ``plt.show()`` before closing it.

    Returns
    -------
    None
    """
    # Imported explicitly rather than reached through ``plt.matplotlib.ticker``,
    # which depends on an incidental attribute chain.
    from matplotlib.ticker import PercentFormatter

    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_theme(style='whitegrid', context='talk')

    has_group = 'Group' in df.columns

    def finish_plot(filename):
        # Save the current figure, optionally show it, then release it.
        plt.tight_layout()
        plt.savefig(outdir / filename, dpi=200, bbox_inches='tight')
        if show_plots:
            plt.show()
        plt.close()

    # 1) PRIMARY OUTCOME: Mean happiness by group with error bars and value labels
    if has_group:
        plt.figure(figsize=(8, 6))
        summary = df.groupby('Group')['Happiness'].agg(['mean', 'std', 'count']).reindex(['Control', 'Intervention'])
        ci95 = 1.96 * (summary['std'] / np.sqrt(summary['count']))
        bars = plt.bar(
            np.arange(len(summary)),
            summary['mean'].values,
            yerr=ci95.values,
            color=['#A9B2C3', '#4E79A7'],
            capsize=8,
            edgecolor='black',
            linewidth=1.2,
            alpha=0.9
        )
        plt.xticks(np.arange(len(summary)), ['Control Group\n(No habits tracked)', 'Intervention Group\n(Daily habits tracked)'])
        plt.title('Effect of Tracked Organization Habits on Happiness', pad=15, fontsize=14, fontweight='bold')
        plt.ylabel('Mean Daily Happiness Score (1-10)', fontsize=12)
        plt.ylim(1, 10)
        for bar in bars:
            yval = bar.get_height()
            if not np.isfinite(yval):
                # A group absent from the data has a NaN mean after reindex;
                # skip its label instead of drawing the text 'nan'.
                continue
            plt.text(bar.get_x() + bar.get_width()/2, yval - 0.8, f'{yval:.1f}', ha='center', va='center', color='white', fontweight='bold', fontsize=11)
        finish_plot('01_primary_outcome_group_comparison.png')

    # 2) DISTRIBUTIONS: Show overlap and variability in happiness scores
    if has_group:
        plt.figure(figsize=(9, 6))
        order = ['Control', 'Intervention']
        sns.violinplot(
            data=df, x='Group', y='Happiness', order=order,
            inner='quartile', palette={'Control': '#E0E0E0', 'Intervention': '#B3CDE3'}, cut=0
        )
        sns.stripplot(
            data=df, x='Group', y='Happiness', order=order,
            color='black', alpha=0.12, jitter=0.25, size=3
        )
        plt.title('Distribution of Happiness Reports Over 30 Days', pad=15, fontsize=14, fontweight='bold')
        plt.xlabel('Study Group', fontsize=12)
        plt.ylabel('Happiness Score', fontsize=12)
        plt.ylim(1, 10)
        finish_plot('02_happiness_distribution_by_group.png')

    # 3) LONGITUDINAL: Daily happiness trend across 30 days
    if has_group and 'Day' in df.columns:
        plt.figure(figsize=(10, 6))
        daily_mean = df.groupby(['Group', 'Day'])['Happiness'].mean().reset_index()
        sns.lineplot(
            data=daily_mean, x='Day', y='Happiness', hue='Group',
            hue_order=['Control', 'Intervention'],
            palette={'Control': '#7F7F7F', 'Intervention': '#D62728'},
            marker='o', linewidth=2.5, markersize=6
        )
        plt.title('Longitudinal Daily Happiness Throughout the Study', pad=15, fontsize=14, fontweight='bold')
        plt.xlabel('Day of Study (1-30)', fontsize=12)
        plt.ylabel('Average Happiness', fontsize=12)
        plt.ylim(1, 10)
        plt.xticks(range(1, 31, 2))
        plt.legend(title='', frameon=True, facecolor='white', fontsize=10)
        finish_plot('03_longitudinal_trends.png')

    # 4) DOSE-RESPONSE: In intervention group, does MORE habits = MORE happiness?
    intervention_df = df[df['Group'] == 'Intervention'] if has_group else df
    plt.figure(figsize=(9, 6))
    sns.boxplot(
        data=intervention_df, x='Habits_Count', y='Happiness',
        color='#9ECAE1', width=0.6, fliersize=0
    )
    sns.stripplot(
        data=intervention_df, x='Habits_Count', y='Happiness',
        color='#2B5B84', alpha=0.3, jitter=0.2, size=4
    )
    plt.title('Dose-Response: Happiness by Number of Habits Completed', pad=15, fontsize=14, fontweight='bold')
    plt.xlabel('Number of Requested Habits Completed That Day\n(Calendar + Clean Room + Punctual)', fontsize=11)
    plt.ylabel('Happiness Score', fontsize=12)
    plt.ylim(1, 10)
    finish_plot('04_habit_dose_response.png')

    # 5) HABIT COMPLETION RATES: Which habits were easiest to maintain?
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    adherence_rates = intervention_df[habit_cols].mean().sort_values(ascending=False).reset_index()
    adherence_rates.columns = ['Habit', 'Rate']
    adherence_rates['Habit'] = adherence_rates['Habit'].str.replace('_Adherence', '', regex=False)
    plt.figure(figsize=(8, 6))
    bars = sns.barplot(data=adherence_rates, x='Habit', y='Rate', color='#E76F51')
    plt.title('Which Habits Were Easiest to Keep?', pad=15, fontsize=14, fontweight='bold')
    plt.xlabel('', fontsize=12)
    plt.ylabel('Percentage of Days Completed', fontsize=12)
    plt.ylim(0, 1.05)
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1.0))
    for bar in bars.patches:
        height = bar.get_height()
        if not np.isfinite(height):
            # No rate available for this habit; nothing meaningful to label.
            continue
        plt.text(bar.get_x() + bar.get_width() / 2, height + 0.02,
                 f"{height*100:.0f}%", ha='center', va='bottom', fontweight='bold', fontsize=10)
    finish_plot('05_habit_completion_rates.png')

    # 6) INDIVIDUAL VARIATION: Participant-level averages show broad effect
    if has_group:
        plt.figure(figsize=(12, 6))
        participant_avg = df.groupby(['Group', 'Participant_ID'])['Happiness'].mean().reset_index()
        participant_avg = participant_avg.sort_values(['Group', 'Happiness'])
        participant_avg['Order_Index'] = range(len(participant_avg))

        for group, color in zip(['Control', 'Intervention'], ['#BDBDBD', '#4E79A7']):
            group_data = participant_avg[participant_avg['Group'] == group]
            plt.bar(group_data['Order_Index'], group_data['Happiness'], color=color, label=group, alpha=0.85, width=0.8)

        plt.axhline(df[df['Group']=='Control']['Happiness'].mean(), color='#7F7F7F', linestyle='--', linewidth=2, label='Control Mean')
        plt.axhline(df[df['Group']=='Intervention']['Happiness'].mean(), color='#2B5B84', linestyle='--', linewidth=2, label='Intervention Mean')
        plt.title('Individual Average Happiness Across Study Participants', pad=15, fontsize=14, fontweight='bold')
        plt.xlabel('Individual Participants (Sorted by Happiness Level)', fontsize=12)
        plt.ylabel('Average Happiness Score', fontsize=12)
        plt.xticks([])
        plt.ylim(1, 10)
        plt.legend(frameon=True, facecolor='white', fontsize=10, loc='upper left')
        finish_plot('06_individual_participant_avgs.png')

    logging.info('Saved study plots to %s', outdir)
|
||||
|
||||
|
||||
def main(args):
    """Run the full analysis: load, clean, describe, model and plot.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``data`` (CSV path), ``outdir`` (plot directory) and
        ``show`` (whether to display plots interactively).
    """
    data = prepare_data(load_data(args.data))

    descriptive_stats(data)

    happiness = data['Happiness']

    # Effect size: days with all three habits vs days with none
    none_done = happiness[data['Habits_Count'] == 0]
    all_done = happiness[data['Habits_Count'] == 3]
    if min(len(none_done), len(all_done)) > 1:
        effect_habits = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {effect_habits:.3f}")

    # Effect size: intervention vs control
    if 'Group' in data.columns:
        control_scores = happiness[data['Group'] == 'Control']
        intervention_scores = happiness[data['Group'] == 'Intervention']
        if min(len(control_scores), len(intervention_scores)) > 1:
            effect_group = cohen_d(intervention_scores, control_scores)
            print(f"Cohen's d (Intervention vs Control happiness) = {effect_group:.3f}")

    # Models
    run_ols(data)
    run_mixedlm(data)

    # Plots
    make_plots(data, args.outdir, show_plots=args.show)
|
||||
|
||||
|
||||
# Command-line entry point: parse the options and hand them to main().
if __name__ == '__main__':
    cli = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv'
    )
    cli.add_argument(
        '--data', type=str, default='organization_happiness_study_data.csv', help='CSV data path'
    )
    cli.add_argument('--outdir', type=str, default='plots', help='Directory to save plots')
    cli.add_argument('--show', action='store_true', help='Show plots interactively')
    main(cli.parse_args())
|
||||
227
dot_config/private_Code/User/History/7da6e0fb/FOyN.py
Normal file
227
dot_config/private_Code/User/History/7da6e0fb/FOyN.py
Normal file
|
|
@ -0,0 +1,227 @@
|
|||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.formula.api as smf
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
|
||||
|
||||
def load_data(path):
    """Read the CSV at *path* into a DataFrame and log how many rows it has.

    Parameters
    ----------
    path : str or path-like or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The parsed table, unmodified.
    """
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
|
||||
|
||||
|
||||
def prepare_data(df):
    """Validate and normalise the raw study table without touching the input.

    Steps performed on a copy of *df*:

    * add a ``Group`` column (all ``'Intervention'``) when it is absent, then
      strip and title-case the group labels;
    * map the three ``*_Adherence`` columns from yes/no/true/false text
      (case-insensitive, surrounding whitespace ignored) to booleans; any
      other value becomes missing;
    * add ``Habits_Count``, the per-row number of habits marked True
      (missing values count as not completed);
    * coerce ``Happiness`` to numeric and drop rows where that fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']

    # Ensure required columns exist
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Work on a copy: the original version wrote new columns into the
    # caller's frame and then returned a different (filtered) object.
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_values = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_values)

    # Count habits per row; comparing with True treats missing as "not done"
    # without going through fillna() on object columns.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))

    return df
|
||||
|
||||
|
||||
def descriptive_stats(df):
    """Print descriptive summaries of happiness to stdout.

    Reads the ``Happiness`` and ``Habits_Count`` columns and the three
    ``*_Adherence`` columns; ``Group`` is used when present. Printed, in
    order: dataset shape, overall happiness summary, per-group counts and
    means, mean/median happiness per habit count, the Pearson correlation
    between habit count and happiness, and a point-biserial correlation of
    each habit with happiness (restricted to the ``'Intervention'`` group
    when a ``Group`` column exists).

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared study data.

    Returns
    -------
    None
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())

        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in habit_cols:
        mask = habit_df[habit].notna()
        n_obs = int(mask.sum())
        if n_obs == 0:
            print(f'{habit:22} (no data)')
            continue
        if n_obs < 2:
            # A correlation needs at least two observations; scipy raises
            # on shorter input, so report it instead of crashing.
            print(f'{habit:22} (insufficient data)')
            continue
        r, p = stats.pointbiserialr(habit_df.loc[mask, habit].astype(int), habit_df.loc[mask, 'Happiness'])
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
|
||||
|
||||
|
||||
def cohen_d(x, y):
    """Return Cohen's d for two independent samples, using the pooled SD.

    Parameters
    ----------
    x, y : array-like of numbers
        The two samples (list, tuple, NumPy array or pandas Series). NaN
        entries are removed first, so the sample sizes used as pooling
        weights always match the values that enter the means and standard
        deviations.

    Returns
    -------
    float
        ``(mean(x) - mean(y)) / pooled_sd``. ``nan`` is returned when either
        sample has fewer than two usable observations. A zero pooled SD
        yields ``inf``/``nan`` as plain floating-point division would.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # The sample variance (ddof=1) is undefined for fewer than 2 values.
        return float('nan')

    dof = nx + ny - 2
    pooled_var = ((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof
    pooled_sd = np.sqrt(pooled_var)
    # Keep the division result for zero variance but silence the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float((x.mean() - y.mean()) / pooled_sd)
|
||||
|
||||
|
||||
def run_ols(df):
    """Fit an OLS model of happiness, print its summary and return the fit.

    With a ``Group`` column the model is
    ``Happiness ~ Habits_Count + C(Group)``; without one it is a simple
    regression of ``Happiness`` on ``Habits_Count`` plus a constant.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with ``Happiness`` and ``Habits_Count`` columns.

    Returns
    -------
    The fitted statsmodels results object.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        fitted = sm.OLS(df['Happiness'], design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'

    print(heading)
    print(fitted.summary())
    return fitted
|
||||
|
||||
|
||||
def run_mixedlm(df):
    """Fit ``Happiness ~ Habits_Count`` with a random intercept per participant.

    The model is fitted by maximum likelihood (``reml=False``) and its
    summary is printed. This is best-effort: any exception raised while
    building, fitting or printing is logged as a warning and ``None`` is
    returned instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with ``Happiness``, ``Habits_Count`` and ``Participant_ID``.

    Returns
    -------
    The fitted statsmodels results object, or ``None`` on failure.
    """
    try:
        # Random intercept for Participant_ID
        mixed_model = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        fitted = mixed_model.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(fitted.summary())
    except Exception as exc:
        logging.warning('MixedLM failed: %s', exc)
        return None
    return fitted
|
||||
|
||||
|
||||
def make_plots(df, outdir, show_plots=False):
    """Render the exploratory figures and save them as PNG files in *outdir*.

    Figures written: a boxplot and a violin/strip plot of happiness by
    ``Habits_Count``, a sorted bar chart of per-participant mean happiness,
    a mean-happiness-by-group bar chart (only when ``Group`` exists), and a
    happiness-vs-habits scatter with a linear fit.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data with ``Happiness``, ``Habits_Count`` and
        ``Participant_ID`` columns.
    outdir : str or Path
        Output directory; created (with parents) if missing.
    show_plots : bool, default False
        Also display each figure with ``plt.show()`` before closing it.

    Returns
    -------
    None
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_style('whitegrid')

    has_group = 'Group' in df.columns

    def _finish(filename):
        # Save the current figure, optionally show it, then release it.
        plt.tight_layout()
        plt.savefig(outdir / filename)
        if show_plots:
            plt.show()
        plt.close()

    # Boxplot by Habits_Count
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=df, x='Habits_Count', y='Happiness', color='#4C72B0')
    plt.title('Daily Happiness by Number of Habits Completed')
    plt.xlabel('Number of habits followed (0–3)')
    plt.ylabel('Happiness (1–10)')
    _finish('happiness_by_habits_box.png')

    # Violin / jitter
    plt.figure(figsize=(9, 6))
    sns.violinplot(data=df, x='Habits_Count', y='Happiness', inner=None, color='#55A868')
    sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
    plt.title('Happiness distribution by Habits Completed')
    _finish('happiness_by_habits_violin.png')

    # Participant average bar
    participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
    plt.figure(figsize=(12, 5))
    sns.barplot(x=range(len(participant_avg)), y=participant_avg.values, color='#C44E52')
    plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
    # Tick labels are given as strings so any ID type renders as text.
    plt.xticks(range(len(participant_avg)), participant_avg.index.astype(str), rotation=45)
    plt.title('Average Happiness per Participant (sorted)')
    _finish('participant_avg_happiness.png')

    if has_group:
        plt.figure(figsize=(7, 5))
        sns.barplot(data=df, x='Group', y='Happiness', estimator='mean', errorbar='sd', color='#8172B2')
        plt.title('Mean Happiness by Group')
        plt.ylabel('Average happiness')
        _finish('happiness_by_group.png')

    # Scatter with linear fit
    plt.figure(figsize=(9, 6))
    if has_group:
        sns.scatterplot(data=df, x='Habits_Count', y='Happiness', hue='Group', alpha=0.35)
        # The title promises a linear fit, but the grouped scatter alone
        # draws none; overlay the regression line when x actually varies.
        if df['Habits_Count'].nunique() > 1:
            sns.regplot(data=df, x='Habits_Count', y='Happiness', scatter=False, color='black')
    else:
        sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
    plt.title('Happiness vs Number of Habits Completed (with linear fit)')
    _finish('happiness_vs_habits_regression.png')

    logging.info('Saved plots to %s', outdir)
|
||||
|
||||
|
||||
def main(args):
    """Run the full analysis: load, clean, describe, model and plot.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``data`` (CSV path), ``outdir`` (plot directory) and
        ``show`` (whether to display plots interactively).
    """
    data = prepare_data(load_data(args.data))

    descriptive_stats(data)

    happiness = data['Happiness']

    # Effect size: days with all three habits vs days with none
    none_done = happiness[data['Habits_Count'] == 0]
    all_done = happiness[data['Habits_Count'] == 3]
    if min(len(none_done), len(all_done)) > 1:
        effect_habits = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {effect_habits:.3f}")

    # Effect size: intervention vs control
    if 'Group' in data.columns:
        control_scores = happiness[data['Group'] == 'Control']
        intervention_scores = happiness[data['Group'] == 'Intervention']
        if min(len(control_scores), len(intervention_scores)) > 1:
            effect_group = cohen_d(intervention_scores, control_scores)
            print(f"Cohen's d (Intervention vs Control happiness) = {effect_group:.3f}")

    # Models
    run_ols(data)
    run_mixedlm(data)

    # Plots
    make_plots(data, args.outdir, show_plots=args.show)
|
||||
|
||||
|
||||
# Command-line entry point: parse the options and hand them to main().
if __name__ == '__main__':
    cli = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv'
    )
    cli.add_argument(
        '--data', type=str, default='organization_happiness_study_data.csv', help='CSV data path'
    )
    cli.add_argument('--outdir', type=str, default='plots', help='Directory to save plots')
    cli.add_argument('--show', action='store_true', help='Show plots interactively')
    main(cli.parse_args())
|
||||
227
dot_config/private_Code/User/History/7da6e0fb/Gx76.py
Normal file
227
dot_config/private_Code/User/History/7da6e0fb/Gx76.py
Normal file
|
|
@ -0,0 +1,227 @@
|
|||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.formula.api as smf
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
|
||||
|
||||
def load_data(path):
    """Read the CSV at *path* into a DataFrame and log how many rows it has.

    Parameters
    ----------
    path : str or path-like or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The parsed table, unmodified.
    """
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
|
||||
|
||||
|
||||
def prepare_data(df):
    """Validate and normalise the raw study table without touching the input.

    Steps performed on a copy of *df*:

    * add a ``Group`` column (all ``'Intervention'``) when it is absent, then
      strip and title-case the group labels;
    * map the three ``*_Adherence`` columns from yes/no/true/false text
      (case-insensitive, surrounding whitespace ignored) to booleans; any
      other value becomes missing;
    * add ``Habits_Count``, the per-row number of habits marked True
      (missing values count as not completed);
    * coerce ``Happiness`` to numeric and drop rows where that fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']

    # Ensure required columns exist
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Work on a copy: the original version wrote new columns into the
    # caller's frame and then returned a different (filtered) object.
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_values = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_values)

    # Count habits per row; comparing with True treats missing as "not done"
    # without going through fillna() on object columns.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))

    return df
|
||||
|
||||
|
||||
def descriptive_stats(df):
    """Print descriptive summaries of happiness to stdout.

    Reads the ``Happiness`` and ``Habits_Count`` columns and the three
    ``*_Adherence`` columns; ``Group`` is used when present. Printed, in
    order: dataset shape, overall happiness summary, per-group counts and
    means, mean/median happiness per habit count, the Pearson correlation
    between habit count and happiness, and a point-biserial correlation of
    each habit with happiness (restricted to the ``'Intervention'`` group
    when a ``Group`` column exists).

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared study data.

    Returns
    -------
    None
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())

        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in habit_cols:
        mask = habit_df[habit].notna()
        n_obs = int(mask.sum())
        if n_obs == 0:
            print(f'{habit:22} (no data)')
            continue
        if n_obs < 2:
            # A correlation needs at least two observations; scipy raises
            # on shorter input, so report it instead of crashing.
            print(f'{habit:22} (insufficient data)')
            continue
        r, p = stats.pointbiserialr(habit_df.loc[mask, habit].astype(int), habit_df.loc[mask, 'Happiness'])
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
|
||||
|
||||
|
||||
def cohen_d(x, y):
    """Return Cohen's d for two independent samples, using the pooled SD.

    Parameters
    ----------
    x, y : array-like of numbers
        The two samples (list, tuple, NumPy array or pandas Series). NaN
        entries are removed first, so the sample sizes used as pooling
        weights always match the values that enter the means and standard
        deviations.

    Returns
    -------
    float
        ``(mean(x) - mean(y)) / pooled_sd``. ``nan`` is returned when either
        sample has fewer than two usable observations. A zero pooled SD
        yields ``inf``/``nan`` as plain floating-point division would.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # The sample variance (ddof=1) is undefined for fewer than 2 values.
        return float('nan')

    dof = nx + ny - 2
    pooled_var = ((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof
    pooled_sd = np.sqrt(pooled_var)
    # Keep the division result for zero variance but silence the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float((x.mean() - y.mean()) / pooled_sd)
|
||||
|
||||
|
||||
def run_ols(df):
    """Fit an OLS model of happiness, print its summary and return the fit.

    With a ``Group`` column the model is
    ``Happiness ~ Habits_Count + C(Group)``; without one it is a simple
    regression of ``Happiness`` on ``Habits_Count`` plus a constant.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with ``Happiness`` and ``Habits_Count`` columns.

    Returns
    -------
    The fitted statsmodels results object.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        fitted = sm.OLS(df['Happiness'], design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'

    print(heading)
    print(fitted.summary())
    return fitted
|
||||
|
||||
|
||||
def run_mixedlm(df):
    """Fit ``Happiness ~ Habits_Count`` with a random intercept per participant.

    The model is fitted by maximum likelihood (``reml=False``) and its
    summary is printed. This is best-effort: any exception raised while
    building, fitting or printing is logged as a warning and ``None`` is
    returned instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with ``Happiness``, ``Habits_Count`` and ``Participant_ID``.

    Returns
    -------
    The fitted statsmodels results object, or ``None`` on failure.
    """
    try:
        # Random intercept for Participant_ID
        mixed_model = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        fitted = mixed_model.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(fitted.summary())
    except Exception as exc:
        logging.warning('MixedLM failed: %s', exc)
        return None
    return fitted
|
||||
|
||||
|
||||
def make_plots(df, outdir, show_plots=False):
    """Render the exploratory figures and save them as PNG files in *outdir*.

    Figures written: a boxplot and a violin/strip plot of happiness by
    ``Habits_Count``, a sorted bar chart of per-participant mean happiness,
    a mean-happiness-by-group bar chart (only when ``Group`` exists), and a
    happiness-vs-habits scatter with a linear fit.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data with ``Happiness``, ``Habits_Count`` and
        ``Participant_ID`` columns.
    outdir : str or Path
        Output directory; created (with parents) if missing.
    show_plots : bool, default False
        Also display each figure with ``plt.show()`` before closing it.

    Returns
    -------
    None
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_style('whitegrid')

    has_group = 'Group' in df.columns

    def _finish(filename):
        # Save the current figure, optionally show it, then release it.
        plt.tight_layout()
        plt.savefig(outdir / filename)
        if show_plots:
            plt.show()
        plt.close()

    # Boxplot by Habits_Count
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=df, x='Habits_Count', y='Happiness', color='#4C72B0')
    plt.title('Daily Happiness by Number of Habits Completed')
    plt.xlabel('Number of habits followed (0–3)')
    plt.ylabel('Happiness (1–10)')
    _finish('happiness_by_habits_box.png')

    # Violin / jitter
    plt.figure(figsize=(9, 6))
    sns.violinplot(data=df, x='Habits_Count', y='Happiness', inner=None, color='#55A868')
    sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
    plt.title('Happiness distribution by Habits Completed')
    _finish('happiness_by_habits_violin.png')

    # Participant average bar
    participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
    plt.figure(figsize=(12, 5))
    sns.barplot(x=range(len(participant_avg)), y=participant_avg.values, color='#C44E52')
    plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
    # Tick labels are given as strings so any ID type renders as text.
    plt.xticks(range(len(participant_avg)), participant_avg.index.astype(str), rotation=45)
    plt.title('Average Happiness per Participant (sorted)')
    _finish('participant_avg_happiness.png')

    if has_group:
        plt.figure(figsize=(7, 5))
        sns.barplot(data=df, x='Group', y='Happiness', estimator='mean', errorbar='sd', color='#8172B2')
        plt.title('Mean Happiness by Group')
        plt.ylabel('Average happiness')
        _finish('happiness_by_group.png')

    # Scatter with linear fit
    plt.figure(figsize=(9, 6))
    if has_group:
        sns.scatterplot(data=df, x='Habits_Count', y='Happiness', hue='Group', alpha=0.35)
        # The title promises a linear fit, but the grouped scatter alone
        # draws none; overlay the regression line when x actually varies.
        if df['Habits_Count'].nunique() > 1:
            sns.regplot(data=df, x='Habits_Count', y='Happiness', scatter=False, color='black')
    else:
        sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
    plt.title('Happiness vs Number of Habits Completed (with linear fit)')
    _finish('happiness_vs_habits_regression.png')

    logging.info('Saved plots to %s', outdir)
|
||||
|
||||
|
||||
def main(args):
    """Run the full analysis: load, clean, describe, model and plot.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``data`` (CSV path), ``outdir`` (plot directory) and
        ``show`` (whether to display plots interactively).
    """
    data = prepare_data(load_data(args.data))

    descriptive_stats(data)

    happiness = data['Happiness']

    # Effect size: days with all three habits vs days with none
    none_done = happiness[data['Habits_Count'] == 0]
    all_done = happiness[data['Habits_Count'] == 3]
    if min(len(none_done), len(all_done)) > 1:
        effect_habits = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {effect_habits:.3f}")

    # Effect size: intervention vs control
    if 'Group' in data.columns:
        control_scores = happiness[data['Group'] == 'Control']
        intervention_scores = happiness[data['Group'] == 'Intervention']
        if min(len(control_scores), len(intervention_scores)) > 1:
            effect_group = cohen_d(intervention_scores, control_scores)
            print(f"Cohen's d (Intervention vs Control happiness) = {effect_group:.3f}")

    # Models
    run_ols(data)
    run_mixedlm(data)

    # Plots
    make_plots(data, args.outdir, show_plots=args.show)
|
||||
|
||||
|
||||
# Command-line entry point: parse the options and hand them to main().
if __name__ == '__main__':
    cli = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv'
    )
    cli.add_argument(
        '--data', type=str, default='organization_happiness_study_data.csv', help='CSV data path'
    )
    cli.add_argument('--outdir', type=str, default='plots', help='Directory to save plots')
    cli.add_argument('--show', action='store_true', help='Show plots interactively')
    main(cli.parse_args())
|
||||
253
dot_config/private_Code/User/History/7da6e0fb/Ldgu.py
Normal file
253
dot_config/private_Code/User/History/7da6e0fb/Ldgu.py
Normal file
|
|
@ -0,0 +1,253 @@
|
|||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.formula.api as smf
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
|
||||
|
||||
def load_data(path):
    """Read the CSV at *path* into a DataFrame and log how many rows it has.

    Parameters
    ----------
    path : str or path-like or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The parsed table, unmodified.
    """
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
|
||||
|
||||
|
||||
def prepare_data(df):
    """Validate and normalise the raw study data, returning a new DataFrame.

    The input frame is copied first, so the caller's DataFrame is never
    modified.

    Steps:
      * check that the required columns are present;
      * add a ``Group`` column ('Intervention') when absent and normalise its
        text to stripped title case;
      * map the three adherence columns from yes/no/true/false text
        (case-insensitive) to booleans; any other value becomes missing;
      * add ``Habits_Count``, the number of adherence columns that are True
        in each row (missing values count as not completed);
      * coerce ``Happiness`` to numeric and drop rows where that fails.

    Raises:
        KeyError: if any required column is missing.
    """
    required = {'Participant_ID', 'Happiness', 'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence'}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Work on a copy: every assignment below would otherwise leak into the
    # DataFrame the caller passed in.
    df = df.copy()
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_map = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_map)

    # Count habits per row; comparing against True treats missing values as
    # "not completed" without relying on fillna() downcasting object columns.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))

    return df
|
||||
|
||||
|
||||
def descriptive_stats(df):
    """Print descriptive summaries and simple correlations for the study data.

    Output covers the overall Happiness summary, per-group and per-habit-count
    aggregates, the Pearson correlation between ``Habits_Count`` and
    ``Happiness``, and a point-biserial correlation of each adherence column
    with Happiness (restricted to the Intervention group when a ``Group``
    column exists).

    A habit with fewer than two non-missing observations is reported as
    having no/insufficient data instead of being passed to
    ``stats.pointbiserialr``, which needs at least two points.
    """
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())

        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']:
        mask = ~habit_df[habit].isna()
        n_valid = int(mask.sum())
        if n_valid < 2:
            # A correlation is undefined for 0 or 1 observations.
            reason = '(no data)' if n_valid == 0 else '(insufficient data)'
            print(f'{habit:22} {reason}')
            continue
        r, p = stats.pointbiserialr(habit_df.loc[mask, habit].astype(int), habit_df.loc[mask, 'Happiness'])
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
|
||||
|
||||
|
||||
def cohen_d(x, y):
    """Return Cohen's d for two independent samples using the pooled SD.

    Args:
        x, y: Array-likes of numbers (lists, NumPy arrays, pandas Series).
            NaN entries are dropped before anything is computed, so the
            sample sizes always match the values the means and variances
            are based on.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd`` as a float. The result is NaN
        when either sample has fewer than two observations. When the pooled
        standard deviation is zero the result is NaN for equal means and
        +/-inf otherwise (plain IEEE division, without runtime warnings).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # The sample variance (ddof=1) is undefined for fewer than two points.
        return float('nan')

    dof = nx + ny - 2
    pooled_sd = np.sqrt(((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof)
    diff = x.mean() - y.mean()
    # Zero pooled variance is a legitimate degenerate input; silence the
    # divide warnings and let IEEE arithmetic produce nan/inf.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(diff / pooled_sd)
|
||||
|
||||
|
||||
def run_ols(df):
    """Fit an OLS model of Happiness on Habits_Count, print it and return it.

    When a ``Group`` column is present it is added as a categorical
    covariate through the formula interface; otherwise a plain
    constant-plus-slope model is fitted.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        response = df['Happiness']
        fitted = sm.OLS(response, design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'

    print(heading)
    print(fitted.summary())
    return fitted
|
||||
|
||||
|
||||
def run_mixedlm(df):
    """Fit a random-intercept mixed model of Happiness on Habits_Count.

    Observations are grouped by ``Participant_ID`` and the model is fitted by
    maximum likelihood (``reml=False``). The summary is printed and the fit
    result returned. Any failure is logged as a warning and ``None`` is
    returned instead, so the rest of the analysis can continue.
    """
    formula = 'Happiness ~ Habits_Count'
    try:
        participant_ids = df['Participant_ID']
        fitted = smf.mixedlm(formula, data=df, groups=participant_ids).fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(fitted.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return fitted
|
||||
|
||||
|
||||
def make_plots(df, outdir, show_plots=False):
    """Draw the seven study figures and save each as a PNG under *outdir*.

    Args:
        df: Prepared study data with ``Happiness``, ``Habits_Count``,
            ``Participant_ID`` and the three ``*_Adherence`` columns.
            ``Group`` and ``Day`` are optional; figures that need them are
            skipped when they are absent.
        outdir: Directory for the image files; created if it does not exist.
        show_plots: When true, each figure is also displayed with
            ``plt.show()`` before it is closed.
    """
    target_dir = Path(outdir)
    target_dir.mkdir(parents=True, exist_ok=True)
    sns.set_theme(style='whitegrid', context='talk')

    has_group = 'Group' in df.columns
    group_order = ['Control', 'Intervention']

    def decorate(title, xlabel, ylabel, ylim=(0, 10)):
        # Title, axis labels and y-range shared by every figure.
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.ylim(*ylim)

    def finish_plot(filename):
        # Save the current figure, optionally show it, then release it.
        plt.tight_layout()
        plt.savefig(target_dir / filename, dpi=200, bbox_inches='tight')
        if show_plots:
            plt.show()
        plt.close()

    if has_group:
        # 1) Mean happiness by group with error bars (normal-approximation 95% CI)
        group_stats = df.groupby('Group')['Happiness'].agg(['mean', 'std', 'count']).reindex(group_order)
        half_width = 1.96 * (group_stats['std'] / np.sqrt(group_stats['count']))
        plt.figure(figsize=(8, 6))
        plt.bar(group_stats.index, group_stats['mean'], color=['#7A7A7A', '#2A9D8F'], yerr=half_width, capsize=6)
        decorate('Average Happiness by Group', 'Study group', 'Mean happiness score')
        finish_plot('01_mean_happiness_by_group.png')

        # 2) Distribution of happiness by group
        plt.figure(figsize=(9, 6))
        sns.boxplot(data=df, x='Group', y='Happiness', order=group_order, color='#C9D1D9')
        sns.stripplot(data=df, x='Group', y='Happiness', order=group_order, color='black', alpha=0.18, jitter=0.22, size=2)
        decorate('Happiness Distribution by Group', 'Study group', 'Happiness score')
        finish_plot('02_happiness_distribution_by_group.png')

    # 3) Daily happiness trend by group
    if has_group and 'Day' in df.columns:
        per_day = df.groupby(['Group', 'Day'], as_index=False)['Happiness'].mean()
        plt.figure(figsize=(10, 6))
        sns.lineplot(data=per_day, x='Day', y='Happiness', hue='Group', hue_order=group_order, marker='o')
        decorate('Mean Daily Happiness Across the Study', 'Day of study', 'Average happiness')
        plt.xticks(range(1, 31, 2))
        finish_plot('03_daily_happiness_trend.png')

    # Figures 4-6 look at the intervention group only (or everything when
    # there is no Group column).
    if has_group:
        intervention_df = df[df['Group'] == 'Intervention']
    else:
        intervention_df = df

    # 4) Happiness by number of habits in intervention group only
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=intervention_df, x='Habits_Count', y='Happiness', color='#4C72B0')
    sns.stripplot(data=intervention_df, x='Habits_Count', y='Happiness', color='black', alpha=0.20, jitter=0.18, size=2)
    decorate('Intervention Group: Happiness by Number of Habits Completed', 'Habits completed that day', 'Happiness score')
    finish_plot('04_happiness_by_habits_intervention.png')

    # 5) Mean happiness by habits count in intervention group
    mean_by_habits = intervention_df.groupby('Habits_Count', as_index=False)['Happiness'].mean()
    plt.figure(figsize=(8, 6))
    sns.lineplot(data=mean_by_habits, x='Habits_Count', y='Happiness', marker='o', color='#1F77B4')
    decorate('Intervention Group: Mean Happiness vs Habits Completed', 'Number of habits completed', 'Mean happiness')
    plt.xticks([0, 1, 2, 3])
    finish_plot('05_mean_happiness_by_habits.png')

    # 6) Habit adherence rates in the intervention group
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    rates = intervention_df[habit_cols].mean().sort_values(ascending=False).reset_index()
    rates.columns = ['Habit', 'Rate']
    rates['Habit'] = rates['Habit'].str.replace('_Adherence', '', regex=False)
    plt.figure(figsize=(9, 6))
    sns.barplot(data=rates, x='Habit', y='Rate', color='#E76F51')
    decorate('Intervention Group: Habit Completion Rate', 'Habit', 'Proportion completed', ylim=(0, 1))
    # Rates are fractions of 1.0; render the tick labels as percentages.
    plt.gca().yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1.0))
    finish_plot('06_habit_completion_rate.png')

    # 7) Participant average happiness by group
    if has_group:
        plt.figure(figsize=(12, 6))
        per_participant = df.groupby(['Group', 'Participant_ID'], as_index=False)['Happiness'].mean()
        sns.boxplot(data=per_participant, x='Group', y='Happiness', order=group_order, color='#D6D6D6')
        sns.stripplot(data=per_participant, x='Group', y='Happiness', order=group_order, color='black', alpha=0.45, jitter=0.12, size=5)
        decorate('Average Happiness per Participant', 'Study group', 'Participant mean happiness')
        finish_plot('07_participant_average_happiness.png')

    logging.info('Saved plots to %s', target_dir)
|
||||
|
||||
|
||||
def main(args):
    """Run the whole pipeline: load, prepare, describe, model and plot.

    Args:
        args: Parsed command-line options providing ``data`` (CSV path),
            ``outdir`` (plot directory) and ``show`` (display plots).
    """
    df = prepare_data(load_data(args.data))

    descriptive_stats(df)

    # Effect sizes
    # NOTE(review): rows from every group are used here; prepare_data counts
    # missing adherence as "not completed", so rows without adherence data
    # would land in the 0-habits sample - confirm that is intended.
    happiness = df['Happiness']
    habits = df['Habits_Count']
    none_done = happiness[habits == 0]
    all_done = happiness[habits == 3]
    if min(len(none_done), len(all_done)) > 1:
        d = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    if 'Group' in df.columns:
        group = df['Group']
        control = happiness[group == 'Control']
        intervention = happiness[group == 'Intervention']
        if min(len(control), len(intervention)) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Models
    run_ols(df)
    run_mixedlm(df)

    # Plots
    make_plots(df, args.outdir, show_plots=args.show)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: collect the three options and hand them to main().
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv',
    )
    parser.add_argument(
        '--data',
        type=str,
        default='organization_happiness_study_data.csv',
        help='CSV data path',
    )
    parser.add_argument(
        '--outdir',
        type=str,
        default='plots',
        help='Directory to save plots',
    )
    parser.add_argument(
        '--show',
        action='store_true',
        help='Show plots interactively',
    )
    args = parser.parse_args()
    main(args)
|
||||
272
dot_config/private_Code/User/History/7da6e0fb/MtI5.py
Normal file
272
dot_config/private_Code/User/History/7da6e0fb/MtI5.py
Normal file
|
|
@ -0,0 +1,272 @@
|
|||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.formula.api as smf
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
|
||||
|
||||
def load_data(path):
    """Read the CSV at *path* into a DataFrame and log how many rows it has."""
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
|
||||
|
||||
|
||||
def prepare_data(df):
    """Validate and normalise the raw study data, returning a new DataFrame.

    The input frame is copied first, so the caller's DataFrame is never
    modified.

    Steps:
      * check that the required columns are present;
      * add a ``Group`` column ('Intervention') when absent and normalise its
        text to stripped title case;
      * map the three adherence columns from yes/no/true/false text
        (case-insensitive) to booleans; any other value becomes missing;
      * add ``Habits_Count``, the number of adherence columns that are True
        in each row (missing values count as not completed);
      * coerce ``Happiness`` to numeric and drop rows where that fails.

    Raises:
        KeyError: if any required column is missing.
    """
    required = {'Participant_ID', 'Happiness', 'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence'}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Work on a copy: every assignment below would otherwise leak into the
    # DataFrame the caller passed in.
    df = df.copy()
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_map = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_map)

    # Count habits per row; comparing against True treats missing values as
    # "not completed" without relying on fillna() downcasting object columns.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))

    return df
|
||||
|
||||
|
||||
def descriptive_stats(df):
    """Print descriptive summaries and simple correlations for the study data.

    Output covers the overall Happiness summary, per-group and per-habit-count
    aggregates, the Pearson correlation between ``Habits_Count`` and
    ``Happiness``, and a point-biserial correlation of each adherence column
    with Happiness (restricted to the Intervention group when a ``Group``
    column exists).

    A habit with fewer than two non-missing observations is reported as
    having no/insufficient data instead of being passed to
    ``stats.pointbiserialr``, which needs at least two points.
    """
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())

        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']:
        mask = ~habit_df[habit].isna()
        n_valid = int(mask.sum())
        if n_valid < 2:
            # A correlation is undefined for 0 or 1 observations.
            reason = '(no data)' if n_valid == 0 else '(insufficient data)'
            print(f'{habit:22} {reason}')
            continue
        r, p = stats.pointbiserialr(habit_df.loc[mask, habit].astype(int), habit_df.loc[mask, 'Happiness'])
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
|
||||
|
||||
|
||||
def cohen_d(x, y):
    """Return Cohen's d for two independent samples using the pooled SD.

    Args:
        x, y: Array-likes of numbers (lists, NumPy arrays, pandas Series).
            NaN entries are dropped before anything is computed, so the
            sample sizes always match the values the means and variances
            are based on.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd`` as a float. The result is NaN
        when either sample has fewer than two observations. When the pooled
        standard deviation is zero the result is NaN for equal means and
        +/-inf otherwise (plain IEEE division, without runtime warnings).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # The sample variance (ddof=1) is undefined for fewer than two points.
        return float('nan')

    dof = nx + ny - 2
    pooled_sd = np.sqrt(((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof)
    diff = x.mean() - y.mean()
    # Zero pooled variance is a legitimate degenerate input; silence the
    # divide warnings and let IEEE arithmetic produce nan/inf.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(diff / pooled_sd)
|
||||
|
||||
|
||||
def run_ols(df):
    """Fit an OLS model of Happiness on Habits_Count, print it and return it.

    When a ``Group`` column is present it is added as a categorical
    covariate through the formula interface; otherwise a plain
    constant-plus-slope model is fitted.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        response = df['Happiness']
        fitted = sm.OLS(response, design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'

    print(heading)
    print(fitted.summary())
    return fitted
|
||||
|
||||
|
||||
def run_mixedlm(df):
    """Fit a random-intercept mixed model of Happiness on Habits_Count.

    Observations are grouped by ``Participant_ID`` and the model is fitted by
    maximum likelihood (``reml=False``). The summary is printed and the fit
    result returned. Any failure is logged as a warning and ``None`` is
    returned instead, so the rest of the analysis can continue.
    """
    formula = 'Happiness ~ Habits_Count'
    try:
        participant_ids = df['Participant_ID']
        fitted = smf.mixedlm(formula, data=df, groups=participant_ids).fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(fitted.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return fitted
|
||||
|
||||
|
||||
def make_plots(df, outdir, show_plots=False):
    """Draw the seven study figures and save each as a PNG under *outdir*.

    Args:
        df: Prepared study data with ``Happiness``, ``Habits_Count``,
            ``Participant_ID`` and the three ``*_Adherence`` columns.
            ``Group`` and ``Day`` are optional; figures that need them are
            skipped when they are absent.
        outdir: Directory for the image files; created if it does not exist.
        show_plots: When true, each figure is also displayed with
            ``plt.show()`` before it is closed.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_theme(style='whitegrid', context='talk')

    has_group = 'Group' in df.columns
    order = ['Control', 'Intervention']

    def finish_plot(filename):
        # Save the current figure, optionally show it, then release it.
        plt.tight_layout()
        plt.savefig(outdir / filename, dpi=200, bbox_inches='tight')
        if show_plots:
            plt.show()
        plt.close()

    # 1) Mean happiness by group with error bars
    if has_group:
        plt.figure(figsize=(8, 6))
        sns.barplot(
            data=df,
            x='Group',
            y='Happiness',
            order=order,
            estimator='mean',
            errorbar=('ci', 95),
            palette=['#7A7A7A', '#2A9D8F'],
        )
        plt.title('Average Happiness by Group')
        plt.xlabel('Study group')
        plt.ylabel('Mean happiness score')
        plt.ylim(0, 10)
        finish_plot('01_mean_happiness_by_group.png')

    # 2) Distribution of happiness by group
    if has_group:
        plt.figure(figsize=(9, 6))
        sns.boxplot(data=df, x='Group', y='Happiness', order=order, palette=['#B0B0B0', '#73C6B6'])
        sns.stripplot(data=df, x='Group', y='Happiness', order=order, color='black', alpha=0.18, jitter=0.22, size=2)
        plt.title('Happiness Distribution by Group')
        plt.xlabel('Study group')
        plt.ylabel('Happiness score')
        plt.ylim(0, 10)
        finish_plot('02_happiness_distribution_by_group.png')

    # 3) Daily happiness trend by group
    if has_group and 'Day' in df.columns:
        daily = df.groupby(['Group', 'Day'], as_index=False)['Happiness'].mean()
        plt.figure(figsize=(10, 6))
        sns.lineplot(data=daily, x='Day', y='Happiness', hue='Group', hue_order=order, marker='o')
        plt.title('Mean Daily Happiness Across the Study')
        plt.xlabel('Day of study')
        plt.ylabel('Average happiness')
        plt.ylim(0, 10)
        plt.xticks(range(1, 31, 2))
        finish_plot('03_daily_happiness_trend.png')

    # 4) Happiness by number of habits in intervention group only
    intervention_df = df[df['Group'] == 'Intervention'] if has_group else df
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=intervention_df, x='Habits_Count', y='Happiness', color='#4C72B0')
    sns.stripplot(data=intervention_df, x='Habits_Count', y='Happiness', color='black', alpha=0.20, jitter=0.18, size=2)
    plt.title('Intervention Group: Happiness by Number of Habits Completed')
    plt.xlabel('Habits completed that day')
    plt.ylabel('Happiness score')
    plt.ylim(0, 10)
    finish_plot('04_happiness_by_habits_intervention.png')

    # 5) Mean happiness by habits count in intervention group
    habits_mean = intervention_df.groupby('Habits_Count', as_index=False)['Happiness'].mean()
    plt.figure(figsize=(8, 6))
    sns.lineplot(data=habits_mean, x='Habits_Count', y='Happiness', marker='o', color='#1F77B4')
    plt.title('Intervention Group: Mean Happiness vs Habits Completed')
    plt.xlabel('Number of habits completed')
    plt.ylabel('Mean happiness')
    plt.xticks([0, 1, 2, 3])
    plt.ylim(0, 10)
    finish_plot('05_mean_happiness_by_habits.png')

    # 6) Habit adherence rates in the intervention group
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    adherence_rates = intervention_df[habit_cols].mean().sort_values(ascending=False).reset_index()
    # Name both columns positionally so the plot below can refer to them
    # without depending on the default labels reset_index() produces.
    adherence_rates.columns = ['Habit', 'Rate']
    adherence_rates['Habit'] = adherence_rates['Habit'].str.replace('_Adherence', '', regex=False)
    plt.figure(figsize=(9, 6))
    # Plot the 'Rate' column; the previous y=0 pointed at a column label that
    # no longer existed after the rename, so the rates were never drawn.
    sns.barplot(data=adherence_rates, x='Habit', y='Rate', color='#E76F51')
    plt.title('Intervention Group: Habit Completion Rate')
    plt.xlabel('Habit')
    plt.ylabel('Proportion completed')
    plt.ylim(0, 1)
    # Rates are fractions of 1.0; render the tick labels as percentages.
    plt.gca().yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1.0))
    finish_plot('06_habit_completion_rate.png')

    # 7) Participant averages, grouped by study group
    if has_group:
        participant_avg = df.groupby(['Group', 'Participant_ID'])['Happiness'].mean().reset_index()
        plt.figure(figsize=(12, 6))
        sns.barplot(
            data=participant_avg,
            x='Participant_ID',
            y='Happiness',
            hue='Group',
            dodge=True,
            palette=['#7A7A7A', '#2A9D8F'],
        )
        plt.title('Average Happiness per Participant')
        plt.xlabel('Participant ID')
        plt.ylabel('Mean happiness')
        plt.ylim(0, 10)
        plt.xticks(rotation=45)
        finish_plot('07_participant_average_happiness.png')

    logging.info('Saved plots to %s', outdir)
|
||||
|
||||
|
||||
def main(args):
    """Run the whole pipeline: load, prepare, describe, model and plot.

    Args:
        args: Parsed command-line options providing ``data`` (CSV path),
            ``outdir`` (plot directory) and ``show`` (display plots).
    """
    df = prepare_data(load_data(args.data))

    descriptive_stats(df)

    # Effect sizes
    # NOTE(review): rows from every group are used here; prepare_data counts
    # missing adherence as "not completed", so rows without adherence data
    # would land in the 0-habits sample - confirm that is intended.
    happiness = df['Happiness']
    habits = df['Habits_Count']
    none_done = happiness[habits == 0]
    all_done = happiness[habits == 3]
    if min(len(none_done), len(all_done)) > 1:
        d = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    if 'Group' in df.columns:
        group = df['Group']
        control = happiness[group == 'Control']
        intervention = happiness[group == 'Intervention']
        if min(len(control), len(intervention)) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Models
    run_ols(df)
    run_mixedlm(df)

    # Plots
    make_plots(df, args.outdir, show_plots=args.show)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: collect the three options and hand them to main().
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv',
    )
    parser.add_argument(
        '--data',
        type=str,
        default='organization_happiness_study_data.csv',
        help='CSV data path',
    )
    parser.add_argument(
        '--outdir',
        type=str,
        default='plots',
        help='Directory to save plots',
    )
    parser.add_argument(
        '--show',
        action='store_true',
        help='Show plots interactively',
    )
    args = parser.parse_args()
    main(args)
|
||||
270
dot_config/private_Code/User/History/7da6e0fb/NtsI.py
Normal file
270
dot_config/private_Code/User/History/7da6e0fb/NtsI.py
Normal file
|
|
@ -0,0 +1,270 @@
|
|||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.formula.api as smf
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
|
||||
|
||||
def load_data(path):
    """Read the CSV at *path* into a DataFrame and log how many rows it has."""
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
|
||||
|
||||
|
||||
def prepare_data(df):
    """Validate and normalise the raw study data, returning a new DataFrame.

    The input frame is copied first, so the caller's DataFrame is never
    modified.

    Steps:
      * check that the required columns are present;
      * add a ``Group`` column ('Intervention') when absent and normalise its
        text to stripped title case;
      * map the three adherence columns from yes/no/true/false text
        (case-insensitive) to booleans; any other value becomes missing;
      * add ``Habits_Count``, the number of adherence columns that are True
        in each row (missing values count as not completed);
      * coerce ``Happiness`` to numeric and drop rows where that fails.

    Raises:
        KeyError: if any required column is missing.
    """
    required = {'Participant_ID', 'Happiness', 'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence'}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Work on a copy: every assignment below would otherwise leak into the
    # DataFrame the caller passed in.
    df = df.copy()
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_map = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_map)

    # Count habits per row; comparing against True treats missing values as
    # "not completed" without relying on fillna() downcasting object columns.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))

    return df
|
||||
|
||||
|
||||
def descriptive_stats(df):
    """Print descriptive summaries and simple correlations for the study data.

    Output covers the overall Happiness summary, per-group and per-habit-count
    aggregates, the Pearson correlation between ``Habits_Count`` and
    ``Happiness``, and a point-biserial correlation of each adherence column
    with Happiness (restricted to the Intervention group when a ``Group``
    column exists).

    A habit with fewer than two non-missing observations is reported as
    having no/insufficient data instead of being passed to
    ``stats.pointbiserialr``, which needs at least two points.
    """
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())

        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']:
        mask = ~habit_df[habit].isna()
        n_valid = int(mask.sum())
        if n_valid < 2:
            # A correlation is undefined for 0 or 1 observations.
            reason = '(no data)' if n_valid == 0 else '(insufficient data)'
            print(f'{habit:22} {reason}')
            continue
        r, p = stats.pointbiserialr(habit_df.loc[mask, habit].astype(int), habit_df.loc[mask, 'Happiness'])
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
|
||||
|
||||
|
||||
def cohen_d(x, y):
    """Return Cohen's d for two independent samples using the pooled SD.

    Args:
        x, y: Array-likes of numbers (lists, NumPy arrays, pandas Series).
            NaN entries are dropped before anything is computed, so the
            sample sizes always match the values the means and variances
            are based on.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd`` as a float. The result is NaN
        when either sample has fewer than two observations. When the pooled
        standard deviation is zero the result is NaN for equal means and
        +/-inf otherwise (plain IEEE division, without runtime warnings).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # The sample variance (ddof=1) is undefined for fewer than two points.
        return float('nan')

    dof = nx + ny - 2
    pooled_sd = np.sqrt(((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof)
    diff = x.mean() - y.mean()
    # Zero pooled variance is a legitimate degenerate input; silence the
    # divide warnings and let IEEE arithmetic produce nan/inf.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(diff / pooled_sd)
|
||||
|
||||
|
||||
def run_ols(df):
    """Fit an OLS model of Happiness on Habits_Count, print it and return it.

    When a ``Group`` column is present it is added as a categorical
    covariate through the formula interface; otherwise a plain
    constant-plus-slope model is fitted.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        response = df['Happiness']
        fitted = sm.OLS(response, design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'

    print(heading)
    print(fitted.summary())
    return fitted
|
||||
|
||||
|
||||
def run_mixedlm(df):
    """Fit a random-intercept mixed model of Happiness on Habits_Count.

    Observations are grouped by ``Participant_ID`` and the model is fitted by
    maximum likelihood (``reml=False``). The summary is printed and the fit
    result returned. Any failure is logged as a warning and ``None`` is
    returned instead, so the rest of the analysis can continue.
    """
    formula = 'Happiness ~ Habits_Count'
    try:
        participant_ids = df['Participant_ID']
        fitted = smf.mixedlm(formula, data=df, groups=participant_ids).fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(fitted.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return fitted
|
||||
|
||||
|
||||
def make_plots(df, outdir, show_plots=False):
    """Draw the seven study figures and save each as a PNG under *outdir*.

    Args:
        df: Prepared study data with ``Happiness``, ``Habits_Count``,
            ``Participant_ID`` and the three ``*_Adherence`` columns.
            ``Group`` and ``Day`` are optional; figures that need them are
            skipped when they are absent.
        outdir: Directory for the image files; created if it does not exist.
        show_plots: When true, each figure is also displayed with
            ``plt.show()`` before it is closed.
    """
    target_dir = Path(outdir)
    target_dir.mkdir(parents=True, exist_ok=True)
    sns.set_theme(style='whitegrid', context='talk')

    has_group = 'Group' in df.columns
    group_order = ['Control', 'Intervention']

    def decorate(title, xlabel, ylabel, ylim=(0, 10)):
        # Title, axis labels and y-range shared by every figure.
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.ylim(*ylim)

    def finish_plot(filename):
        # Save the current figure, optionally show it, then release it.
        plt.tight_layout()
        plt.savefig(target_dir / filename, dpi=200, bbox_inches='tight')
        if show_plots:
            plt.show()
        plt.close()

    def box_with_jitter(frame, face, alpha, size):
        # One box per study group on the current axes, overlaid with the raw
        # points spread horizontally by random normal jitter.
        samples = [frame.loc[frame['Group'] == name, 'Happiness'].values for name in group_order]
        plt.boxplot(samples, labels=group_order, patch_artist=True,
                    boxprops=dict(facecolor=face, color='#4C4C4C'),
                    medianprops=dict(color='#2A9D8F', linewidth=2),
                    whiskerprops=dict(color='#4C4C4C'), capprops=dict(color='#4C4C4C'))
        # plt.boxplot places the boxes at x = 1, 2, ...
        for position, values in enumerate(samples, start=1):
            jittered = np.random.normal(position, 0.06, size=len(values))
            plt.scatter(jittered, values, color='black', alpha=alpha, s=size)

    if has_group:
        # 1) Mean happiness by group with error bars (normal-approximation 95% CI)
        group_stats = df.groupby('Group')['Happiness'].agg(['mean', 'std', 'count']).reindex(group_order)
        half_width = 1.96 * (group_stats['std'] / np.sqrt(group_stats['count']))
        plt.figure(figsize=(8, 6))
        bar_positions = np.arange(len(group_stats))
        plt.bar(bar_positions, group_stats['mean'].values, color=['#7A7A7A', '#2A9D8F'], yerr=half_width.values, capsize=6)
        plt.xticks(bar_positions, group_stats.index)
        decorate('Average Happiness by Group', 'Study group', 'Mean happiness score')
        finish_plot('01_mean_happiness_by_group.png')

        # 2) Distribution of happiness by group
        plt.figure(figsize=(9, 6))
        box_with_jitter(df, face='#C9D1D9', alpha=0.15, size=10)
        decorate('Happiness Distribution by Group', 'Study group', 'Happiness score')
        finish_plot('02_happiness_distribution_by_group.png')

    # 3) Daily happiness trend by group
    if has_group and 'Day' in df.columns:
        per_day = df.groupby(['Group', 'Day'], as_index=False)['Happiness'].mean()
        plt.figure(figsize=(10, 6))
        sns.lineplot(data=per_day, x='Day', y='Happiness', hue='Group', hue_order=group_order, marker='o')
        decorate('Mean Daily Happiness Across the Study', 'Day of study', 'Average happiness')
        plt.xticks(range(1, 31, 2))
        finish_plot('03_daily_happiness_trend.png')

    # Figures 4-6 look at the intervention group only (or everything when
    # there is no Group column).
    if has_group:
        intervention_df = df[df['Group'] == 'Intervention']
    else:
        intervention_df = df

    # 4) Happiness by number of habits in intervention group only
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=intervention_df, x='Habits_Count', y='Happiness', color='#4C72B0')
    sns.stripplot(data=intervention_df, x='Habits_Count', y='Happiness', color='black', alpha=0.20, jitter=0.18, size=2)
    decorate('Intervention Group: Happiness by Number of Habits Completed', 'Habits completed that day', 'Happiness score')
    finish_plot('04_happiness_by_habits_intervention.png')

    # 5) Mean happiness by habits count in intervention group
    mean_by_habits = intervention_df.groupby('Habits_Count', as_index=False)['Happiness'].mean()
    plt.figure(figsize=(8, 6))
    sns.lineplot(data=mean_by_habits, x='Habits_Count', y='Happiness', marker='o', color='#1F77B4')
    decorate('Intervention Group: Mean Happiness vs Habits Completed', 'Number of habits completed', 'Mean happiness')
    plt.xticks([0, 1, 2, 3])
    finish_plot('05_mean_happiness_by_habits.png')

    # 6) Habit adherence rates in the intervention group
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    rates = intervention_df[habit_cols].mean().sort_values(ascending=False).reset_index()
    rates.columns = ['Habit', 'Rate']
    rates['Habit'] = rates['Habit'].str.replace('_Adherence', '', regex=False)
    plt.figure(figsize=(9, 6))
    sns.barplot(data=rates, x='Habit', y='Rate', color='#E76F51')
    decorate('Intervention Group: Habit Completion Rate', 'Habit', 'Proportion completed', ylim=(0, 1))
    # Rates are fractions of 1.0; render the tick labels as percentages.
    plt.gca().yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1.0))
    finish_plot('06_habit_completion_rate.png')

    # 7) Participant average happiness by group
    if has_group:
        plt.figure(figsize=(12, 6))
        per_participant = df.groupby(['Group', 'Participant_ID'], as_index=False)['Happiness'].mean()
        box_with_jitter(per_participant, face='#D6D6D6', alpha=0.45, size=22)
        decorate('Average Happiness per Participant', 'Study group', 'Participant mean happiness')
        finish_plot('07_participant_average_happiness.png')

    logging.info('Saved plots to %s', target_dir)
|
||||
|
||||
|
||||
def main(args):
    """Run the whole analysis for the parsed command-line arguments.

    Reads ``args.data``, ``args.outdir`` and ``args.show``. Prints the
    descriptive statistics, effect sizes and model summaries, then writes
    the figures through ``make_plots``.
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    happiness = df['Happiness']

    # Effect size between the two ends of the habit scale
    habits = df['Habits_Count']
    none_done = happiness[habits == 0]
    all_done = happiness[habits == 3]
    if min(len(none_done), len(all_done)) > 1:
        d = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    # Effect size between the two study arms
    if 'Group' in df.columns:
        arm = df['Group']
        control = happiness[arm == 'Control']
        intervention = happiness[arm == 'Intervention']
        if min(len(control), len(intervention)) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Regression models
    run_ols(df)
    run_mixedlm(df)

    # Figures
    make_plots(df, args.outdir, show_plots=args.show)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: declare the CLI options, parse argv, run the analysis.
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv'
    )
    option_specs = (
        ('--data', {'type': str, 'default': 'organization_happiness_study_data.csv', 'help': 'CSV data path'}),
        ('--outdir', {'type': str, 'default': 'plots', 'help': 'Directory to save plots'}),
        ('--show', {'action': 'store_true', 'help': 'Show plots interactively'}),
    )
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    args = parser.parse_args()
    main(args)
|
||||
189
dot_config/private_Code/User/History/7da6e0fb/SA9R.py
Normal file
189
dot_config/private_Code/User/History/7da6e0fb/SA9R.py
Normal file
|
|
@ -0,0 +1,189 @@
|
|||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.formula.api as smf
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
|
||||
|
||||
def load_data(path):
    """Read the CSV at *path* into a DataFrame and log how many rows it has."""
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
|
||||
|
||||
|
||||
def prepare_data(df):
    """Return a cleaned copy of the raw study data.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      * check that the required columns are present,
      * map the three adherence columns from yes/no/true/false text
        (case- and whitespace-insensitive) to booleans, anything else
        becoming missing,
      * add ``Habits_Count``, the number of habits marked True per row
        (missing answers count as not done),
      * coerce ``Happiness`` to numeric and drop rows where that fails.

    Raises:
        KeyError: if any required column is absent.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Copy first: otherwise the caller's frame would gain columns while the
    # row filtering below only applies to the returned frame.
    df = df.copy()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_table = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_table)

    # Count habits per row. Comparing against True treats missing answers as
    # "not done" without relying on fillna() downcasting an object column.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))

    return df
|
||||
|
||||
|
||||
def descriptive_stats(df):
    """Print summary tables and habit/happiness correlations to stdout.

    Uses the ``Happiness``, ``Habits_Count`` and three ``*_Adherence``
    columns of *df*. Returns nothing.
    """
    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    # Happiness broken down by how many habits were completed
    by_habits = df.groupby('Habits_Count')['Happiness']
    print('\nAverage happiness by number of habits completed:')
    print(by_habits.agg(['mean', 'count', 'std']).round(3))

    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness):')
    for habit in ('Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence'):
        # Only rows with a recorded answer for this habit take part
        answered = df.loc[df[habit].notna()]
        if answered.empty:
            print(f'{habit:22} (no data)')
            continue
        r, p = stats.pointbiserialr(answered[habit].astype(int), answered['Happiness'])
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
|
||||
|
||||
|
||||
def cohen_d(x, y):
    """Return Cohen's d for two independent samples.

    The difference of the sample means (x minus y) is divided by the pooled
    standard deviation, built from the ddof=1 standard deviations of each
    sample. Both arguments must provide ``len``, ``.mean()`` and
    ``.std(ddof=1)``.
    """
    n_x = len(x)
    n_y = len(y)
    weighted_x = (n_x - 1) * x.std(ddof=1) ** 2
    weighted_y = (n_y - 1) * y.std(ddof=1) ** 2
    pooled_sd = np.sqrt((weighted_x + weighted_y) / (n_x + n_y - 2))
    mean_gap = x.mean() - y.mean()
    return mean_gap / pooled_sd
|
||||
|
||||
|
||||
def run_ols(df):
    """Fit and print an OLS regression of Happiness on Habits_Count.

    An intercept column is added to the single predictor. Returns the
    fitted statsmodels result.
    """
    design = sm.add_constant(df['Habits_Count'])
    fitted = sm.OLS(df['Happiness'], design).fit()
    print('\nSimple OLS regression: Happiness ~ Habits_Count')
    print(fitted.summary())
    return fitted
|
||||
|
||||
|
||||
def run_mixedlm(df):
    """Fit and print a mixed model grouped by Participant_ID.

    The formula is ``Happiness ~ Habits_Count`` and the fit is called with
    ``reml=False``. Any exception raised while building, fitting or
    printing is logged as a warning and None is returned instead of the
    fitted result.
    """
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        result = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(result.summary())
    except Exception as exc:
        # Best effort: a failed mixed model must not stop the rest of the run
        logging.warning('MixedLM failed: %s', exc)
        return None
    return result
|
||||
|
||||
|
||||
def make_plots(df, outdir, show_plots=False):
    """Write four PNG figures describing happiness versus habits.

    *outdir* is created if needed. When *show_plots* is true each figure is
    also displayed before it is closed.
    """
    target = Path(outdir)
    target.mkdir(parents=True, exist_ok=True)
    sns.set_style('whitegrid')

    def _emit(filename):
        # Common epilogue for every figure: lay out, save, maybe show, close
        plt.tight_layout()
        plt.savefig(target / filename)
        if show_plots:
            plt.show()
        plt.close()

    # Boxplot by Habits_Count
    plt.figure(figsize=(9, 6))
    sns.boxplot(x='Habits_Count', y='Happiness', data=df, palette='viridis')
    plt.title('Daily Happiness by Number of Habits Completed')
    plt.xlabel('Number of habits followed (0–3)')
    plt.ylabel('Happiness (1–10)')
    _emit('happiness_by_habits_box.png')

    # Violin with jittered raw points on top
    plt.figure(figsize=(9, 6))
    sns.violinplot(x='Habits_Count', y='Happiness', data=df, inner=None, palette='muted')
    sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
    plt.title('Happiness distribution by Habits Completed')
    _emit('happiness_by_habits_violin.png')

    # Per-participant mean, sorted ascending, with the overall mean as a dashed line
    per_person = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
    plt.figure(figsize=(12, 5))
    sns.barplot(x=per_person.index.astype(str), y=per_person.values, palette='coolwarm')
    plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
    plt.xticks(rotation=45)
    plt.title('Average Happiness per Participant (sorted)')
    _emit('participant_avg_happiness.png')

    # Scatter with linear fit
    plt.figure(figsize=(9, 6))
    sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
    plt.title('Happiness vs Number of Habits Completed (with linear fit)')
    _emit('happiness_vs_habits_regression.png')

    logging.info('Saved plots to %s', target)
|
||||
|
||||
|
||||
def main(args):
    """Run the whole analysis for the parsed command-line arguments.

    Reads ``args.data``, ``args.outdir`` and ``args.show``. Prints the
    descriptive statistics, one effect size and the model summaries, then
    writes the figures through ``make_plots``.
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    # Effect size example: compare 0 vs 3
    happiness = df['Happiness']
    habits = df['Habits_Count']
    none_done = happiness[habits == 0]
    all_done = happiness[habits == 3]
    if min(len(none_done), len(all_done)) > 1:
        d = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    # Regression models
    run_ols(df)
    run_mixedlm(df)

    # Figures
    make_plots(df, args.outdir, show_plots=args.show)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: declare the CLI options, parse argv, run the analysis.
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv'
    )
    option_specs = (
        ('--data', {'type': str, 'default': 'organization_happiness_study_data.csv', 'help': 'CSV data path'}),
        ('--outdir', {'type': str, 'default': 'plots', 'help': 'Directory to save plots'}),
        ('--show', {'action': 'store_true', 'help': 'Show plots interactively'}),
    )
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    args = parser.parse_args()
    main(args)
|
||||
231
dot_config/private_Code/User/History/7da6e0fb/bwYb.py
Normal file
231
dot_config/private_Code/User/History/7da6e0fb/bwYb.py
Normal file
|
|
@ -0,0 +1,231 @@
|
|||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.formula.api as smf
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
|
||||
|
||||
def load_data(path):
    """Read the CSV at *path* into a DataFrame and log how many rows it has."""
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
|
||||
|
||||
|
||||
def prepare_data(df):
    """Return a cleaned copy of the raw study data.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      * check that the required columns are present,
      * add a ``Group`` column set to 'Intervention' when none exists, then
        strip and title-case the group labels,
      * map the three adherence columns from yes/no/true/false text
        (case- and whitespace-insensitive) to booleans, anything else
        becoming missing,
      * add ``Habits_Count``, the number of habits marked True per row
        (missing answers count as not done),
      * coerce ``Happiness`` to numeric and drop rows where that fails.

    Raises:
        KeyError: if any required column is absent.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Copy first: otherwise the caller's frame would gain columns while the
    # row filtering below only applies to the returned frame.
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_table = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_table)

    # Count habits per row. Comparing against True treats missing answers as
    # "not done" without relying on fillna() downcasting an object column.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))

    return df
|
||||
|
||||
|
||||
def descriptive_stats(df):
    """Print summary tables and habit/happiness correlations to stdout.

    Group tables are printed only when *df* has a ``Group`` column. The
    per-habit point-biserial correlations use only rows whose group is
    'Intervention' when that column exists, otherwise every row.
    Returns nothing.
    """
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())

        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    # Happiness broken down by how many habits were completed
    by_habits = df.groupby('Habits_Count')['Happiness']
    print('\nAverage happiness by number of habits completed:')
    print(by_habits.agg(['mean', 'count', 'std']).round(3))

    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df.loc[df['Group'] == 'Intervention'] if has_group else df
    for habit in ('Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence'):
        # Only rows with a recorded answer for this habit take part
        answered = habit_df.loc[habit_df[habit].notna()]
        if answered.empty:
            print(f'{habit:22} (no data)')
            continue
        r, p = stats.pointbiserialr(answered[habit].astype(int), answered['Happiness'])
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
|
||||
|
||||
|
||||
def cohen_d(x, y):
    """Return Cohen's d for two independent samples.

    The difference of the sample means (x minus y) is divided by the pooled
    standard deviation, built from the ddof=1 standard deviations of each
    sample. Both arguments must provide ``len``, ``.mean()`` and
    ``.std(ddof=1)``.
    """
    n_x = len(x)
    n_y = len(y)
    weighted_x = (n_x - 1) * x.std(ddof=1) ** 2
    weighted_y = (n_y - 1) * y.std(ddof=1) ** 2
    pooled_sd = np.sqrt((weighted_x + weighted_y) / (n_x + n_y - 2))
    mean_gap = x.mean() - y.mean()
    return mean_gap / pooled_sd
|
||||
|
||||
|
||||
def run_ols(df):
    """Fit and print an OLS regression of Happiness.

    With a ``Group`` column the formula model
    ``Happiness ~ Habits_Count + C(Group)`` is used; without it, Happiness
    is regressed on Habits_Count plus an added intercept column. Returns
    the fitted statsmodels result.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        fitted = sm.OLS(df['Happiness'], design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    print(heading)
    print(fitted.summary())
    return fitted
|
||||
|
||||
|
||||
def run_mixedlm(df):
    """Fit and print a mixed model grouped by Participant_ID.

    The formula is ``Happiness ~ Habits_Count`` and the fit is called with
    ``reml=False``. Any exception raised while building, fitting or
    printing is logged as a warning and None is returned instead of the
    fitted result.
    """
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        result = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(result.summary())
    except Exception as exc:
        # Best effort: a failed mixed model must not stop the rest of the run
        logging.warning('MixedLM failed: %s', exc)
        return None
    return result
|
||||
|
||||
|
||||
def make_plots(df, outdir, show_plots=False):
    """Write the study figures as PNG files into *outdir*.

    Figures produced:
      * happiness_by_habits_box.png - boxplot of Happiness per Habits_Count
      * happiness_by_habits_violin.png - violin plus jittered points
      * participant_avg_happiness.png - sorted per-participant means
      * happiness_by_group.png - mean Happiness per Group (only when a
        ``Group`` column exists)
      * happiness_vs_habits_regression.png - scatter with a linear fit

    Args:
        df: prepared data with Happiness, Habits_Count, Participant_ID and
            optionally Group columns.
        outdir: directory for the PNG files; created if missing.
        show_plots: when true, each figure is displayed before it is closed.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_style('whitegrid')
    has_group = 'Group' in df.columns

    def _finish(filename):
        # Common epilogue for every figure: lay out, save, maybe show, close
        plt.tight_layout()
        plt.savefig(outdir / filename)
        if show_plots:
            plt.show()
        plt.close()

    # Boxplot by Habits_Count (hue mirrors x only to apply the palette)
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=df, x='Habits_Count', y='Happiness', hue='Habits_Count', palette='viridis', dodge=False)
    plt.legend([], [], frameon=False)
    plt.title('Daily Happiness by Number of Habits Completed')
    plt.xlabel('Number of habits followed (0–3)')
    plt.ylabel('Happiness (1–10)')
    _finish('happiness_by_habits_box.png')

    # Violin with jittered raw points on top
    plt.figure(figsize=(9, 6))
    sns.violinplot(data=df, x='Habits_Count', y='Happiness', hue='Habits_Count', inner=None, palette='muted', dodge=False)
    plt.legend([], [], frameon=False)
    sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
    plt.title('Happiness distribution by Habits Completed')
    _finish('happiness_by_habits_violin.png')

    # Per-participant mean, sorted ascending, with the overall mean as a dashed line
    participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
    positions = range(len(participant_avg))
    plt.figure(figsize=(12, 5))
    sns.barplot(x=positions, y=participant_avg.values, hue=positions, palette='coolwarm', dodge=False)
    plt.legend([], [], frameon=False)
    plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
    plt.xticks(positions, participant_avg.index.astype(str), rotation=45)
    plt.title('Average Happiness per Participant (sorted)')
    _finish('participant_avg_happiness.png')

    # Mean happiness per study group with standard-deviation error bars
    if has_group:
        plt.figure(figsize=(7, 5))
        sns.barplot(data=df, x='Group', y='Happiness', hue='Group', estimator='mean', errorbar='sd', palette='Set2', dodge=False)
        plt.legend([], [], frameon=False)
        plt.title('Mean Happiness by Group')
        plt.ylabel('Average happiness')
        _finish('happiness_by_group.png')

    # Scatter with linear fit
    plt.figure(figsize=(9, 6))
    if has_group:
        sns.scatterplot(data=df, x='Habits_Count', y='Happiness', hue='Group', alpha=0.35)
        # The title promises a linear fit, so overlay the pooled regression
        # line; scatter=False avoids drawing the points a second time.
        sns.regplot(data=df, x='Habits_Count', y='Happiness', scatter=False, color='black')
    else:
        sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
    plt.title('Happiness vs Number of Habits Completed (with linear fit)')
    _finish('happiness_vs_habits_regression.png')

    logging.info('Saved plots to %s', outdir)
|
||||
|
||||
|
||||
def main(args):
    """Run the whole analysis for the parsed command-line arguments.

    Reads ``args.data``, ``args.outdir`` and ``args.show``. Prints the
    descriptive statistics, effect sizes and model summaries, then writes
    the figures through ``make_plots``.
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    happiness = df['Happiness']

    # Effect size between the two ends of the habit scale
    habits = df['Habits_Count']
    none_done = happiness[habits == 0]
    all_done = happiness[habits == 3]
    if min(len(none_done), len(all_done)) > 1:
        d = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    # Effect size between the two study arms
    if 'Group' in df.columns:
        arm = df['Group']
        control = happiness[arm == 'Control']
        intervention = happiness[arm == 'Intervention']
        if min(len(control), len(intervention)) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Regression models
    run_ols(df)
    run_mixedlm(df)

    # Figures
    make_plots(df, args.outdir, show_plots=args.show)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: declare the CLI options, parse argv, run the analysis.
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv'
    )
    option_specs = (
        ('--data', {'type': str, 'default': 'organization_happiness_study_data.csv', 'help': 'CSV data path'}),
        ('--outdir', {'type': str, 'default': 'plots', 'help': 'Directory to save plots'}),
        ('--show', {'action': 'store_true', 'help': 'Show plots interactively'}),
    )
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    args = parser.parse_args()
    main(args)
|
||||
270
dot_config/private_Code/User/History/7da6e0fb/enQE.py
Normal file
270
dot_config/private_Code/User/History/7da6e0fb/enQE.py
Normal file
|
|
@ -0,0 +1,270 @@
|
|||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.formula.api as smf
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
|
||||
|
||||
def load_data(path):
    """Read the CSV at *path* into a DataFrame and log how many rows it has."""
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
|
||||
|
||||
|
||||
def prepare_data(df):
    """Return a cleaned copy of the raw study data.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      * check that the required columns are present,
      * add a ``Group`` column set to 'Intervention' when none exists, then
        strip and title-case the group labels,
      * map the three adherence columns from yes/no/true/false text
        (case- and whitespace-insensitive) to booleans, anything else
        becoming missing,
      * add ``Habits_Count``, the number of habits marked True per row
        (missing answers count as not done),
      * coerce ``Happiness`` to numeric and drop rows where that fails.

    Raises:
        KeyError: if any required column is absent.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Copy first: otherwise the caller's frame would gain columns while the
    # row filtering below only applies to the returned frame.
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_table = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_table)

    # Count habits per row. Comparing against True treats missing answers as
    # "not done" without relying on fillna() downcasting an object column.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))

    return df
|
||||
|
||||
|
||||
def descriptive_stats(df):
    """Print summary tables and habit/happiness correlations to stdout.

    Group tables are printed only when *df* has a ``Group`` column. The
    per-habit point-biserial correlations use only rows whose group is
    'Intervention' when that column exists, otherwise every row.
    Returns nothing.
    """
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())

        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    # Happiness broken down by how many habits were completed
    by_habits = df.groupby('Habits_Count')['Happiness']
    print('\nAverage happiness by number of habits completed:')
    print(by_habits.agg(['mean', 'count', 'std']).round(3))

    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df.loc[df['Group'] == 'Intervention'] if has_group else df
    for habit in ('Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence'):
        # Only rows with a recorded answer for this habit take part
        answered = habit_df.loc[habit_df[habit].notna()]
        if answered.empty:
            print(f'{habit:22} (no data)')
            continue
        r, p = stats.pointbiserialr(answered[habit].astype(int), answered['Happiness'])
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
|
||||
|
||||
|
||||
def cohen_d(x, y):
    """Return Cohen's d for two independent samples.

    The difference of the sample means (x minus y) is divided by the pooled
    standard deviation, built from the ddof=1 standard deviations of each
    sample. Both arguments must provide ``len``, ``.mean()`` and
    ``.std(ddof=1)``.
    """
    n_x = len(x)
    n_y = len(y)
    weighted_x = (n_x - 1) * x.std(ddof=1) ** 2
    weighted_y = (n_y - 1) * y.std(ddof=1) ** 2
    pooled_sd = np.sqrt((weighted_x + weighted_y) / (n_x + n_y - 2))
    mean_gap = x.mean() - y.mean()
    return mean_gap / pooled_sd
|
||||
|
||||
|
||||
def run_ols(df):
    """Fit and print an OLS regression of Happiness.

    With a ``Group`` column the formula model
    ``Happiness ~ Habits_Count + C(Group)`` is used; without it, Happiness
    is regressed on Habits_Count plus an added intercept column. Returns
    the fitted statsmodels result.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        fitted = sm.OLS(df['Happiness'], design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    print(heading)
    print(fitted.summary())
    return fitted
|
||||
|
||||
|
||||
def run_mixedlm(df):
    """Fit and print a mixed model grouped by Participant_ID.

    The formula is ``Happiness ~ Habits_Count`` and the fit is called with
    ``reml=False``. Any exception raised while building, fitting or
    printing is logged as a warning and None is returned instead of the
    fitted result.
    """
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        result = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(result.summary())
    except Exception as exc:
        # Best effort: a failed mixed model must not stop the rest of the run
        logging.warning('MixedLM failed: %s', exc)
        return None
    return result
|
||||
|
||||
|
||||
def make_plots(df, outdir, show_plots=False):
    """Write the study figures as numbered PNG files into *outdir*.

    Figures 1, 2 and 7 need a ``Group`` column; figure 3 additionally needs
    a ``Day`` column. Figures 4-6 use only rows whose group is
    'Intervention' when a ``Group`` column exists, otherwise every row.

      1. mean happiness per group with 1.96 * std / sqrt(count) error bars
      2. happiness distribution per group (box plus jittered points)
      3. mean daily happiness per group
      4. happiness by number of habits completed
      5. mean happiness by number of habits completed
      6. completion rate of each habit
      7. per-participant mean happiness per group (box plus jittered points)

    Args:
        df: prepared data (Happiness, Habits_Count, Participant_ID, the
            three adherence columns, optionally Group and Day).
        outdir: directory for the PNG files; created if missing.
        show_plots: when true, each figure is displayed before it is closed.
    """
    # matplotlib is already a dependency through pyplot; import the formatter
    # by name rather than reaching through plt.matplotlib.ticker.
    from matplotlib.ticker import PercentFormatter

    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_theme(style='whitegrid', context='talk')

    has_group = 'Group' in df.columns
    group_order = ['Control', 'Intervention']
    # Fixed seed so the jittered points land in the same place on every run
    rng = np.random.default_rng(0)

    def finish_plot(filename):
        # Common epilogue for every figure: lay out, save, maybe show, close
        plt.tight_layout()
        plt.savefig(outdir / filename, dpi=200, bbox_inches='tight')
        if show_plots:
            plt.show()
        plt.close()

    def box_with_points(samples, facecolor, point_alpha, point_size):
        # One box per entry of group_order, with the raw values jittered
        # horizontally around each box position.
        plt.boxplot(samples, tick_labels=group_order, patch_artist=True,
                    boxprops=dict(facecolor=facecolor, color='#4C4C4C'),
                    medianprops=dict(color='#2A9D8F', linewidth=2),
                    whiskerprops=dict(color='#4C4C4C'), capprops=dict(color='#4C4C4C'))
        for position, values in enumerate(samples, start=1):
            x = rng.normal(position, 0.06, size=len(values))
            plt.scatter(x, values, color='black', alpha=point_alpha, s=point_size)

    # 1) Mean happiness by group with error bars
    if has_group:
        summary = df.groupby('Group')['Happiness'].agg(['mean', 'std', 'count']).reindex(group_order)
        ci95 = 1.96 * (summary['std'] / np.sqrt(summary['count']))
        plt.figure(figsize=(8, 6))
        xpos = np.arange(len(summary))
        plt.bar(xpos, summary['mean'].values, color=['#7A7A7A', '#2A9D8F'], yerr=ci95.values, capsize=6)
        plt.xticks(xpos, summary.index)
        plt.title('Average Happiness by Group')
        plt.xlabel('Study group')
        plt.ylabel('Mean happiness score')
        plt.ylim(0, 10)
        finish_plot('01_mean_happiness_by_group.png')

    # 2) Distribution of happiness by group
    if has_group:
        plt.figure(figsize=(9, 6))
        grouped = [df.loc[df['Group'] == group, 'Happiness'].values for group in group_order]
        box_with_points(grouped, facecolor='#C9D1D9', point_alpha=0.15, point_size=10)
        plt.title('Happiness Distribution by Group')
        plt.xlabel('Study group')
        plt.ylabel('Happiness score')
        plt.ylim(0, 10)
        finish_plot('02_happiness_distribution_by_group.png')

    # 3) Daily happiness trend by group
    if has_group and 'Day' in df.columns:
        daily = df.groupby(['Group', 'Day'], as_index=False)['Happiness'].mean()
        plt.figure(figsize=(10, 6))
        sns.lineplot(data=daily, x='Day', y='Happiness', hue='Group', hue_order=group_order, marker='o')
        plt.title('Mean Daily Happiness Across the Study')
        plt.xlabel('Day of study')
        plt.ylabel('Average happiness')
        plt.ylim(0, 10)
        # Tick every second day across the days actually present, instead of
        # assuming a 30-day study (days 1-30 give the same ticks as before).
        if not daily.empty and pd.api.types.is_numeric_dtype(daily['Day']):
            first_day = int(daily['Day'].min())
            last_day = int(daily['Day'].max())
            plt.xticks(range(first_day, last_day + 1, 2))
        finish_plot('03_daily_happiness_trend.png')

    # 4) Happiness by number of habits in intervention group only
    intervention_df = df[df['Group'] == 'Intervention'] if has_group else df
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=intervention_df, x='Habits_Count', y='Happiness', color='#4C72B0')
    sns.stripplot(data=intervention_df, x='Habits_Count', y='Happiness', color='black', alpha=0.20, jitter=0.18, size=2)
    plt.title('Intervention Group: Happiness by Number of Habits Completed')
    plt.xlabel('Habits completed that day')
    plt.ylabel('Happiness score')
    plt.ylim(0, 10)
    finish_plot('04_happiness_by_habits_intervention.png')

    # 5) Mean happiness by habits count in intervention group
    habits_mean = intervention_df.groupby('Habits_Count', as_index=False)['Happiness'].mean()
    plt.figure(figsize=(8, 6))
    sns.lineplot(data=habits_mean, x='Habits_Count', y='Happiness', marker='o', color='#1F77B4')
    plt.title('Intervention Group: Mean Happiness vs Habits Completed')
    plt.xlabel('Number of habits completed')
    plt.ylabel('Mean happiness')
    plt.xticks([0, 1, 2, 3])
    plt.ylim(0, 10)
    finish_plot('05_mean_happiness_by_habits.png')

    # 6) Habit adherence rates in the intervention group
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    adherence_rates = intervention_df[habit_cols].mean().sort_values(ascending=False).reset_index()
    adherence_rates.columns = ['Habit', 'Rate']
    adherence_rates['Habit'] = adherence_rates['Habit'].str.replace('_Adherence', '', regex=False)
    plt.figure(figsize=(9, 6))
    sns.barplot(data=adherence_rates, x='Habit', y='Rate', color='#E76F51')
    plt.title('Intervention Group: Habit Completion Rate')
    plt.xlabel('Habit')
    plt.ylabel('Proportion completed')
    plt.ylim(0, 1)
    # Rates are fractions of 1, shown as percentages on the axis
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1.0))
    finish_plot('06_habit_completion_rate.png')

    # 7) Participant average happiness by group
    if has_group:
        plt.figure(figsize=(12, 6))
        participant_avg = df.groupby(['Group', 'Participant_ID'], as_index=False)['Happiness'].mean()
        grouped_avgs = [
            participant_avg.loc[participant_avg['Group'] == group, 'Happiness'].values
            for group in group_order
        ]
        box_with_points(grouped_avgs, facecolor='#D6D6D6', point_alpha=0.45, point_size=22)
        plt.title('Average Happiness per Participant')
        plt.xlabel('Study group')
        plt.ylabel('Participant mean happiness')
        plt.ylim(0, 10)
        finish_plot('07_participant_average_happiness.png')

    logging.info('Saved plots to %s', outdir)
|
||||
|
||||
|
||||
def main(args):
    """Run the whole analysis for the parsed command-line arguments.

    Reads ``args.data``, ``args.outdir`` and ``args.show``. Prints the
    descriptive statistics, effect sizes and model summaries, then writes
    the figures through ``make_plots``.
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    happiness = df['Happiness']

    # Effect size between the two ends of the habit scale
    habits = df['Habits_Count']
    none_done = happiness[habits == 0]
    all_done = happiness[habits == 3]
    if min(len(none_done), len(all_done)) > 1:
        d = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    # Effect size between the two study arms
    if 'Group' in df.columns:
        arm = df['Group']
        control = happiness[arm == 'Control']
        intervention = happiness[arm == 'Intervention']
        if min(len(control), len(intervention)) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Regression models
    run_ols(df)
    run_mixedlm(df)

    # Figures
    make_plots(df, args.outdir, show_plots=args.show)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: declare the CLI options, parse argv, run the analysis.
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv'
    )
    option_specs = (
        ('--data', {'type': str, 'default': 'organization_happiness_study_data.csv', 'help': 'CSV data path'}),
        ('--outdir', {'type': str, 'default': 'plots', 'help': 'Directory to save plots'}),
        ('--show', {'action': 'store_true', 'help': 'Show plots interactively'}),
    )
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    args = parser.parse_args()
    main(args)
|
||||
|
|
@ -0,0 +1 @@
|
|||
{"version":1,"resource":"file:///home/breadway/Documents/Year%2010/Year%2010/Psychology/Data%20Analysis.py","entries":[{"id":"SA9R.py","source":"Chat Edit: 'improve on this analysis script'","timestamp":1774345116327},{"id":"ycv3.py","source":"Chat Edit: 'improve data gen.py to add a second dataset as a control. for context, the study tracks the affects of being organised on how happy participants feel. there needs to be a control group that is only recording their happiness daily. the main group will try to record their happiness, will add all events to their calendar, be on time to every event, and clean their bedroom everyday. they report if they do any of these in the study data as a yes or no. the control group will not do any of these.'","timestamp":1774345356264},{"id":"bwYb.py","source":"Chat Edit: 'improve data gen.py to add a second dataset as a control. for context, the study tracks the affects of being organised on how happy participants feel. there needs to be a control group that is only recording their happiness daily. the main group will try to record their happiness, will add all events to their calendar, be on time to every event, and clean their bedroom everyday. they report if they do any of these in the study data as a yes or no. the control group will not do any of these.'","timestamp":1774345411358},{"id":"Gx76.py","source":"Chat Edit: 'improve data gen.py to add a second dataset as a control. for context, the study tracks the affects of being organised on how happy participants feel. there needs to be a control group that is only recording their happiness daily. the main group will try to record their happiness, will add all events to their calendar, be on time to every event, and clean their bedroom everyday. they report if they do any of these in the study data as a yes or no. 
the control group will not do any of these.'","timestamp":1774345436946},{"id":"FOyN.py","source":"Chat Edit: 'improve data gen.py to add a second dataset as a control. for context, the study tracks the affects of being organised on how happy participants feel. there needs to be a control group that is only recording their happiness daily. the main group will try to record their happiness, will add all events to their calendar, be on time to every event, and clean their bedroom everyday. they report if they do any of these in the study data as a yes or no. the control group will not do any of these.'","timestamp":1774345501736},{"id":"MtI5.py","source":"Chat Edit: 'make the graphs better suited to the study, easier to read, and more graphs.'","timestamp":1774346145201},{"id":"Ldgu.py","source":"Chat Edit: 'make the graphs better suited to the study, easier to read, and more graphs.'","timestamp":1774346200970},{"id":"NtsI.py","source":"Chat Edit: 'make the graphs better suited to the study, easier to read, and more graphs.'","timestamp":1774346222014},{"id":"enQE.py","source":"Chat Edit: 'make the graphs better suited to the study, easier to read, and more graphs.'","timestamp":1774346258056},{"id":"yfjL.py","timestamp":1774346751804},{"id":"9KVj.py","source":"Chat Edit: 'ensure the graphs being used are appropriate for the study'","timestamp":1774346803522}]}
|
||||
227
dot_config/private_Code/User/History/7da6e0fb/ycv3.py
Normal file
227
dot_config/private_Code/User/History/7da6e0fb/ycv3.py
Normal file
|
|
@ -0,0 +1,227 @@
|
|||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.formula.api as smf
|
||||
|
||||
|
||||
# Configure the root logger when the module is loaded: INFO and above,
# rendered as "LEVEL: message".
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
|
||||
|
||||
def load_data(path):
    """Read the CSV at ``path`` into a DataFrame and log how many rows it has."""
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
|
||||
|
||||
|
||||
def prepare_data(df):
    """Validate and normalise the raw study data.

    The input is copied first, so the caller's DataFrame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Participant_ID``, ``Happiness`` and the three
        ``*_Adherence`` columns. ``Group`` is optional and defaults to
        ``'Intervention'`` for every row when absent.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Group`` stripped and title-cased, adherence columns
        mapped to True/False (anything else becomes missing), a
        ``Habits_Count`` column, ``Happiness`` coerced to numeric, and rows
        without a numeric ``Happiness`` removed.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']

    # Ensure required columns exist
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Work on a copy so the column assignments below do not leak to the caller.
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_map = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_map)

    # Count habits per row; missing/unrecognised answers compare unequal to
    # True and therefore count as "not done".
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))

    return df
|
||||
|
||||
|
||||
def descriptive_stats(df):
    """Print summary tables and correlations for the prepared study data.

    ``df`` must contain ``Happiness``, ``Habits_Count`` and the three
    ``*_Adherence`` columns; ``Group`` is optional. Nothing is returned.

    The point-biserial correlation is only computed for a habit when there
    are at least two non-missing rows and both the habit and the happiness
    scores vary; otherwise a short placeholder is printed instead of calling
    ``stats.pointbiserialr`` on input it cannot handle.
    """
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())

        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']:
        mask = ~habit_df[habit].isna()
        n_valid = int(mask.sum())
        if n_valid == 0:
            print(f'{habit:22} (no data)')
            continue

        flags = habit_df.loc[mask, habit].astype(int)
        scores = habit_df.loc[mask, 'Happiness']
        # A correlation needs two or more points and variation on both sides.
        if n_valid < 2 or flags.nunique() < 2 or scores.nunique() < 2:
            print(f'{habit:22} (not enough data)')
            continue

        r, p = stats.pointbiserialr(flags, scores)
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
|
||||
|
||||
|
||||
def cohen_d(x, y):
    """Return Cohen's d for two independent samples.

    d = (mean(x) - mean(y)) / pooled_sd, where pooled_sd combines the two
    sample variances (ddof=1) weighted by their degrees of freedom.

    Parameters
    ----------
    x, y : array-like of numbers
        Any sequence, NumPy array or pandas Series. Missing values (NaN) are
        dropped before sizes, means and variances are computed.

    Returns
    -------
    float
        The effect size; NaN when either sample has fewer than two values,
        because the sample variance is undefined there.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        return np.float64('nan')

    dof = nx + ny - 2
    pooled_var = ((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof
    # Zero pooled variance yields +/-inf (or NaN for equal means); silence
    # the accompanying NumPy warning instead of letting it leak to callers.
    with np.errstate(divide='ignore', invalid='ignore'):
        return (x.mean() - y.mean()) / np.sqrt(pooled_var)
|
||||
|
||||
|
||||
def run_ols(df):
    """Fit an OLS regression of Happiness on Habits_Count, print and return it.

    When ``df`` has a ``Group`` column it is added as a categorical predictor
    through the formula interface; otherwise a plain intercept-plus-slope
    model is fitted.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        fitted = sm.OLS(df['Happiness'], design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    print(heading)
    print(fitted.summary())
    return fitted
|
||||
|
||||
|
||||
def run_mixedlm(df):
    """Fit a mixed model with a random intercept per Participant_ID.

    Prints and returns the fitted result. Any failure while building,
    fitting or printing the model is logged as a warning and ``None`` is
    returned instead.
    """
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        result = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(result.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return result
|
||||
|
||||
|
||||
def _save_current_figure(path, show_plots):
    """Tighten the layout, save the current figure to ``path``, optionally show it, then close it."""
    plt.tight_layout()
    plt.savefig(path)
    if show_plots:
        plt.show()
    plt.close()


def make_plots(df, outdir, show_plots=False):
    """Draw the study figures and save them as PNG files in ``outdir``.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data with ``Participant_ID``, ``Happiness`` and
        ``Habits_Count``; ``Group`` is optional.
    outdir : str or pathlib.Path
        Target directory; created (with parents) if it does not exist.
    show_plots : bool, default False
        When true, each figure is also displayed after it is saved.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_style('whitegrid')
    has_group = 'Group' in df.columns

    # Boxplot by Habits_Count
    plt.figure(figsize=(9, 6))
    sns.boxplot(x='Habits_Count', y='Happiness', data=df, palette='viridis')
    plt.title('Daily Happiness by Number of Habits Completed')
    plt.xlabel('Number of habits followed (0–3)')
    plt.ylabel('Happiness (1–10)')
    _save_current_figure(outdir / 'happiness_by_habits_box.png', show_plots)

    # Violin / jitter + regression
    plt.figure(figsize=(9, 6))
    sns.violinplot(x='Habits_Count', y='Happiness', data=df, inner=None, palette='muted')
    sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
    plt.title('Happiness distribution by Habits Completed')
    _save_current_figure(outdir / 'happiness_by_habits_violin.png', show_plots)

    # Participant average bar
    participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
    plt.figure(figsize=(12, 5))
    sns.barplot(x=participant_avg.index.astype(str), y=participant_avg.values, palette='coolwarm')
    # Dashed reference line at the overall mean happiness.
    plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
    plt.xticks(rotation=45)
    plt.title('Average Happiness per Participant (sorted)')
    _save_current_figure(outdir / 'participant_avg_happiness.png', show_plots)

    if has_group:
        plt.figure(figsize=(7, 5))
        sns.barplot(data=df, x='Group', y='Happiness', estimator='mean', errorbar='sd', palette='Set2')
        plt.title('Mean Happiness by Group')
        plt.ylabel('Average happiness')
        _save_current_figure(outdir / 'happiness_by_group.png', show_plots)

    # Scatter with linear fit
    plt.figure(figsize=(9, 6))
    if has_group:
        sns.scatterplot(data=df, x='Habits_Count', y='Happiness', hue='Group', alpha=0.35)
        # The title promises a linear fit, so overlay the regression line
        # (no second set of points) on top of the group-coloured scatter.
        sns.regplot(data=df, x='Habits_Count', y='Happiness', scatter=False, color='black')
    else:
        sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
    plt.title('Happiness vs Number of Habits Completed (with linear fit)')
    _save_current_figure(outdir / 'happiness_vs_habits_regression.png', show_plots)

    logging.info('Saved plots to %s', outdir)
|
||||
|
||||
|
||||
def main(args):
    """Run the whole analysis: load, clean, summarise, model and plot.

    ``args`` must expose ``data`` (CSV path), ``outdir`` (plot directory)
    and ``show`` (whether to display plots interactively).
    """
    data = prepare_data(load_data(args.data))

    descriptive_stats(data)

    # Effect sizes
    happiness = data['Happiness']
    no_habits = happiness[data['Habits_Count'] == 0]
    all_habits = happiness[data['Habits_Count'] == 3]
    if min(len(no_habits), len(all_habits)) > 1:
        effect = cohen_d(all_habits, no_habits)
        print(f"\nCohen's d (3 habits vs 0 habits) = {effect:.3f}")

    if 'Group' in data.columns:
        control_scores = happiness[data['Group'] == 'Control']
        intervention_scores = happiness[data['Group'] == 'Intervention']
        if min(len(control_scores), len(intervention_scores)) > 1:
            group_effect = cohen_d(intervention_scores, control_scores)
            print(f"Cohen's d (Intervention vs Control happiness) = {group_effect:.3f}")

    # Models
    run_ols(data)
    run_mixedlm(data)

    # Plots
    make_plots(data, args.outdir, show_plots=args.show)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: collect the three options and hand them to main().
    cli = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv',
    )
    cli.add_argument(
        '--data',
        type=str,
        default='organization_happiness_study_data.csv',
        help='CSV data path',
    )
    cli.add_argument('--outdir', type=str, default='plots', help='Directory to save plots')
    cli.add_argument('--show', action='store_true', help='Show plots interactively')
    main(cli.parse_args())
|
||||
227
dot_config/private_Code/User/History/7da6e0fb/yfjL.py
Normal file
227
dot_config/private_Code/User/History/7da6e0fb/yfjL.py
Normal file
|
|
@ -0,0 +1,227 @@
|
|||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from scipy import stats
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.formula.api as smf
|
||||
|
||||
|
||||
# Configure the root logger when the module is loaded: INFO and above,
# rendered as "LEVEL: message".
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
|
||||
|
||||
def load_data(path):
    """Read the CSV at ``path`` into a DataFrame and log how many rows it has."""
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
|
||||
|
||||
|
||||
def prepare_data(df):
    """Validate and normalise the raw study data.

    The input is copied first, so the caller's DataFrame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Participant_ID``, ``Happiness`` and the three
        ``*_Adherence`` columns. ``Group`` is optional and defaults to
        ``'Intervention'`` for every row when absent.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Group`` stripped and title-cased, adherence columns
        mapped to True/False (anything else becomes missing), a
        ``Habits_Count`` column, ``Happiness`` coerced to numeric, and rows
        without a numeric ``Happiness`` removed.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']

    # Ensure required columns exist
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Work on a copy so the column assignments below do not leak to the caller.
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_map = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_map)

    # Count habits per row; missing/unrecognised answers compare unequal to
    # True and therefore count as "not done".
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))

    return df
|
||||
|
||||
|
||||
def descriptive_stats(df):
    """Print summary tables and correlations for the prepared study data.

    ``df`` must contain ``Happiness``, ``Habits_Count`` and the three
    ``*_Adherence`` columns; ``Group`` is optional. Nothing is returned.

    The point-biserial correlation is only computed for a habit when there
    are at least two non-missing rows and both the habit and the happiness
    scores vary; otherwise a short placeholder is printed instead of calling
    ``stats.pointbiserialr`` on input it cannot handle.
    """
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())

        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']:
        mask = ~habit_df[habit].isna()
        n_valid = int(mask.sum())
        if n_valid == 0:
            print(f'{habit:22} (no data)')
            continue

        flags = habit_df.loc[mask, habit].astype(int)
        scores = habit_df.loc[mask, 'Happiness']
        # A correlation needs two or more points and variation on both sides.
        if n_valid < 2 or flags.nunique() < 2 or scores.nunique() < 2:
            print(f'{habit:22} (not enough data)')
            continue

        r, p = stats.pointbiserialr(flags, scores)
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
|
||||
|
||||
|
||||
def cohen_d(x, y):
    """Return Cohen's d for two independent samples.

    d = (mean(x) - mean(y)) / pooled_sd, where pooled_sd combines the two
    sample variances (ddof=1) weighted by their degrees of freedom.

    Parameters
    ----------
    x, y : array-like of numbers
        Any sequence, NumPy array or pandas Series. Missing values (NaN) are
        dropped before sizes, means and variances are computed.

    Returns
    -------
    float
        The effect size; NaN when either sample has fewer than two values,
        because the sample variance is undefined there.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        return np.float64('nan')

    dof = nx + ny - 2
    pooled_var = ((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof
    # Zero pooled variance yields +/-inf (or NaN for equal means); silence
    # the accompanying NumPy warning instead of letting it leak to callers.
    with np.errstate(divide='ignore', invalid='ignore'):
        return (x.mean() - y.mean()) / np.sqrt(pooled_var)
|
||||
|
||||
|
||||
def run_ols(df):
    """Fit an OLS regression of Happiness on Habits_Count, print and return it.

    When ``df`` has a ``Group`` column it is added as a categorical predictor
    through the formula interface; otherwise a plain intercept-plus-slope
    model is fitted.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        fitted = sm.OLS(df['Happiness'], design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    print(heading)
    print(fitted.summary())
    return fitted
|
||||
|
||||
|
||||
def run_mixedlm(df):
    """Fit a mixed model with a random intercept per Participant_ID.

    Prints and returns the fitted result. Any failure while building,
    fitting or printing the model is logged as a warning and ``None`` is
    returned instead.
    """
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        result = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(result.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return result
|
||||
|
||||
|
||||
def _save_current_figure(path, show_plots):
    """Tighten the layout, save the current figure to ``path``, optionally show it, then close it."""
    plt.tight_layout()
    plt.savefig(path)
    if show_plots:
        plt.show()
    plt.close()


def make_plots(df, outdir, show_plots=False):
    """Draw the study figures and save them as PNG files in ``outdir``.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data with ``Participant_ID``, ``Happiness`` and
        ``Habits_Count``; ``Group`` is optional.
    outdir : str or pathlib.Path
        Target directory; created (with parents) if it does not exist.
    show_plots : bool, default False
        When true, each figure is also displayed after it is saved.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_style('whitegrid')
    has_group = 'Group' in df.columns

    # Boxplot by Habits_Count
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=df, x='Habits_Count', y='Happiness', color='#4C72B0')
    plt.title('Daily Happiness by Number of Habits Completed')
    plt.xlabel('Number of habits followed (0–3)')
    plt.ylabel('Happiness (1–10)')
    _save_current_figure(outdir / 'happiness_by_habits_box.png', show_plots)

    # Violin / jitter + regression
    plt.figure(figsize=(9, 6))
    sns.violinplot(data=df, x='Habits_Count', y='Happiness', inner=None, color='#55A868')
    sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
    plt.title('Happiness distribution by Habits Completed')
    _save_current_figure(outdir / 'happiness_by_habits_violin.png', show_plots)

    # Participant average bar
    participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
    plt.figure(figsize=(12, 5))
    sns.barplot(x=range(len(participant_avg)), y=participant_avg.values, color='#C44E52')
    # Dashed reference line at the overall mean happiness.
    plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
    # Bars are positioned by rank, so label each tick with its participant ID.
    plt.xticks(range(len(participant_avg)), participant_avg.index, rotation=45)
    plt.title('Average Happiness per Participant (sorted)')
    _save_current_figure(outdir / 'participant_avg_happiness.png', show_plots)

    if has_group:
        plt.figure(figsize=(7, 5))
        sns.barplot(data=df, x='Group', y='Happiness', estimator='mean', errorbar='sd', color='#8172B2')
        plt.title('Mean Happiness by Group')
        plt.ylabel('Average happiness')
        _save_current_figure(outdir / 'happiness_by_group.png', show_plots)

    # Scatter with linear fit
    plt.figure(figsize=(9, 6))
    if has_group:
        sns.scatterplot(data=df, x='Habits_Count', y='Happiness', hue='Group', alpha=0.35)
        # The title promises a linear fit, so overlay the regression line
        # (no second set of points) on top of the group-coloured scatter.
        sns.regplot(data=df, x='Habits_Count', y='Happiness', scatter=False, color='black')
    else:
        sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
    plt.title('Happiness vs Number of Habits Completed (with linear fit)')
    _save_current_figure(outdir / 'happiness_vs_habits_regression.png', show_plots)

    logging.info('Saved plots to %s', outdir)
|
||||
|
||||
|
||||
def main(args):
    """Run the whole analysis: load, clean, summarise, model and plot.

    ``args`` must expose ``data`` (CSV path), ``outdir`` (plot directory)
    and ``show`` (whether to display plots interactively).
    """
    data = prepare_data(load_data(args.data))

    descriptive_stats(data)

    # Effect sizes
    happiness = data['Happiness']
    no_habits = happiness[data['Habits_Count'] == 0]
    all_habits = happiness[data['Habits_Count'] == 3]
    if min(len(no_habits), len(all_habits)) > 1:
        effect = cohen_d(all_habits, no_habits)
        print(f"\nCohen's d (3 habits vs 0 habits) = {effect:.3f}")

    if 'Group' in data.columns:
        control_scores = happiness[data['Group'] == 'Control']
        intervention_scores = happiness[data['Group'] == 'Intervention']
        if min(len(control_scores), len(intervention_scores)) > 1:
            group_effect = cohen_d(intervention_scores, control_scores)
            print(f"Cohen's d (Intervention vs Control happiness) = {group_effect:.3f}")

    # Models
    run_ols(data)
    run_mixedlm(data)

    # Plots
    make_plots(data, args.outdir, show_plots=args.show)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: collect the three options and hand them to main().
    cli = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv',
    )
    cli.add_argument(
        '--data',
        type=str,
        default='organization_happiness_study_data.csv',
        help='CSV data path',
    )
    cli.add_argument('--outdir', type=str, default='plots', help='Directory to save plots')
    cli.add_argument('--show', action='store_true', help='Show plots interactively')
    main(cli.parse_args())
|
||||
Loading…
Add table
Add a link
Reference in a new issue