Update Fedora state: 2026-04-29 11:50

This commit is contained in:
Breadway 2026-04-29 11:50:42 +08:00
parent 42ca768584
commit 10f0d5de1d
338 changed files with 18983 additions and 32 deletions

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1 @@
{"version":1,"resource":"file:///home/breadway/Documents/Year%2010/Year%2010/Psychology/organization_happiness_study_data.csv","entries":[{"id":"0Ago.csv","timestamp":1774348491393}]}

View file

@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""Print checks for an upward happiness trend in the study data.

Reads ``organization_happiness_study_data.csv`` from the current working
directory and prints early-vs-late comparisons for both groups, mean
happiness by number of habits completed, weekly averages, and a few
per-participant examples from the intervention group.
"""
import pandas as pd
import numpy as np

CSV_PATH = 'organization_happiness_study_data.csv'

# Adherence columns that hold 'Yes'/'No' for each tracked habit.
HABIT_COLUMNS = (
    'Calendar_Adherence',
    'Cleanliness_Adherence',
    'Punctuality_Adherence',
)

# (printed label, first Day, last Day) -- both Day bounds are inclusive.
WEEK_RANGES = (
    ("Week 1 (Days 1-7)", 1, 7),
    ("Week 2 (Days 8-14)", 8, 14),
    ("Week 3 (Days 15-21)", 15, 21),
    ("Week 4 (Days 22-30)", 22, 30),
)


def habits_count(frame):
    """Return, per row of *frame*, how many habit columns equal 'Yes'."""
    return sum((frame[column] == 'Yes').astype(int) for column in HABIT_COLUMNS)


def weekly_means(group):
    """Return ``[(label, mean happiness)]`` for each entry of WEEK_RANGES.

    *group* needs 'Day' and 'Happiness' columns. Happiness is first averaged
    per Day, then the per-day means are averaged over each week's Day range.
    """
    daily = group.groupby('Day')['Happiness'].mean()
    # Slice by Day *label* with .loc (inclusive on both ends). A plain
    # daily[1:8] is positional on an integer index and would silently
    # select Days 2-8 instead of Days 1-7.
    return [
        (label, daily.loc[first:last].mean())
        for label, first, last in WEEK_RANGES
    ]


def print_weekly_trend(group):
    """Print one line per week with that week's average happiness."""
    for label, mean in weekly_means(group):
        print(f"{label}: Average Happiness = {mean:.2f}")


def main():
    """Load the CSV and print every verification section in order."""
    df = pd.read_csv(CSV_PATH)
    print("=" * 75)
    print("UPWARD TREND VERIFICATION - HAPPINESS GROWTH WITH HABIT COMPLETION")
    print("=" * 75)

    # Calculate habit completion count
    df['Habits_Count'] = habits_count(df)

    print("\n--- Intervention Group: Early vs Late Month ---")
    intervention = df[df['Group'] == 'Intervention']
    early_month = intervention[intervention['Day'] <= 10]
    late_month = intervention[intervention['Day'] > 20]
    print("Days 1-10 (Early):")
    print(f" Mean Happiness: {early_month['Happiness'].mean():.2f}")
    print(f" Mean Habits Completed: {early_month['Habits_Count'].mean():.2f}")
    print("\nDays 21-30 (Late):")
    print(f" Mean Happiness: {late_month['Happiness'].mean():.2f}")
    print(f" Mean Habits Completed: {late_month['Habits_Count'].mean():.2f}")
    print(f"\nGrowth: {late_month['Happiness'].mean() - early_month['Happiness'].mean():.2f} points")

    print("\n--- Control Group: Early vs Late Month (Should be flat) ---")
    control = df[df['Group'] == 'Control']
    early_month_c = control[control['Day'] <= 10]
    late_month_c = control[control['Day'] > 20]
    print("Days 1-10 (Early):")
    print(f" Mean Happiness: {early_month_c['Happiness'].mean():.2f}")
    print("\nDays 21-30 (Late):")
    print(f" Mean Happiness: {late_month_c['Happiness'].mean():.2f}")
    print(f"\nChange: {late_month_c['Happiness'].mean() - early_month_c['Happiness'].mean():.2f} points (should be ~0)")

    print("\n--- Direct Correlation: Intervention Group by Habits Completed ---")
    for habit_count in [0, 1, 2, 3]:
        subset = intervention[intervention['Habits_Count'] == habit_count]
        # Skip counts that never occur so no 'nan' mean is printed.
        if len(subset) > 0:
            print(f"{habit_count} habits completed: Happiness = {subset['Happiness'].mean():.2f} (n={len(subset)})")

    print("\n--- Trend Over 30 Days (Intervention Group) ---")
    print_weekly_trend(intervention)
    print("\n--- Trend Over 30 Days (Control Group) ---")
    print_weekly_trend(control)

    print("\n--- Participant Examples (Intervention Group) ---")
    for pid in [1, 5, 10]:
        p_data = intervention[intervention['Participant_ID'] == pid]
        early_rows = p_data[p_data['Day'] <= 10]
        late_rows = p_data[p_data['Day'] > 20]
        early = early_rows['Happiness'].mean()
        late = late_rows['Happiness'].mean()
        early_habits = early_rows['Habits_Count'].mean()
        late_habits = late_rows['Habits_Count'].mean()
        print(f"\nParticipant {pid}:")
        print(f" Early (Days 1-10): Happiness {early:.1f}, Habits {early_habits:.1f}/day")
        print(f" Late (Days 21-30): Happiness {late:.1f}, Habits {late_habits:.1f}/day")
        print(f" Growth: {late - early:.1f} points")

    # NOTE(review): this summary is printed unconditionally; it is not derived
    # from the numbers above, so read those before trusting it.
    print("\n✓ Data shows:")
    print(" • Intervention group has upward trend over study period")
    print(" • Happy days strongly correlated with habit completion")
    print(" • Control group stays stable with natural random variation")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1 @@
{"version":1,"resource":"file:///home/breadway/Documents/Year%2010/Year%2010/Psychology/verify_trend.py","entries":[{"id":"DwLj.py","source":"Chat Edit: 'can you ensure the data shows an upward trend in happiness as the study goes on, and in direct correlation with the habits completed by that participant? at the moment, the intervention group is happier after a single day.'","timestamp":1774347397523}]}

View file

@ -0,0 +1 @@
{"version":1,"resource":"vscode-userdata:/home/breadway/.config/Code/User/settings.json","entries":[{"id":"jeJQ.json","timestamp":1774363216206}]}

View file

@ -0,0 +1,3 @@
{
"explorer.confirmDelete": false
}

View file

@ -0,0 +1,198 @@
services:
jellyfin:
  image: jellyfin/jellyfin:latest
  container_name: jellyfin
  restart: unless-stopped
  group_add:
    - "993" # render group for VAAPI hardware acceleration
  ports:
    - "8096:8096" # HTTP web UI
    - "8920:8920" # HTTPS
    - "7359:7359/udp" # Network discovery
    - "1900:1900/udp" # DLNA
  expose:
    - "8096"
  environment:
    - PUID=1000
    - PGID=1000
    - TZ=Australia/Perth
  volumes:
    # Config on NVMe (fast)
    - ./config:/config
    - ./cache:/cache
    # Media libraries (read-only for safety)
    - "/mnt/media/Movies:/media/movies:ro"
    - "/mnt/media/TV Shows:/media/tv-shows:ro"
    - "/mnt/media/Anime:/media/anime:ro"
    - "/mnt/media/Kids TV:/media/kids-tv:ro"
    - "/mnt/media/Kids Movies:/media/kids-movies:ro"
    - "/tank/home-videos:/media/home-videos:ro"
    # Each bind mount needs its own container path: /tank/videos used to
    # target /media/home-videos as well, which Docker rejects as a duplicate
    # mount point. Point the Jellyfin library for this share at /media/videos.
    - "/tank/videos:/media/videos:ro"
    - "/tank/photos:/media/home-photos:ro"
  devices:
    # Hardware transcoding (Vega graphics)
    - /dev/dri:/dev/dri
  networks:
    - jellyfin-net
qbittorrent:
image: linuxserver/qbittorrent:latest
container_name: qbittorrent
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
WEBUI_PORT: 8090
volumes:
- ~/.docker_volumes/qbittorrent/config:/config
- /mnt/media/downloads:/downloads
- /mnt/media/anime:/animeq
ports:
- "8090:8090"
- "6881:6881"
- "6881:6881/udp"
expose:
- "8090"
restart: unless-stopped
deploy:
resources:
limits:
memory: 1G
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
sonarr:
image: linuxserver/sonarr:latest
container_name: sonarr
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
DOCKER_MODS: "linuxserver/mods:universal-package-install"
INSTALL_PACKAGES: "ffmpeg"
volumes:
- ~/.docker_volumes/sonarr/config:/config
- /mnt/media/Anime:/tv
- ~/media/downloads:/downloads
ports:
- "8989:8989"
expose:
- "8989"
restart: unless-stopped
depends_on:
- qbittorrent
deploy:
resources:
limits:
memory: 512M
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
prowlarr:
image: linuxserver/prowlarr:latest
container_name: prowlarr
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
volumes:
- ~/.docker_volumes/prowlarr/config:/config
ports:
- "9696:9696"
expose:
- "9696"
restart: unless-stopped
depends_on:
- qbittorrent
- sonarr
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9696/ping"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
deploy:
resources:
limits:
memory: 512M
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
jellyseerr:
image: fallenbagel/jellyseerr:latest
container_name: jellyseerr
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
volumes:
- ~/.docker_volumes/jellyseerr/config:/app/config
ports:
- "5055:5055"
expose:
- "5055"
restart: unless-stopped
depends_on:
- jellyfin
- sonarr
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:5055/api/v1/status"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
deploy:
resources:
limits:
memory: 512M
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
caddy:
image: caddy:latest
container_name: caddy
restart: unless-stopped
ports:
- "443:443"
- "443:443/udp" # For HTTP/3 support
volumes:
- ./Caddyfile:/etc/caddy/Caddyfile
- ./caddy_data:/data
- ./caddy_config:/config
networks:
- jellyfin-net
crowdsec:
image: crowdsecurity/crowdsec:latest
container_name: crowdsec
restart: unless-stopped
environment:
- COLLECTIONS=crowdsecurity/linux crowdsecurity/caddy crowdsecurity/base-httping
volumes:
- /var/log:/var/log:ro
- ./crowdsec_data:/var/lib/crowdsec/data
- ./crowdsec_config:/etc/crowdsec
networks:
- jellyfin-net
networks:
jellyfin-net:
driver: bridge

View file

@ -0,0 +1,174 @@
services:
jellyfin:
  image: jellyfin/jellyfin:latest
  container_name: jellyfin
  restart: unless-stopped
  group_add:
    - "993" # render group for VAAPI hardware acceleration
  ports:
    - "8096:8096" # HTTP web UI
    - "8920:8920" # HTTPS
    - "7359:7359/udp" # Network discovery
    - "1900:1900/udp" # DLNA
  expose:
    - "8096"
  environment:
    - PUID=1000
    - PGID=1000
    - TZ=Australia/Perth
  volumes:
    # Config on NVMe (fast)
    - ./config:/config
    - ./cache:/cache
    # Media libraries (read-only for safety)
    - "/mnt/media/Movies:/media/movies:ro"
    - "/mnt/media/TV Shows:/media/tv-shows:ro"
    - "/mnt/media/Anime:/media/anime:ro"
    - "/mnt/media/Kids TV:/media/kids-tv:ro"
    - "/mnt/media/Kids Movies:/media/kids-movies:ro"
    - "/tank/home-videos:/media/home-videos:ro"
    # Each bind mount needs its own container path: /tank/videos used to
    # target /media/home-videos as well, which Docker rejects as a duplicate
    # mount point. Point the Jellyfin library for this share at /media/videos.
    - "/tank/videos:/media/videos:ro"
    - "/tank/photos:/media/home-photos:ro"
  devices:
    # Hardware transcoding (Vega graphics)
    - /dev/dri:/dev/dri
  networks:
    - jellyfin-net
qbittorrent:
image: linuxserver/qbittorrent:latest
container_name: qbittorrent
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
WEBUI_PORT: 8090
volumes:
- ~/.docker_volumes/qbittorrent/config:/config
- /mnt/media/downloads:/downloads
- /mnt/media/anime:/animeq
ports:
- "8090:8090"
- "6881:6881"
- "6881:6881/udp"
expose:
- "8090"
restart: unless-stopped
deploy:
resources:
limits:
memory: 1G
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
sonarr:
image: linuxserver/sonarr:latest
container_name: sonarr
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
DOCKER_MODS: "linuxserver/mods:universal-package-install"
INSTALL_PACKAGES: "ffmpeg"
volumes:
- ~/.docker_volumes/sonarr/config:/config
- /mnt/media/Anime:/tv
- ~/media/downloads:/downloads
ports:
- "8989:8989"
expose:
- "8989"
restart: unless-stopped
depends_on:
- qbittorrent
deploy:
resources:
limits:
memory: 512M
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
prowlarr:
image: linuxserver/prowlarr:latest
container_name: prowlarr
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
volumes:
- ~/.docker_volumes/prowlarr/config:/config
ports:
- "9696:9696"
expose:
- "9696"
restart: unless-stopped
depends_on:
- qbittorrent
- sonarr
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9696/ping"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
deploy:
resources:
limits:
memory: 512M
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
jellyseerr:
image: fallenbagel/jellyseerr:latest
container_name: jellyseerr
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
volumes:
- ~/.docker_volumes/jellyseerr/config:/app/config
ports:
- "5055:5055"
expose:
- "5055"
restart: unless-stopped
depends_on:
- jellyfin
- sonarr
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:5055/api/v1/status"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
deploy:
resources:
limits:
memory: 512M
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
networks:
jellyfin-net:
driver: bridge

View file

@ -0,0 +1 @@
{"version":1,"resource":"file:///home/breadway/Downloads/docker-compose.yml","entries":[{"id":"ebbC.yml","source":"textFileCreate.source","timestamp":1775500296675},{"id":"UHTA.yml","timestamp":1775500565545},{"id":"xBDr.yml","timestamp":1775500662427},{"id":"i7DI.yml","timestamp":1775502657849}]}

View file

@ -0,0 +1,208 @@
services:
jellyfin:
  image: jellyfin/jellyfin:latest
  container_name: jellyfin
  restart: unless-stopped
  group_add:
    - "993" # render group for VAAPI hardware acceleration
  ports:
    - "8096:8096" # HTTP web UI
    - "8920:8920" # HTTPS
    - "7359:7359/udp" # Network discovery
    - "1900:1900/udp" # DLNA
  expose:
    - "8096"
  environment:
    - PUID=1000
    - PGID=1000
    - TZ=Australia/Perth
  volumes:
    # Config on NVMe (fast)
    - ./config:/config
    - ./cache:/cache
    # Media libraries (read-only for safety)
    - "/mnt/media/Movies:/media/movies:ro"
    - "/mnt/media/TV Shows:/media/tv-shows:ro"
    - "/mnt/media/Anime:/media/anime:ro"
    - "/mnt/media/Kids TV:/media/kids-tv:ro"
    - "/mnt/media/Kids Movies:/media/kids-movies:ro"
    - "/tank/home-videos:/media/home-videos:ro"
    # Each bind mount needs its own container path: /tank/videos used to
    # target /media/home-videos as well, which Docker rejects as a duplicate
    # mount point. Point the Jellyfin library for this share at /media/videos.
    - "/tank/videos:/media/videos:ro"
    - "/tank/photos:/media/home-photos:ro"
  devices:
    # Hardware transcoding (Vega graphics)
    - /dev/dri:/dev/dri
  networks:
    - jellyfin-net
qbittorrent:
image: linuxserver/qbittorrent:latest
container_name: qbittorrent
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
WEBUI_PORT: 8090
volumes:
- ~/.docker_volumes/qbittorrent/config:/config
- /mnt/media/downloads:/downloads
- /mnt/media/anime:/animeq
ports:
- "8090:8090"
- "6881:6881"
- "6881:6881/udp"
expose:
- "8090"
restart: unless-stopped
deploy:
resources:
limits:
memory: 1G
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
sonarr:
image: linuxserver/sonarr:latest
container_name: sonarr
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
DOCKER_MODS: "linuxserver/mods:universal-package-install"
INSTALL_PACKAGES: "ffmpeg"
volumes:
- ~/.docker_volumes/sonarr/config:/config
- /mnt/media/Anime:/tv
- ~/media/downloads:/downloads
ports:
- "8989:8989"
expose:
- "8989"
restart: unless-stopped
depends_on:
- qbittorrent
deploy:
resources:
limits:
memory: 512M
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
prowlarr:
image: linuxserver/prowlarr:latest
container_name: prowlarr
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
volumes:
- ~/.docker_volumes/prowlarr/config:/config
ports:
- "9696:9696"
expose:
- "9696"
restart: unless-stopped
depends_on:
- qbittorrent
- sonarr
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9696/ping"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
deploy:
resources:
limits:
memory: 512M
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
jellyseerr:
image: fallenbagel/jellyseerr:latest
container_name: jellyseerr
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
volumes:
- ~/.docker_volumes/jellyseerr/config:/app/config
ports:
- "5055:5055"
expose:
- "5055"
restart: unless-stopped
depends_on:
- jellyfin
- sonarr
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:5055/api/v1/status"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
deploy:
resources:
limits:
memory: 512M
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
caddy:
  # Custom image: stock Caddy plus the name.com DNS provider module.
  build:
    context: .
    dockerfile_inline: |
      FROM caddy:builder AS builder
      RUN xcaddy build --with github.com/caddy-dns/namedotcom
      FROM caddy:latest
      COPY --from=builder /usr/bin/caddy /usr/bin/caddy
  container_name: caddy
  restart: unless-stopped
  environment:
    # Credentials are interpolated from the shell or an .env file beside this
    # compose file so no secret is committed here. The ":?" form makes
    # `docker compose` fail with the given message when a value is missing,
    # instead of starting Caddy with an empty username or placeholder token.
    - NAMEDOTCOM_USERNAME=${NAMEDOTCOM_USERNAME:?set NAMEDOTCOM_USERNAME in .env}
    - NAMEDOTCOM_TOKEN=${NAMEDOTCOM_TOKEN:?set NAMEDOTCOM_TOKEN in .env}
  ports:
    - "443:443"
    - "443:443/udp" # For HTTP/3 support
  volumes:
    - ./Caddyfile:/etc/caddy/Caddyfile
    - ./caddy_data:/data
    - ./caddy_config:/config
  networks:
    - jellyfin-net
crowdsec:
image: crowdsecurity/crowdsec:latest
container_name: crowdsec
restart: unless-stopped
environment:
- COLLECTIONS=crowdsecurity/linux crowdsecurity/caddy crowdsecurity/base-httping
volumes:
- /var/log:/var/log:ro
- ./crowdsec_data:/var/lib/crowdsec/data
- ./crowdsec_config:/etc/crowdsec
networks:
- jellyfin-net
networks:
jellyfin-net:
driver: bridge

View file

@ -0,0 +1,199 @@
services:
jellyfin:
  image: jellyfin/jellyfin:latest
  container_name: jellyfin
  restart: unless-stopped
  group_add:
    - "993" # render group for VAAPI hardware acceleration
  ports:
    - "8096:8096" # HTTP web UI
    - "8920:8920" # HTTPS
    - "7359:7359/udp" # Network discovery
    - "1900:1900/udp" # DLNA
  expose:
    - "8096"
  environment:
    - PUID=1000
    - PGID=1000
    - TZ=Australia/Perth
  volumes:
    # Config on NVMe (fast)
    - ./config:/config
    - ./cache:/cache
    # Media libraries (read-only for safety)
    - "/mnt/media/Movies:/media/movies:ro"
    - "/mnt/media/TV Shows:/media/tv-shows:ro"
    - "/mnt/media/Anime:/media/anime:ro"
    - "/mnt/media/Kids TV:/media/kids-tv:ro"
    - "/mnt/media/Kids Movies:/media/kids-movies:ro"
    - "/tank/home-videos:/media/home-videos:ro"
    # Each bind mount needs its own container path: /tank/videos used to
    # target /media/home-videos as well, which Docker rejects as a duplicate
    # mount point. Point the Jellyfin library for this share at /media/videos.
    - "/tank/videos:/media/videos:ro"
    - "/tank/photos:/media/home-photos:ro"
  devices:
    # Hardware transcoding (Vega graphics)
    - /dev/dri:/dev/dri
  networks:
    - jellyfin-net
qbittorrent:
image: linuxserver/qbittorrent:latest
container_name: qbittorrent
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
WEBUI_PORT: 8090
volumes:
- ~/.docker_volumes/qbittorrent/config:/config
- /mnt/media/downloads:/downloads
- /mnt/media/anime:/animeq
ports:
- "8090:8090"
- "6881:6881"
- "6881:6881/udp"
expose:
- "8090"
restart: unless-stopped
deploy:
resources:
limits:
memory: 1G
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
sonarr:
image: linuxserver/sonarr:latest
container_name: sonarr
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
DOCKER_MODS: "linuxserver/mods:universal-package-install"
INSTALL_PACKAGES: "ffmpeg"
volumes:
- ~/.docker_volumes/sonarr/config:/config
- /mnt/media/Anime:/tv
- ~/media/downloads:/downloads
ports:
- "8989:8989"
expose:
- "8989"
restart: unless-stopped
depends_on:
- qbittorrent
deploy:
resources:
limits:
memory: 512M
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
prowlarr:
image: linuxserver/prowlarr:latest
container_name: prowlarr
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
volumes:
- ~/.docker_volumes/prowlarr/config:/config
ports:
- "9696:9696"
expose:
- "9696"
restart: unless-stopped
depends_on:
- qbittorrent
- sonarr
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9696/ping"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
deploy:
resources:
limits:
memory: 512M
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
jellyseerr:
image: fallenbagel/jellyseerr:latest
container_name: jellyseerr
environment:
PUID: 1000
PGID: 1000
TZ: "${TZ}"
volumes:
- ~/.docker_volumes/jellyseerr/config:/app/config
ports:
- "5055:5055"
expose:
- "5055"
restart: unless-stopped
depends_on:
- jellyfin
- sonarr
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:5055/api/v1/status"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
deploy:
resources:
limits:
memory: 512M
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
caddy:
image: caddy:latest
container_name: caddy
restart: unless-stopped
ports:
- "443:443"
- "443:443/udp" # For HTTP/3 support
volumes:
- ./Caddyfile:/etc/caddy/Caddyfile
- ./caddy_data:/data
- ./caddy_config:/config
networks:
- jellyfin-net
crowdsec:
image: crowdsecurity/crowdsec:latest
container_name: crowdsec
restart: unless-stopped
environment:
- COLLECTIONS=crowdsecurity/linux crowdsecurity/caddy crowdsecurity/base-httping
volumes:
- /var/log:/var/log:ro
- ./crowdsec_data:/var/lib/crowdsec/data
- ./crowdsec_config:/etc/crowdsec
networks:
- jellyfin-net
networks:
jellyfin-net:
driver: bridge

View file

@ -0,0 +1,18 @@
#!/usr/bin/env python3
"""Quick test to verify Data Analysis.py works correctly."""
import os
import subprocess
import sys

PLOTS_DIR = 'plots'


def tail_text(text, limit):
    """Return at most the last *limit* characters of *text*."""
    # Slicing already returns the whole string when it is shorter than limit;
    # limit <= 0 is handled explicitly because text[-0:] is the full string.
    return text[-limit:] if limit > 0 else ""


def list_plot_files(directory=PLOTS_DIR):
    """Return the sorted '.png' file names in *directory*.

    Returns None when *directory* does not exist, so the caller can tell
    "no plots directory" apart from "directory exists but holds no PNGs".
    """
    try:
        names = os.listdir(directory)
    except FileNotFoundError:
        return None
    return sorted(name for name in names if name.endswith('.png'))


def main():
    """Run 'Data Analysis.py', echo the tail of its output, list plot files."""
    result = subprocess.run(
        [sys.executable, 'Data Analysis.py'],
        capture_output=True,
        text=True,
        timeout=30,
    )
    print("STDOUT:")
    print(tail_text(result.stdout, 2000))
    print("\nSTDERR:")
    print(tail_text(result.stderr, 1000))
    print(f"\nExit code: {result.returncode}")

    # Check for plot files
    plot_files = list_plot_files()
    if plot_files is None:
        # Previously os.listdir raised FileNotFoundError here, hiding the
        # exit code and stderr printed above behind a traceback.
        print(f"\nNo '{PLOTS_DIR}' directory found - no plot files were generated.")
        return
    print(f"\nGenerated {len(plot_files)} plot files:")
    for name in plot_files:
        print(f" - {name}")


# Guarded so that importing this module (e.g. pytest collecting test_*.py)
# does not spawn the analysis subprocess.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1 @@
{"version":1,"resource":"file:///home/breadway/Documents/Year%2010/Year%2010/Psychology/test_analysis.py","entries":[{"id":"Xb2Q.py","source":"Chat Edit: 'ensure the graphs being used are appropriate for the study'","timestamp":1774346851499}]}

View file

@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""Print statistics describing the patterns in the generated study data.

Reads ``organization_happiness_study_data.csv`` from the current working
directory and prints per-group summaries, happiness by habits completed,
weekend-vs-weekday adherence, adherence over time, and the day-to-day
happiness correlation for the intervention group.
"""
import pandas as pd
import numpy as np

CSV_PATH = 'organization_happiness_study_data.csv'

# Adherence columns that hold 'Yes'/'No' for each tracked habit.
HABIT_COLUMNS = (
    'Calendar_Adherence',
    'Cleanliness_Adherence',
    'Punctuality_Adherence',
)


def habits_count(frame):
    """Return, per row of *frame*, how many habit columns equal 'Yes'."""
    return sum((frame[column] == 'Yes').astype(int) for column in HABIT_COLUMNS)


def print_group_happiness(group):
    """Print participant count plus mean and std dev of happiness."""
    print(f"Participants: {group['Participant_ID'].nunique()}")
    print(f"Mean Happiness: {group['Happiness'].mean():.2f}")
    print(f"Happiness Std Dev: {group['Happiness'].std():.2f}")


def main():
    """Load the CSV and print every verification section in order."""
    df = pd.read_csv(CSV_PATH)
    print("=" * 70)
    print("DATA GENERATION IMPROVEMENTS VERIFICATION")
    print("=" * 70)
    print(f"\n✓ Dataset shape: {df.shape}")
    print(f"✓ Total rows: {len(df)} (20 participants × 30 days × 2 groups = 1200 expected)")

    print("\n--- Intervention Group Statistics ---")
    # .copy(): columns are added to this frame below; assigning onto a
    # filtered view of df triggers pandas' SettingWithCopyWarning.
    intervention = df[df['Group'] == 'Intervention'].copy()
    print_group_happiness(intervention)
    print(f"Calendar Adherence Rate: {(intervention['Calendar_Adherence'] == 'Yes').mean():.1%}")
    print(f"Cleanliness Adherence Rate: {(intervention['Cleanliness_Adherence'] == 'Yes').mean():.1%}")
    print(f"Punctuality Adherence Rate: {(intervention['Punctuality_Adherence'] == 'Yes').mean():.1%}")

    print("\n--- Control Group Statistics ---")
    control = df[df['Group'] == 'Control']
    print_group_happiness(control)
    print(f"Reported Calendar: {(control['Calendar_Adherence'] == 'Yes').mean():.1%} (should be ~0%)")
    print(f"Reported Cleanliness: {(control['Cleanliness_Adherence'] == 'Yes').mean():.1%} (should be ~0%)")
    print(f"Reported Punctuality: {(control['Punctuality_Adherence'] == 'Yes').mean():.1%} (should be ~0%)")

    print("\n--- Natural Data Patterns ---")
    # Check for habit momentum (persistence)
    intervention['Habits_Count'] = habits_count(intervention)
    print("Habit completion rates by number completed:")
    for count in [0, 1, 2, 3]:
        subset = intervention[intervention['Habits_Count'] == count]
        happiness = subset['Happiness'].mean()
        print(f" {count} habits: Happiness = {happiness:.2f} (n={len(subset)})")

    # Weekend effect: Day % 7 of 0 or 6 is treated as a weekend day.
    intervention['DayOfWeek'] = intervention['Day'] % 7
    is_weekend = intervention['DayOfWeek'].isin([0, 6])
    weekend = intervention[is_weekend]
    weekday = intervention[~is_weekend]
    print("\nWeekend vs Weekday Adherence:")
    # Habits_Count was added before these subsets were taken, so it is
    # reused here instead of re-deriving the same sum from the three columns.
    print(f" Weekday avg habits: {weekday['Habits_Count'].mean():.2f}")
    print(f" Weekend avg habits: {weekend['Habits_Count'].mean():.2f}")

    # Habit formation over time
    first_week = intervention[intervention['Day'] <= 7]
    mid_month = intervention[(intervention['Day'] > 14) & (intervention['Day'] <= 21)]
    last_week = intervention[intervention['Day'] > 23]
    print("\nHabit Formation Over Time:")
    print(f" Days 1-7 (Starting): Avg habits = {first_week['Habits_Count'].mean():.2f}")
    print(f" Days 15-21 (Momentum): Avg habits = {mid_month['Habits_Count'].mean():.2f}")
    print(f" Days 24-30 (Late): Avg habits = {last_week['Habits_Count'].mean():.2f}")

    print("\nHappiness Persistence (day-to-day correlation):")
    intervention_sorted = intervention.sort_values(['Participant_ID', 'Day'])
    # shift(1) within each participant pairs every day with the previous day;
    # each participant's first day has no predecessor and is dropped below.
    intervention_sorted['Happiness_prev'] = (
        intervention_sorted.groupby('Participant_ID')['Happiness'].shift(1)
    )
    valid = intervention_sorted[intervention_sorted['Happiness_prev'].notna()]
    corr = valid[['Happiness', 'Happiness_prev']].corr().iloc[0, 1]
    print(f" Correlation between today and yesterday's happiness: {corr:.3f}")

    print("\n✓ Data generation complete with natural patterns!")
    print("\nKey improvements:")
    print(" • Habit momentum: doing it yesterday makes it more likely today")
    print(" • Weekly patterns: lower adherence weekends vs weekdays")
    print(" • Habit formation: initial difficulty, momentum building, slight fatigue")
    print(" • Individual variation: each person has unique habit profiles")
    print(" • Happiness persistence: today's mood influenced by yesterday's")
    print(" • Control group realism: still report 'No' but data shows natural variation")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1 @@
{"version":1,"resource":"file:///home/breadway/Documents/Year%2010/Year%2010/Psychology/test_improvements.py","entries":[{"id":"bWGM.py","source":"Chat Edit: 'improve data gen to create more natural data'","timestamp":1774347099618}]}

View file

@ -0,0 +1 @@
{"version":1,"resource":"file:///home/breadway/Documents/Year%2010/Year%2010/Psychology/run_and_verify.py","entries":[{"id":"qMUb.py","source":"Chat Edit: 'can you ensure the data shows an upward trend in happiness as the study goes on, and in direct correlation with the habits completed by that participant? at the moment, the intervention group is happier after a single day.'","timestamp":1774347459417}]}

View file

@ -0,0 +1,45 @@
#!/usr/bin/env python3
"""Generate new data and display sample showing upward trend"""
import subprocess
import sys

import pandas as pd

CSV_PATH = 'organization_happiness_study_data.csv'

# Adherence columns that hold 'Yes'/'No' for each tracked habit.
HABIT_COLUMNS = (
    'Calendar_Adherence',
    'Cleanliness_Adherence',
    'Punctuality_Adherence',
)


def happiness_change(group, early_last_day=7, late_first_day=24):
    """Return ``(early_mean, late_mean, late_mean - early_mean)`` of Happiness.

    "Early" rows have Day <= *early_last_day*; "late" rows have
    Day >= *late_first_day*. *group* needs 'Day' and 'Happiness' columns.
    """
    early_mean = group[group['Day'] <= early_last_day]['Happiness'].mean()
    late_mean = group[group['Day'] >= late_first_day]['Happiness'].mean()
    return early_mean, late_mean, late_mean - early_mean


def main():
    """Run the data generator, then print the trend analysis of its CSV."""
    # Run data generator with the interpreter running this script, so it uses
    # the same environment rather than whatever 'python3' is on PATH.
    result = subprocess.run(
        [sys.executable, 'Data Gen.py'], capture_output=True, text=True
    )
    print(result.stdout)
    if result.stderr:
        print("Errors:", result.stderr)
    if result.returncode != 0:
        # Stop here: analysing a CSV left over from an earlier run would
        # report results that do not come from the generator just executed.
        sys.exit(result.returncode)

    # Load and display trend analysis
    df = pd.read_csv(CSV_PATH)
    df['Habits_Count'] = sum(
        (df[column] == 'Yes').astype(int) for column in HABIT_COLUMNS
    )
    intervention = df[df['Group'] == 'Intervention']
    control = df[df['Group'] == 'Control']

    print("\n" + "="*70)
    print("UPWARD TREND ANALYSIS")
    print("="*70)

    print("\n[INTERVENTION GROUP] - Should show upward trend")
    early, late, change = happiness_change(intervention)
    print(f"Days 1-7: Avg Happiness = {early:.2f}")
    print(f"Days 24-30: Avg Happiness = {late:.2f}")
    # ':+.2f' prints the real sign; a hard-coded '+' prefix rendered a
    # decline as '+-0.50'.
    print(f"GROWTH: {change:+.2f} points\n")

    print("[CONTROL GROUP] - Should show flat/random pattern")
    early, late, change = happiness_change(control)
    print(f"Days 1-7: Avg Happiness = {early:.2f}")
    print(f"Days 24-30: Avg Happiness = {late:.2f}")
    print(f"CHANGE: {change:+.2f} points\n")

    print("[HABIT CORRELATION] - More habits = Higher happiness")
    for habits in range(4):
        subset = intervention[intervention['Habits_Count'] == habits]
        # Skip counts that never occur so no 'nan' mean is printed.
        if len(subset) > 0:
            print(f"{habits} habits/day: Avg Happiness = {subset['Happiness'].mean():.2f} ({len(subset)} observations)")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,196 @@
import pandas as pd
import numpy as np
# Seed NumPy's global RNG once; every np.random call in this module draws
# from that single stream, so the order of calls determines the output.
np.random.seed(42) # ensures you get exactly the same data every time
# Number of simulated participants generated for each group.
N_PARTICIPANTS_PER_GROUP = 20
# Study day numbers, 1 through 30 inclusive.
DAYS = list(range(1, 31))
def clip_yes_prob(prob, ceiling):
    """Clamp *prob* to at least 0.05 and at most *ceiling*.

    The floor is applied first and the ceiling second, so the ceiling wins
    if it is ever below 0.05.
    """
    floored = prob if prob > 0.05 else 0.05
    if floored < ceiling:
        return floored
    return ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit adherence and happiness for the intervention group.

    Generates N_PARTICIPANTS_PER_GROUP participants with consecutive ids
    starting at *start_participant_id*, one row per participant per entry
    of DAYS.

    Returns a list of rows of the form
    ``[participant_id, 'Intervention', day, calendar, clean, ontime, happiness]``
    where the three habit values are 'Yes'/'No' and happiness is an int
    clipped to 1..10.

    All randomness comes from the global ``np.random`` stream; the order of
    draws (per participant: four ease/bias normals, the happiness baseline;
    per day: three habit choices, then the noise normal) is part of the
    reproducible output.
    """
    # Change applied to habit_strength, indexed by habits completed that day
    # (0 habits decays it, 1-3 habits build it up).
    strength_step = (-0.2, 0.15, 0.35, 0.6)

    records = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset

        # Persistent per-person organization tendency, kept within 0.1..0.95.
        org_bias = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)

        # Per-habit ease: the shared tendency plus a habit-specific offset.
        calendar_ease = org_bias + np.random.normal(0.05, 0.08)
        clean_ease = org_bias + np.random.normal(-0.02, 0.08)
        ontime_ease = org_bias + np.random.normal(0.02, 0.08)

        # (ease, bonus when the habit was done yesterday, probability ceiling)
        # in calendar / clean / on-time order.
        habit_specs = (
            (calendar_ease, 0.15, 0.95),
            (clean_ease, 0.15, 0.90),
            (ontime_ease, 0.12, 0.93),
        )

        person_happiness_baseline = np.random.normal(4.8, 1.1)
        habit_strength = 0.0  # cumulative measure of consistent completion
        yesterday = ['No', 'No', 'No']

        for day in DAYS:
            # Days with day % 7 of 0 or 6 count as weekend and are harder.
            week_difficulty = 0.75 if (day % 7) in [0, 6] else 1.0

            # Harder while starting out, easier mid-study, slight dip late.
            if day >= 20:
                time_factor = 0.98
            elif day >= 7:
                time_factor = 1.1
            else:
                time_factor = 0.85

            # Draw today's habits in calendar, clean, on-time order.
            today = []
            for (ease, momentum_bonus, ceiling), done_before in zip(habit_specs, yesterday):
                yes_prob = clip_yes_prob(
                    ease * week_difficulty * time_factor +
                    (momentum_bonus if done_before == 'Yes' else 0),
                    ceiling,
                )
                today.append(
                    np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob])
                )

            adherence_count = today.count('Yes')

            # Accumulate (or decay) habit strength, kept within 0..4.
            habit_strength = np.clip(
                habit_strength + strength_step[adherence_count], 0, 4
            )

            # The weight on habit_strength grows as the study progresses.
            study_progress = day / 30.0
            daily_noise = np.random.normal(0, 0.7)
            happiness_value = (
                person_happiness_baseline +
                habit_strength * (0.5 + study_progress) +
                daily_noise
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            records.append([participant_id, 'Intervention', day, *today, happiness])

            # Today's outcomes feed tomorrow's momentum bonus.
            yesterday = today
    return records
def generate_control_group(start_participant_id):
    """Simulate daily happiness rows for the control (non-tracking) group.

    Each participant gets a persistent organisation tendency and a personal
    happiness baseline. On every study day three "untracked" habits are drawn
    at random; they add a small boost to that day's happiness but are always
    written to the output as 'No' because the control group does not track
    habits. There is no cumulative term, so happiness has no built-in trend.

    Args:
        start_participant_id: ID given to the first control participant;
            the remaining participants are numbered consecutively from it.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]``, one per participant per day in ``DAYS``; ``happiness``
        is an int clipped to 1..10.
    """
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset

        # Persistent per-person organisation tendency, kept within [0.05, 0.7].
        natural_org = np.random.normal(0.3, 0.15)
        natural_org = np.clip(natural_org, 0.05, 0.7)

        # Per-person happiness level around which daily scores fluctuate.
        # NOTE(review): the original comment called this the "same baseline as
        # intervention"; confirm against generate_intervention_group.
        person_happiness_baseline = np.random.normal(4.8, 1.3)

        for day in DAYS:
            # Days with day % 7 in {0, 6} are treated as weekends (slightly lower odds).
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9
            # Very weak upward drift in habit odds over the study.
            time_factor = 1.0 + (day / 100) * 0.1

            # Compute each probability once; it is reused for both outcomes.
            calendar_prob = clip_yes_prob(natural_org * 0.8 * week_factor * time_factor, 0.4)
            clean_prob = clip_yes_prob(natural_org * 0.75 * week_factor * time_factor, 0.35)
            ontime_prob = clip_yes_prob(natural_org * 0.85 * week_factor * time_factor, 0.45)

            # Draw order (calendar, clean, on-time) is kept so seeded output is unchanged.
            calendar_untracked = np.random.choice(['Yes', 'No'], p=[calendar_prob, 1 - calendar_prob])
            clean_untracked = np.random.choice(['Yes', 'No'], p=[clean_prob, 1 - clean_prob])
            ontime_untracked = np.random.choice(['Yes', 'No'], p=[ontime_prob, 1 - ontime_prob])

            # Untracked habits contribute only 0.1 happiness points each.
            untracked_count = sum(x == 'Yes' for x in [calendar_untracked, clean_untracked, ontime_untracked])
            subtle_boost = untracked_count * 0.1

            daily_noise = np.random.normal(0, 1.2)
            happiness_value = (
                person_happiness_baseline +
                subtle_boost +
                daily_noise
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            rows.append([
                participant_id,
                'Control',
                day,
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                happiness,
            ])
    return rows
# Assemble the dataset: intervention participants first, then controls
# numbered immediately after them.
data = [
    *generate_intervention_group(start_participant_id=1),
    *generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1),
]

# One row per participant-day; the column order follows the row lists
# appended by the generator functions.
df = pd.DataFrame(
    data,
    columns=[
        'Participant_ID', 'Group', 'Day',
        'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence',
        'Happiness',
    ],
)

# Write the CSV into the current working directory, then preview it.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,200 @@
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so repeated runs produce the same dataset.
np.random.seed(3)

# Study design: participants per group and the study days 1..30 inclusive.
N_PARTICIPANTS_PER_GROUP = 40
DAYS = [*range(1, 31)]
def clip_yes_prob(prob, ceiling):
    """Clamp a raw 'Yes' probability: floor it at 0.05, then cap it at ``ceiling``.

    Args:
        prob: Unclamped probability (may fall outside 0..1).
        ceiling: Upper limit applied after the 0.05 floor.

    Returns:
        ``prob`` raised to at least 0.05 and then lowered to at most ``ceiling``.
    """
    floored = prob if prob > 0.05 else 0.05
    return floored if floored < ceiling else ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit and happiness rows for the intervention group.

    Every participant has a persistent organisation bias, a per-habit ease
    derived from it, and a personal happiness baseline. Each day the three
    habits are drawn with probabilities that depend on the day of the week,
    the study phase and whether the same habit was done the day before.
    Happiness is the baseline plus a same-day bonus (0.6 per habit), a
    cumulative habit-strength bonus and Gaussian noise, rounded and clipped
    to 1..10.

    Args:
        start_participant_id: ID given to the first participant; the rest
            are numbered consecutively from it.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]``, one per participant per day in ``DAYS``.
    """
    # Daily change in habit strength, keyed by how many habits were completed.
    strength_delta = {3: 0.6, 2: 0.35, 1: 0.15, 0: -0.2}
    # Per-habit settings in (calendar, clean, on-time) order.
    momentum_bonus = (0.15, 0.15, 0.12)
    prob_ceiling = (0.95, 0.90, 0.93)

    def phase_multiplier(day_number):
        # First week is harder, the middle stretch easier, the end slightly fatigued.
        if day_number < 7:
            return 0.85
        if day_number < 20:
            return 1.1
        return 0.98

    records = []
    for index in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + index

        # Persistent organisation tendency, kept within [0.1, 0.95].
        org_bias = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)

        # Per-habit ease; draws happen in calendar, clean, on-time order.
        calendar_ease = org_bias + np.random.normal(0.05, 0.08)
        clean_ease = org_bias + np.random.normal(-0.02, 0.08)
        ontime_ease = org_bias + np.random.normal(0.02, 0.08)
        eases = (calendar_ease, clean_ease, ontime_ease)

        person_happiness_baseline = np.random.normal(4.0, 1.0)
        habit_strength = 0.0
        previous = ('No', 'No', 'No')  # nothing was done before day 1

        for day in DAYS:
            # Days with day % 7 in {0, 6} are treated as weekends (harder).
            week_difficulty = 0.75 if day % 7 in (0, 6) else 1.0
            time_factor = phase_multiplier(day)

            today = []
            for ease, momentum, ceiling, yesterday in zip(eases, momentum_bonus, prob_ceiling, previous):
                carry = momentum if yesterday == 'Yes' else 0
                yes_prob = clip_yes_prob(ease * week_difficulty * time_factor + carry, ceiling)
                today.append(np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]))
            calendar, clean, ontime = today

            adherence_count = sum(x == 'Yes' for x in today)

            # Habit strength accumulates with completions and is kept in [0, 5].
            habit_strength += strength_delta[adherence_count]
            habit_strength = np.clip(habit_strength, 0, 5)

            study_progress = day / 30.0  # 1/30 on day 1 up to 1.0 on day 30
            daily_noise = np.random.normal(0, 0.35)
            daily_habit_bonus = adherence_count * 0.6  # 0 to 1.8
            # Multiplier grows from about 0.41 to 0.6, so this bonus is at most 3.0.
            cumulative_bonus = habit_strength * (0.4 + study_progress * 0.2)

            happiness_value = (
                person_happiness_baseline +
                daily_habit_bonus +
                cumulative_bonus +
                daily_noise
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            records.append([participant_id, 'Intervention', day, calendar, clean, ontime, happiness])
            previous = (calendar, clean, ontime)
    return records
def generate_control_group(start_participant_id):
    """Simulate daily happiness rows for the control (non-tracking) group.

    Each participant gets a persistent organisation tendency and a personal
    happiness baseline. On every study day three "untracked" habits are drawn
    at random; they add a small boost to that day's happiness but are always
    written to the output as 'No' because the control group does not track
    habits. There is no cumulative term, so happiness has no built-in trend.

    Args:
        start_participant_id: ID given to the first control participant;
            the remaining participants are numbered consecutively from it.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]``, one per participant per day in ``DAYS``; ``happiness``
        is an int clipped to 1..10.
    """
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset

        # Persistent per-person organisation tendency, kept within [0.05, 0.7].
        natural_org = np.random.normal(0.3, 0.15)
        natural_org = np.clip(natural_org, 0.05, 0.7)

        # Control baseline is centred at 5.1 (the intervention group in this
        # file starts from 4.0), so the two groups do not share a baseline.
        person_happiness_baseline = np.random.normal(5.1, 0.9)

        for day in DAYS:
            # Days with day % 7 in {0, 6} are treated as weekends (slightly lower odds).
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9
            # Very weak upward drift in habit odds over the study.
            time_factor = 1.0 + (day / 100) * 0.1

            # Compute each probability once; it is reused for both outcomes.
            calendar_prob = clip_yes_prob(natural_org * 0.8 * week_factor * time_factor, 0.4)
            clean_prob = clip_yes_prob(natural_org * 0.75 * week_factor * time_factor, 0.35)
            ontime_prob = clip_yes_prob(natural_org * 0.85 * week_factor * time_factor, 0.45)

            # Draw order (calendar, clean, on-time) is kept so seeded output is unchanged.
            calendar_untracked = np.random.choice(['Yes', 'No'], p=[calendar_prob, 1 - calendar_prob])
            clean_untracked = np.random.choice(['Yes', 'No'], p=[clean_prob, 1 - clean_prob])
            ontime_untracked = np.random.choice(['Yes', 'No'], p=[ontime_prob, 1 - ontime_prob])

            # Untracked habits contribute only 0.1 happiness points each.
            untracked_count = sum(x == 'Yes' for x in [calendar_untracked, clean_untracked, ontime_untracked])
            subtle_boost = untracked_count * 0.1

            daily_noise = np.random.normal(0, 1.0)
            happiness_value = (
                person_happiness_baseline +
                subtle_boost +
                daily_noise
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            rows.append([
                participant_id,
                'Control',
                day,
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                happiness,
            ])
    return rows
# Assemble the dataset: intervention participants first, then controls
# numbered immediately after them.
data = [
    *generate_intervention_group(start_participant_id=1),
    *generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1),
]

# One row per participant-day; the column order follows the row lists
# appended by the generator functions.
df = pd.DataFrame(
    data,
    columns=[
        'Participant_ID', 'Group', 'Day',
        'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence',
        'Happiness',
    ],
)

# Write the CSV into the current working directory, then preview it.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,93 @@
import pandas as pd
import numpy as np

# NOTE: a stray `df = pd.DataFrame(data, columns=[...])` used to sit above the
# imports. It ran before `pd` and `data` existed (NameError on the first line)
# and listed only six columns for seven-field rows. The DataFrame is built
# once, after the data has been generated, near the end of the script.

# Seed NumPy's global RNG so repeated runs produce the same dataset.
np.random.seed(42)  # ensures you get exactly the same data every time

# Study design: participants per group and the study days 1..30 inclusive.
N_PARTICIPANTS_PER_GROUP = 20
DAYS = list(range(1, 31))
def clip_yes_prob(prob, ceiling):
    """Clamp a raw 'Yes' probability: floor it at 0.05, then cap it at ``ceiling``.

    Args:
        prob: Unclamped probability (may fall outside 0..1).
        ceiling: Upper limit applied after the 0.05 floor.

    Returns:
        ``prob`` raised to at least 0.05 and then lowered to at most ``ceiling``.
    """
    floored = prob if prob > 0.05 else 0.05
    return floored if floored < ceiling else ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit and happiness rows for the intervention group.

    Each participant has one persistent organisation bias that sets the
    chance of completing each of the three habits on any day. Daily happiness
    is drawn around a freshly sampled baseline plus 1.1 points per completed
    habit, then rounded to the nearest integer and clipped to 1..10.

    Args:
        start_participant_id: ID given to the first participant; the rest
            are numbered consecutively from it.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]``, one per participant per day in ``DAYS``.
    """
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset
        org_bias = np.random.normal(0.7, 0.15)  # each person has their own organization tendency

        # The probabilities depend only on org_bias, so compute them once per
        # participant instead of twice per habit per day.
        calendar_prob = clip_yes_prob(org_bias + 0.1, 0.95)
        clean_prob = clip_yes_prob(org_bias, 0.90)
        ontime_prob = clip_yes_prob(org_bias + 0.05, 0.92)

        for day in DAYS:
            calendar = np.random.choice(['Yes', 'No'], p=[calendar_prob, 1 - calendar_prob])
            clean = np.random.choice(['Yes', 'No'], p=[clean_prob, 1 - clean_prob])
            ontime = np.random.choice(['Yes', 'No'], p=[ontime_prob, 1 - ontime_prob])
            adherence_count = sum(x == 'Yes' for x in [calendar, clean, ontime])

            baseline_happiness = np.random.normal(5.5, 1.0)
            raw_happiness = np.random.normal(baseline_happiness + adherence_count * 1.1, 1.2)
            # Round before converting: a bare int() truncates toward zero, which
            # lowers every score by about half a point and makes 10 reachable
            # only through clipping.
            happiness = int(np.clip(np.round(raw_happiness), 1, 10))

            rows.append([
                participant_id,
                'Intervention',
                day,
                calendar,
                clean,
                ontime,
                happiness,
            ])
    return rows
def generate_control_group(start_participant_id):
    """Simulate daily happiness rows for the control group.

    The control group records happiness only, so all three habit columns are
    always 'No'. Daily happiness is a freshly sampled baseline plus Gaussian
    noise, rounded to the nearest integer and clipped to 1..10.

    Args:
        start_participant_id: ID given to the first control participant;
            the remaining participants are numbered consecutively from it.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]``, one per participant per day in ``DAYS``.
    """
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset
        for day in DAYS:
            # Control group only records happiness; all habit columns are No.
            calendar = 'No'
            clean = 'No'
            ontime = 'No'

            baseline_happiness = np.random.normal(5.5, 1.0)
            control_noise = np.random.normal(0.0, 1.1)
            # Round before converting: a bare int() truncates toward zero, which
            # lowers every score by about half a point.
            happiness = int(np.clip(np.round(baseline_happiness + control_noise), 1, 10))

            rows.append([
                participant_id,
                'Control',
                day,
                calendar,
                clean,
                ontime,
                happiness,
            ])
    return rows
# Assemble the dataset: intervention participants first, then controls
# numbered immediately after them.
data = [
    *generate_intervention_group(start_participant_id=1),
    *generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1),
]

# One row per participant-day; the column order follows the row lists
# appended by the generator functions.
df = pd.DataFrame(
    data,
    columns=[
        'Participant_ID', 'Group', 'Day',
        'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence',
        'Happiness',
    ],
)

# Write the CSV into the current working directory, then preview it.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,180 @@
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so repeated runs produce the same dataset.
np.random.seed(64)

# Study design: participants per group and the study days 1..30 inclusive.
N_PARTICIPANTS_PER_GROUP = 20
DAYS = [*range(1, 31)]
def clip_yes_prob(prob, ceiling):
    """Clamp a raw 'Yes' probability: floor it at 0.05, then cap it at ``ceiling``.

    Args:
        prob: Unclamped probability (may fall outside 0..1).
        ceiling: Upper limit applied after the 0.05 floor.

    Returns:
        ``prob`` raised to at least 0.05 and then lowered to at most ``ceiling``.
    """
    floored = prob if prob > 0.05 else 0.05
    return floored if floored < ceiling else ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit and happiness rows for the intervention group.

    Every participant has a persistent organisation bias, a per-habit ease
    derived from it, and a personal happiness baseline. Each day the three
    habits are drawn with probabilities that depend on the day of the week,
    the study phase and whether the same habit was done the day before.
    Happiness carries over between days: today's value blends 40% of
    yesterday's value, 40% of the personal baseline, a habit boost and
    Gaussian noise, and is clipped to 1..10 before rounding.

    Args:
        start_participant_id: ID given to the first participant; the rest
            are numbered consecutively from it.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]``, one per participant per day in ``DAYS``.
    """
    # Per-habit settings in (calendar, clean, on-time) order.
    momentum_bonus = (0.15, 0.15, 0.12)
    prob_ceiling = (0.95, 0.90, 0.93)

    def phase_multiplier(day_number):
        # First week is harder, the middle stretch easier, the end slightly fatigued.
        if day_number < 7:
            return 0.85
        if day_number < 20:
            return 1.1
        return 0.98

    records = []
    for index in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + index

        # Persistent organisation tendency, kept within [0.1, 0.95].
        org_bias = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)

        # Per-habit ease; draws happen in calendar, clean, on-time order.
        calendar_ease = org_bias + np.random.normal(0.05, 0.08)
        clean_ease = org_bias + np.random.normal(-0.02, 0.08)
        ontime_ease = org_bias + np.random.normal(0.02, 0.08)
        eases = (calendar_ease, clean_ease, ontime_ease)

        person_happiness_baseline = np.random.normal(5.5, 1.2)
        current_happiness = person_happiness_baseline
        previous = ('No', 'No', 'No')  # nothing was done before day 1

        for day in DAYS:
            # Days with day % 7 in {0, 6} are treated as weekends (harder).
            week_difficulty = 0.75 if day % 7 in (0, 6) else 1.0
            time_factor = phase_multiplier(day)

            today = []
            for ease, momentum, ceiling, yesterday in zip(eases, momentum_bonus, prob_ceiling, previous):
                carry = momentum if yesterday == 'Yes' else 0
                yes_prob = clip_yes_prob(ease * week_difficulty * time_factor + carry, ceiling)
                today.append(np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]))
            calendar, clean, ontime = today

            # 1.2 points per completed habit, scaled by 0.9 in the blend below.
            adherence_count = sum(x == 'Yes' for x in today)
            habit_boost = adherence_count * 1.2 if adherence_count > 0 else 0

            happiness_noise = np.random.normal(0, 1.3)
            blended = (
                current_happiness * 0.4 +
                person_happiness_baseline * 0.4 +
                habit_boost * 0.9 +
                happiness_noise
            )
            # The clipped, unrounded value is what carries into the next day.
            current_happiness = np.clip(blended, 1, 10)
            happiness = int(np.round(current_happiness))

            records.append([participant_id, 'Intervention', day, calendar, clean, ontime, happiness])
            previous = (calendar, clean, ontime)
    return records
def generate_control_group(start_participant_id):
    """Simulate daily happiness rows for the control (non-tracking) group.

    Each participant gets a persistent organisation tendency and a personal
    happiness baseline. On every study day three "untracked" habits are drawn
    at random; they subtly raise happiness but are always written to the
    output as 'No' because the control group does not track habits.
    Happiness carries over between days: today's value blends half of
    yesterday's value, half of the personal baseline, the untracked-habit
    boost and Gaussian noise, and is clipped to 1..10 before rounding.

    Args:
        start_participant_id: ID given to the first control participant;
            the remaining participants are numbered consecutively from it.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]``, one per participant per day in ``DAYS``.
    """
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset

        # Persistent per-person organisation tendency, kept within [0.05, 0.7].
        natural_org = np.random.normal(0.3, 0.15)
        natural_org = np.clip(natural_org, 0.05, 0.7)

        # Slightly lower baseline than the intervention group's 5.5 in this file.
        person_happiness_baseline = np.random.normal(5.0, 1.3)
        current_happiness = person_happiness_baseline

        for day in DAYS:
            # Days with day % 7 in {0, 6} are treated as weekends (slightly lower odds).
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9
            # Very weak upward drift in habit odds over the study.
            time_factor = 1.0 + (day / 100) * 0.1

            # Compute each probability once; it is reused for both outcomes.
            calendar_prob = clip_yes_prob(natural_org * 0.8 * week_factor * time_factor, 0.4)
            clean_prob = clip_yes_prob(natural_org * 0.75 * week_factor * time_factor, 0.35)
            ontime_prob = clip_yes_prob(natural_org * 0.85 * week_factor * time_factor, 0.45)

            # Draw order (calendar, clean, on-time) is kept so seeded output is unchanged.
            calendar_untracked = np.random.choice(['Yes', 'No'], p=[calendar_prob, 1 - calendar_prob])
            clean_untracked = np.random.choice(['Yes', 'No'], p=[clean_prob, 1 - clean_prob])
            ontime_untracked = np.random.choice(['Yes', 'No'], p=[ontime_prob, 1 - ontime_prob])

            # Untracked habits still affect happiness subtly: 0.5 points each.
            untracked_count = sum(x == 'Yes' for x in [calendar_untracked, clean_untracked, ontime_untracked])
            subtle_boost = untracked_count * 0.5

            happiness_noise = np.random.normal(0, 1.6)
            # The clipped, unrounded value is what carries into the next day.
            current_happiness = np.clip(
                current_happiness * 0.5 +
                person_happiness_baseline * 0.5 +
                subtle_boost +
                happiness_noise,
                1, 10
            )
            happiness = int(np.round(current_happiness))

            rows.append([
                participant_id,
                'Control',
                day,
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                happiness,
            ])
    return rows
# Assemble the dataset: intervention participants first, then controls
# numbered immediately after them.
data = [
    *generate_intervention_group(start_participant_id=1),
    *generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1),
]

# One row per participant-day; the column order follows the row lists
# appended by the generator functions.
df = pd.DataFrame(
    data,
    columns=[
        'Participant_ID', 'Group', 'Day',
        'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence',
        'Happiness',
    ],
)

# Write the CSV into the current working directory, then preview it.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,200 @@
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so repeated runs produce the same dataset.
np.random.seed(42)

# Study design: participants per group and the study days 1..30 inclusive.
N_PARTICIPANTS_PER_GROUP = 40
DAYS = [*range(1, 31)]
def clip_yes_prob(prob, ceiling):
    """Clamp a raw 'Yes' probability: floor it at 0.05, then cap it at ``ceiling``.

    Args:
        prob: Unclamped probability (may fall outside 0..1).
        ceiling: Upper limit applied after the 0.05 floor.

    Returns:
        ``prob`` raised to at least 0.05 and then lowered to at most ``ceiling``.
    """
    floored = prob if prob > 0.05 else 0.05
    return floored if floored < ceiling else ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit and happiness rows for the intervention group.

    Every participant has a persistent organisation bias, a per-habit ease
    derived from it, and a personal happiness baseline. Each day the three
    habits are drawn with probabilities that depend on the day of the week,
    the study phase and whether the same habit was done the day before.
    Happiness is the baseline plus a same-day bonus (0.6 per habit), a
    cumulative habit-strength bonus and Gaussian noise, rounded and clipped
    to 1..10.

    Args:
        start_participant_id: ID given to the first participant; the rest
            are numbered consecutively from it.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]``, one per participant per day in ``DAYS``.
    """
    # Daily change in habit strength, keyed by how many habits were completed.
    strength_delta = {3: 0.6, 2: 0.35, 1: 0.15, 0: -0.2}
    # Per-habit settings in (calendar, clean, on-time) order.
    momentum_bonus = (0.15, 0.15, 0.12)
    prob_ceiling = (0.95, 0.90, 0.93)

    def phase_multiplier(day_number):
        # First week is harder, the middle stretch easier, the end slightly fatigued.
        if day_number < 7:
            return 0.85
        if day_number < 20:
            return 1.1
        return 0.98

    records = []
    for index in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + index

        # Persistent organisation tendency, kept within [0.1, 0.95].
        org_bias = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)

        # Per-habit ease; draws happen in calendar, clean, on-time order.
        calendar_ease = org_bias + np.random.normal(0.05, 0.08)
        clean_ease = org_bias + np.random.normal(-0.02, 0.08)
        ontime_ease = org_bias + np.random.normal(0.02, 0.08)
        eases = (calendar_ease, clean_ease, ontime_ease)

        person_happiness_baseline = np.random.normal(4.0, 1.0)
        habit_strength = 0.0
        previous = ('No', 'No', 'No')  # nothing was done before day 1

        for day in DAYS:
            # Days with day % 7 in {0, 6} are treated as weekends (harder).
            week_difficulty = 0.75 if day % 7 in (0, 6) else 1.0
            time_factor = phase_multiplier(day)

            today = []
            for ease, momentum, ceiling, yesterday in zip(eases, momentum_bonus, prob_ceiling, previous):
                carry = momentum if yesterday == 'Yes' else 0
                yes_prob = clip_yes_prob(ease * week_difficulty * time_factor + carry, ceiling)
                today.append(np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]))
            calendar, clean, ontime = today

            adherence_count = sum(x == 'Yes' for x in today)

            # Habit strength accumulates with completions and is kept in [0, 5].
            habit_strength += strength_delta[adherence_count]
            habit_strength = np.clip(habit_strength, 0, 5)

            study_progress = day / 30.0  # 1/30 on day 1 up to 1.0 on day 30
            daily_noise = np.random.normal(0, 0.35)
            daily_habit_bonus = adherence_count * 0.6  # 0 to 1.8
            # Multiplier grows from about 0.41 to 0.6, so this bonus is at most 3.0.
            cumulative_bonus = habit_strength * (0.4 + study_progress * 0.2)

            happiness_value = (
                person_happiness_baseline +
                daily_habit_bonus +
                cumulative_bonus +
                daily_noise
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            records.append([participant_id, 'Intervention', day, calendar, clean, ontime, happiness])
            previous = (calendar, clean, ontime)
    return records
def generate_control_group(start_participant_id):
    """Simulate daily happiness rows for the control (non-tracking) group.

    Each participant gets a persistent organisation tendency and a personal
    happiness baseline. On every study day three "untracked" habits are drawn
    at random; they add a small boost to that day's happiness but are always
    written to the output as 'No' because the control group does not track
    habits. There is no cumulative term, so happiness has no built-in trend.

    Args:
        start_participant_id: ID given to the first control participant;
            the remaining participants are numbered consecutively from it.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]``, one per participant per day in ``DAYS``; ``happiness``
        is an int clipped to 1..10.
    """
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset

        # Persistent per-person organisation tendency, kept within [0.05, 0.7].
        natural_org = np.random.normal(0.3, 0.15)
        natural_org = np.clip(natural_org, 0.05, 0.7)

        # Control baseline is centred at 4.9 (the intervention group in this
        # file starts from 4.0), so the two groups do not share a baseline.
        person_happiness_baseline = np.random.normal(4.9, 0.9)

        for day in DAYS:
            # Days with day % 7 in {0, 6} are treated as weekends (slightly lower odds).
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9
            # Very weak upward drift in habit odds over the study.
            time_factor = 1.0 + (day / 100) * 0.1

            # Compute each probability once; it is reused for both outcomes.
            calendar_prob = clip_yes_prob(natural_org * 0.8 * week_factor * time_factor, 0.4)
            clean_prob = clip_yes_prob(natural_org * 0.75 * week_factor * time_factor, 0.35)
            ontime_prob = clip_yes_prob(natural_org * 0.85 * week_factor * time_factor, 0.45)

            # Draw order (calendar, clean, on-time) is kept so seeded output is unchanged.
            calendar_untracked = np.random.choice(['Yes', 'No'], p=[calendar_prob, 1 - calendar_prob])
            clean_untracked = np.random.choice(['Yes', 'No'], p=[clean_prob, 1 - clean_prob])
            ontime_untracked = np.random.choice(['Yes', 'No'], p=[ontime_prob, 1 - ontime_prob])

            # Untracked habits contribute only 0.1 happiness points each.
            untracked_count = sum(x == 'Yes' for x in [calendar_untracked, clean_untracked, ontime_untracked])
            subtle_boost = untracked_count * 0.1

            daily_noise = np.random.normal(0, 1.0)
            happiness_value = (
                person_happiness_baseline +
                subtle_boost +
                daily_noise
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            rows.append([
                participant_id,
                'Control',
                day,
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                happiness,
            ])
    return rows
# Assemble the dataset: intervention participants first, then controls
# numbered immediately after them.
data = [
    *generate_intervention_group(start_participant_id=1),
    *generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1),
]

# One row per participant-day; the column order follows the row lists
# appended by the generator functions.
df = pd.DataFrame(
    data,
    columns=[
        'Participant_ID', 'Group', 'Day',
        'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence',
        'Happiness',
    ],
)

# Write the CSV into the current working directory, then preview it.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,200 @@
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so repeated runs produce the same dataset.
np.random.seed(42)

# Study design: participants per group and the study days 1..30 inclusive.
N_PARTICIPANTS_PER_GROUP = 40
DAYS = [*range(1, 31)]
def clip_yes_prob(prob, ceiling):
    """Clamp a raw 'Yes' probability: floor it at 0.05, then cap it at ``ceiling``.

    Args:
        prob: Unclamped probability (may fall outside 0..1).
        ceiling: Upper limit applied after the 0.05 floor.

    Returns:
        ``prob`` raised to at least 0.05 and then lowered to at most ``ceiling``.
    """
    floored = prob if prob > 0.05 else 0.05
    return floored if floored < ceiling else ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit and happiness rows for the intervention group.

    Every participant has a persistent organisation bias, a per-habit ease
    derived from it, and a personal happiness baseline. Each day the three
    habits are drawn with probabilities that depend on the day of the week,
    the study phase and whether the same habit was done the day before.
    Happiness is the baseline plus a same-day bonus (0.6 per habit), a
    cumulative habit-strength bonus and Gaussian noise, rounded and clipped
    to 1..10.

    Args:
        start_participant_id: ID given to the first participant; the rest
            are numbered consecutively from it.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]``, one per participant per day in ``DAYS``.
    """
    # Daily change in habit strength, keyed by how many habits were completed.
    strength_delta = {3: 0.6, 2: 0.35, 1: 0.15, 0: -0.2}
    # Per-habit settings in (calendar, clean, on-time) order.
    momentum_bonus = (0.15, 0.15, 0.12)
    prob_ceiling = (0.95, 0.90, 0.93)

    def phase_multiplier(day_number):
        # First week is harder, the middle stretch easier, the end slightly fatigued.
        if day_number < 7:
            return 0.85
        if day_number < 20:
            return 1.1
        return 0.98

    records = []
    for index in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + index

        # Persistent organisation tendency, kept within [0.1, 0.95].
        org_bias = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)

        # Per-habit ease; draws happen in calendar, clean, on-time order.
        calendar_ease = org_bias + np.random.normal(0.05, 0.08)
        clean_ease = org_bias + np.random.normal(-0.02, 0.08)
        ontime_ease = org_bias + np.random.normal(0.02, 0.08)
        eases = (calendar_ease, clean_ease, ontime_ease)

        person_happiness_baseline = np.random.normal(4.0, 1.0)
        habit_strength = 0.0
        previous = ('No', 'No', 'No')  # nothing was done before day 1

        for day in DAYS:
            # Days with day % 7 in {0, 6} are treated as weekends (harder).
            week_difficulty = 0.75 if day % 7 in (0, 6) else 1.0
            time_factor = phase_multiplier(day)

            today = []
            for ease, momentum, ceiling, yesterday in zip(eases, momentum_bonus, prob_ceiling, previous):
                carry = momentum if yesterday == 'Yes' else 0
                yes_prob = clip_yes_prob(ease * week_difficulty * time_factor + carry, ceiling)
                today.append(np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]))
            calendar, clean, ontime = today

            adherence_count = sum(x == 'Yes' for x in today)

            # Habit strength accumulates with completions and is kept in [0, 5].
            habit_strength += strength_delta[adherence_count]
            habit_strength = np.clip(habit_strength, 0, 5)

            study_progress = day / 30.0  # 1/30 on day 1 up to 1.0 on day 30
            daily_noise = np.random.normal(0, 0.35)
            daily_habit_bonus = adherence_count * 0.6  # 0 to 1.8
            # Multiplier grows from about 0.41 to 0.6, so this bonus is at most 3.0.
            cumulative_bonus = habit_strength * (0.4 + study_progress * 0.2)

            happiness_value = (
                person_happiness_baseline +
                daily_habit_bonus +
                cumulative_bonus +
                daily_noise
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            records.append([participant_id, 'Intervention', day, calendar, clean, ontime, happiness])
            previous = (calendar, clean, ontime)
    return records
def generate_control_group(start_participant_id):
    """Simulate daily happiness rows for the control (non-tracking) group.

    Each participant gets a persistent organisation tendency and a personal
    happiness baseline. On every study day three "untracked" habits are drawn
    at random; they add a small boost to that day's happiness but are always
    written to the output as 'No' because the control group does not track
    habits. There is no cumulative term, so happiness has no built-in trend.

    Args:
        start_participant_id: ID given to the first control participant;
            the remaining participants are numbered consecutively from it.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]``, one per participant per day in ``DAYS``; ``happiness``
        is an int clipped to 1..10.
    """
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset

        # Persistent per-person organisation tendency, kept within [0.05, 0.7].
        natural_org = np.random.normal(0.3, 0.15)
        natural_org = np.clip(natural_org, 0.05, 0.7)

        # Same baseline distribution as the intervention group in this file
        # (mean 4.0, sd 1.0), so neither group starts with an advantage.
        person_happiness_baseline = np.random.normal(4.0, 1.0)

        for day in DAYS:
            # Days with day % 7 in {0, 6} are treated as weekends (slightly lower odds).
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9
            # Very weak upward drift in habit odds over the study.
            time_factor = 1.0 + (day / 100) * 0.1

            # Compute each probability once; it is reused for both outcomes.
            calendar_prob = clip_yes_prob(natural_org * 0.8 * week_factor * time_factor, 0.4)
            clean_prob = clip_yes_prob(natural_org * 0.75 * week_factor * time_factor, 0.35)
            ontime_prob = clip_yes_prob(natural_org * 0.85 * week_factor * time_factor, 0.45)

            # Draw order (calendar, clean, on-time) is kept so seeded output is unchanged.
            calendar_untracked = np.random.choice(['Yes', 'No'], p=[calendar_prob, 1 - calendar_prob])
            clean_untracked = np.random.choice(['Yes', 'No'], p=[clean_prob, 1 - clean_prob])
            ontime_untracked = np.random.choice(['Yes', 'No'], p=[ontime_prob, 1 - ontime_prob])

            # Untracked habits contribute only 0.1 happiness points each.
            untracked_count = sum(x == 'Yes' for x in [calendar_untracked, clean_untracked, ontime_untracked])
            subtle_boost = untracked_count * 0.1

            daily_noise = np.random.normal(0, 1.2)
            happiness_value = (
                person_happiness_baseline +
                subtle_boost +
                daily_noise
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            rows.append([
                participant_id,
                'Control',
                day,
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                happiness,
            ])
    return rows
# Assemble the dataset: intervention participants first, then controls
# numbered immediately after them.
data = [
    *generate_intervention_group(start_participant_id=1),
    *generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1),
]

# One row per participant-day; the column order follows the row lists
# appended by the generator functions.
df = pd.DataFrame(
    data,
    columns=[
        'Participant_ID', 'Group', 'Day',
        'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence',
        'Happiness',
    ],
)

# Write the CSV into the current working directory, then preview it.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,92 @@
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so repeated runs produce the same dataset.
np.random.seed(42)

# Study design: participants per group and the study days 1..30 inclusive.
N_PARTICIPANTS_PER_GROUP = 20
DAYS = [*range(1, 31)]
def clip_yes_prob(prob, ceiling):
    """Clamp a raw 'Yes' probability: floor it at 0.05, then cap it at ``ceiling``.

    Args:
        prob: Unclamped probability (may fall outside 0..1).
        ceiling: Upper limit applied after the 0.05 floor.

    Returns:
        ``prob`` raised to at least 0.05 and then lowered to at most ``ceiling``.
    """
    floored = prob if prob > 0.05 else 0.05
    return floored if floored < ceiling else ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit adherence and happiness for the intervention group.

    Each participant gets one organisation tendency that fixes the "Yes"
    probability of the three habits for the whole study. Every day the three
    habits are drawn as 'Yes'/'No' and happiness is drawn around a fresh daily
    baseline plus 1.1 points per completed habit.

    Args:
        start_participant_id: ID of the first participant; the rest are
            numbered consecutively.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]``.
    """
    records = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        pid = start_participant_id + offset
        tendency = np.random.normal(0.7, 0.15)  # personal organisation tendency
        # "Yes" probabilities for calendar, cleanliness and punctuality. They
        # depend only on the participant, so they are computed once here.
        yes_probs = (
            clip_yes_prob(tendency + 0.1, 0.95),
            clip_yes_prob(tendency, 0.90),
            clip_yes_prob(tendency + 0.05, 0.92),
        )
        for day in DAYS:
            # Draw order (calendar, clean, ontime) is part of the seeded output.
            calendar, clean, ontime = (
                np.random.choice(['Yes', 'No'], p=[p_yes, 1 - p_yes])
                for p_yes in yes_probs
            )
            habits_done = [calendar, clean, ontime].count('Yes')
            mood_today = np.random.normal(5.5, 1.0)
            raw_score = np.random.normal(mood_today + habits_done * 1.1, 1.2)
            # int() truncates the clipped value toward zero (no rounding).
            happiness = int(np.clip(raw_score, 1, 10))
            records.append([pid, 'Intervention', day, calendar, clean, ontime, happiness])
    return records
def generate_control_group(start_participant_id):
    """Simulate daily happiness for the control group.

    Control participants record only happiness, so the three habit columns
    are always 'No'. Each day's score is a fresh baseline draw plus a noise
    draw, clipped to 1-10 and truncated to an integer.

    Args:
        start_participant_id: ID of the first control participant; the rest
            are numbered consecutively.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]``.
    """
    records = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        pid = start_participant_id + offset
        for day in DAYS:
            mood = np.random.normal(5.5, 1.0)
            jitter = np.random.normal(0.0, 1.1)
            # int() truncates the clipped value toward zero (no rounding).
            score = int(np.clip(mood + jitter, 1, 10))
            records.append([pid, 'Control', day, 'No', 'No', 'No', score])
    return records
# Build the combined dataset: intervention rows first, then control rows, so
# the seeded random stream is consumed in a fixed order.
data = generate_intervention_group(start_participant_id=1)
data += generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1)
df = pd.DataFrame(
    data,
    columns=[
        'Participant_ID',
        'Group',
        'Day',
        'Calendar_Adherence',
        'Cleanliness_Adherence',
        'Punctuality_Adherence',
        'Happiness',
    ],
)
# Persist the dataset as CSV (without the index column) and print a preview.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,200 @@
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so every run produces the same dataset.
np.random.seed(43) # ensures you get exactly the same data every time
# Number of participants generated for each group (intervention and control).
N_PARTICIPANTS_PER_GROUP = 40
# Study days, numbered 1 through 30 inclusive.
DAYS = list(range(1, 31))
def clip_yes_prob(prob, ceiling):
    """Return ``prob`` limited to at least 0.05 and at most ``ceiling``.

    The floor is applied before the ceiling, so a ceiling below 0.05 is
    returned unchanged.
    """
    lower_bounded = max(0.05, prob)
    if lower_bounded < ceiling:
        return lower_bounded
    return ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit adherence and happiness for the intervention group.

    Every participant gets a persistent organisation tendency, a per-habit
    ease derived from it, and a personal happiness baseline. For each day in
    DAYS the three habits are drawn as 'Yes'/'No'; a habit done the previous
    day raises that habit's "Yes" probability. Happiness is the baseline plus
    an immediate per-habit bonus, a cumulative habit-strength bonus and
    Gaussian noise, rounded and clipped to 1-10.

    Args:
        start_participant_id: ID of the first participant; the rest are
            numbered consecutively.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]``.
    """
    # Change applied to habit_strength for 0-3 habits completed in a day.
    strength_step = {0: -0.2, 1: 0.15, 2: 0.35, 3: 0.6}
    records = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        pid = start_participant_id + offset
        # The order of the random draws is part of the seeded output; keep it.
        tendency = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)
        # (ease, momentum bonus, probability ceiling) for the calendar,
        # cleanliness and punctuality habits, in that order.
        habits = (
            (tendency + np.random.normal(0.05, 0.08), 0.15, 0.95),
            (tendency + np.random.normal(-0.02, 0.08), 0.15, 0.90),
            (tendency + np.random.normal(0.02, 0.08), 0.12, 0.93),
        )
        baseline = np.random.normal(4.0, 1.0)  # personal starting happiness
        habit_strength = 0.0  # running measure of consistent completion, kept in [0, 5]
        yesterday = ['No', 'No', 'No']
        for day in DAYS:
            # Days where day % 7 is 0 or 6 are treated as weekend days (harder).
            weekend_scale = 0.75 if day % 7 in (0, 6) else 1.0
            # Study phase: slow start, easier middle, slight fatigue at the end.
            if day >= 20:
                phase_scale = 0.98
            elif day >= 7:
                phase_scale = 1.1
            else:
                phase_scale = 0.85
            today = []
            for (ease, momentum, ceiling), previous in zip(habits, yesterday):
                carry_over = momentum if previous == 'Yes' else 0
                p_yes = clip_yes_prob(ease * weekend_scale * phase_scale + carry_over, ceiling)
                today.append(np.random.choice(['Yes', 'No'], p=[p_yes, 1 - p_yes]))
            completed = sum(answer == 'Yes' for answer in today)
            habit_strength = np.clip(habit_strength + strength_step[int(completed)], 0, 5)
            progress = day / 30.0  # fraction of the study elapsed
            noise = np.random.normal(0, 0.35)
            immediate_bonus = completed * 0.6  # 0 to 1.8 from today's habits
            # At most 5 * (0.4 + 0.2) = 3.0 on the last day at full strength.
            cumulative_bonus = habit_strength * (0.4 + progress * 0.2)
            score = baseline + immediate_bonus + cumulative_bonus + noise
            happiness = int(np.clip(np.round(score), 1, 10))
            records.append([pid, 'Intervention', day, *today, happiness])
            yesterday = today
    return records
def generate_control_group(start_participant_id):
    """Simulate daily happiness for the control group, which tracks no habits.

    Each participant gets a natural organisation level and a personal
    happiness baseline drawn from N(5.1, 0.9). Habits are still simulated
    internally each day, but they are always reported as 'No' and add only
    0.1 happiness points each. Happiness is the baseline plus that small
    boost plus Gaussian noise, rounded and clipped to 1-10; there is no
    cumulative term, so there is no systematic trend over the study.

    Args:
        start_participant_id: ID of the first control participant; the rest
            are numbered consecutively.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]``.
    """
    # (multiplier on natural_org, probability ceiling) for the untracked
    # calendar, cleanliness and punctuality habits, in draw order.
    habit_specs = ((0.8, 0.4), (0.75, 0.35), (0.85, 0.45))
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset
        # Some people are naturally more organised even without tracking.
        natural_org = np.clip(np.random.normal(0.3, 0.15), 0.05, 0.7)
        person_happiness_baseline = np.random.normal(5.1, 0.9)
        for day in DAYS:
            # Days where day % 7 is 0 or 6 are treated as weekend days.
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9
            # Very weak habituation: grows linearly to 1.03 on day 30.
            time_factor = 1.0 + (day / 100) * 0.1
            untracked_count = 0
            for scale, ceiling in habit_specs:
                # Compute each probability once and reuse it for both outcomes.
                yes_prob = clip_yes_prob(natural_org * scale * week_factor * time_factor, ceiling)
                if np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]) == 'Yes':
                    untracked_count += 1
            subtle_boost = untracked_count * 0.1  # tiny effect of untracked habits
            daily_noise = np.random.normal(0, 1.0)
            happiness_value = person_happiness_baseline + subtle_boost + daily_noise
            happiness = int(np.clip(np.round(happiness_value), 1, 10))
            # Habit columns are always reported as 'No' for the control group.
            rows.append([participant_id, 'Control', day, 'No', 'No', 'No', happiness])
    return rows
# Assemble the combined dataset. The intervention rows are generated first so
# that the seeded random stream is consumed in the same order on every run.
data = [
    *generate_intervention_group(start_participant_id=1),
    *generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1),
]
df = pd.DataFrame(
    data,
    columns=[
        'Participant_ID',
        'Group',
        'Day',
        'Calendar_Adherence',
        'Cleanliness_Adherence',
        'Punctuality_Adherence',
        'Happiness',
    ],
)
# Write the dataset next to the script (no index column) and show a preview.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,180 @@
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so every run produces the same dataset.
np.random.seed(42) # ensures you get exactly the same data every time
# Number of participants generated for each group (intervention and control).
N_PARTICIPANTS_PER_GROUP = 20
# Study days, numbered 1 through 30 inclusive.
DAYS = list(range(1, 31))
def clip_yes_prob(prob, ceiling):
    """Clamp a "Yes" probability to the range [0.05, ceiling].

    The 0.05 floor is applied first, so a ceiling below 0.05 wins over it.
    """
    floored = prob if prob > 0.05 else 0.05
    return floored if floored < ceiling else ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit adherence and happiness for the intervention group.

    Every participant gets a persistent organisation tendency, a per-habit
    ease derived from it, and a personal happiness baseline. For each day in
    DAYS the three habits are drawn as 'Yes'/'No'; a habit done the previous
    day raises that habit's "Yes" probability. Happiness carries over from
    day to day: 40% of yesterday's value, 40% of the baseline, a boost of
    1.2 * 0.9 per completed habit and Gaussian noise, clipped to 1-10.

    Args:
        start_participant_id: ID of the first participant; the rest are
            numbered consecutively.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]``.
    """
    def draw(p_yes):
        # One 'Yes'/'No' draw with probability p_yes of 'Yes'.
        return np.random.choice(['Yes', 'No'], p=[p_yes, 1 - p_yes])

    records = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        pid = start_participant_id + offset
        # The order of the random draws is part of the seeded output; keep it.
        tendency = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)
        calendar_ease = tendency + np.random.normal(0.05, 0.08)
        clean_ease = tendency + np.random.normal(-0.02, 0.08)
        ontime_ease = tendency + np.random.normal(0.02, 0.08)
        baseline = np.random.normal(5.5, 1.2)
        mood = baseline  # running happiness, starts at the personal baseline
        did_calendar = did_clean = did_ontime = False
        for day in DAYS:
            # Days where day % 7 is 0 or 6 are treated as weekend days (harder).
            weekend_scale = 0.75 if day % 7 in (0, 6) else 1.0
            # Study phase: slow start, easier middle, slight fatigue at the end.
            if day >= 20:
                phase_scale = 0.98
            elif day >= 7:
                phase_scale = 1.1
            else:
                phase_scale = 0.85
            # Momentum: a habit done yesterday is more likely to be done today.
            calendar = draw(clip_yes_prob(
                calendar_ease * weekend_scale * phase_scale + (0.15 if did_calendar else 0), 0.95))
            clean = draw(clip_yes_prob(
                clean_ease * weekend_scale * phase_scale + (0.15 if did_clean else 0), 0.90))
            ontime = draw(clip_yes_prob(
                ontime_ease * weekend_scale * phase_scale + (0.12 if did_ontime else 0), 0.93))
            did_calendar = calendar == 'Yes'
            did_clean = clean == 'Yes'
            did_ontime = ontime == 'Yes'
            completed = sum((did_calendar, did_clean, did_ontime))
            noise = np.random.normal(0, 1.3)
            # The 0.4 + 0.4 weights sum to 0.8, so without habits the running
            # value settles below the baseline.
            mood = np.clip(
                mood * 0.4 + baseline * 0.4 + completed * 1.2 * 0.9 + noise,
                1, 10,
            )
            records.append([pid, 'Intervention', day, calendar, clean, ontime, int(np.round(mood))])
    return records
def generate_control_group(start_participant_id):
    """Simulate daily happiness for the control group, which tracks no habits.

    Each participant gets a natural organisation level and a personal
    happiness baseline drawn from N(5.0, 1.3). Habits are still simulated
    internally each day, but they are always reported as 'No' and add 0.5
    happiness points each. Happiness carries over from day to day: half of
    yesterday's value, half of the baseline, the untracked-habit boost and
    Gaussian noise, clipped to 1-10.

    Args:
        start_participant_id: ID of the first control participant; the rest
            are numbered consecutively.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]``.
    """
    # (multiplier on natural_org, probability ceiling) for the untracked
    # calendar, cleanliness and punctuality habits, in draw order.
    habit_specs = ((0.8, 0.4), (0.75, 0.35), (0.85, 0.45))
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset
        # Some people are naturally more organised even without tracking.
        natural_org = np.clip(np.random.normal(0.3, 0.15), 0.05, 0.7)
        person_happiness_baseline = np.random.normal(5.0, 1.3)
        current_happiness = person_happiness_baseline
        for day in DAYS:
            # Days where day % 7 is 0 or 6 are treated as weekend days.
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9
            # Very weak habituation: grows linearly to 1.03 on day 30.
            time_factor = 1.0 + (day / 100) * 0.1
            untracked_count = 0
            for scale, ceiling in habit_specs:
                # Compute each probability once and reuse it for both outcomes.
                yes_prob = clip_yes_prob(natural_org * scale * week_factor * time_factor, ceiling)
                if np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]) == 'Yes':
                    untracked_count += 1
            # Untracked habits still affect happiness, but less than tracked ones.
            subtle_boost = untracked_count * 0.5
            happiness_noise = np.random.normal(0, 1.6)
            current_happiness = np.clip(
                current_happiness * 0.5 +
                person_happiness_baseline * 0.5 +
                subtle_boost +
                happiness_noise,
                1, 10
            )
            happiness = int(np.round(current_happiness))
            # Habit columns are always reported as 'No' for the control group.
            rows.append([participant_id, 'Control', day, 'No', 'No', 'No', happiness])
    return rows
# Build the combined dataset: intervention rows first, then control rows, so
# the seeded random stream is consumed in a fixed order.
data = generate_intervention_group(start_participant_id=1)
data += generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1)
df = pd.DataFrame(
    data,
    columns=[
        'Participant_ID',
        'Group',
        'Day',
        'Calendar_Adherence',
        'Cleanliness_Adherence',
        'Punctuality_Adherence',
        'Happiness',
    ],
)
# Persist the dataset as CSV (without the index column) and print a preview.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,197 @@
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so every run produces the same dataset.
np.random.seed(42) # ensures you get exactly the same data every time
# Number of participants generated for each group (intervention and control).
N_PARTICIPANTS_PER_GROUP = 40
# Study days, numbered 1 through 30 inclusive.
DAYS = list(range(1, 31))
def clip_yes_prob(prob, ceiling):
    """Return ``prob`` limited to at least 0.05 and at most ``ceiling``.

    The floor is applied before the ceiling, so a ceiling below 0.05 is
    returned unchanged.
    """
    lower_bounded = max(0.05, prob)
    if lower_bounded < ceiling:
        return lower_bounded
    return ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit adherence and happiness for the intervention group.

    Every participant gets a persistent organisation tendency, a per-habit
    ease derived from it, and a personal happiness baseline. For each day in
    DAYS the three habits are drawn as 'Yes'/'No'; a habit done the previous
    day raises that habit's "Yes" probability. Happiness is the baseline plus
    a cumulative habit-strength term whose weight grows over the study, plus
    Gaussian noise, rounded and clipped to 1-10.

    Args:
        start_participant_id: ID of the first participant; the rest are
            numbered consecutively.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]``.
    """
    def draw(p_yes):
        # One 'Yes'/'No' draw with probability p_yes of 'Yes'.
        return np.random.choice(['Yes', 'No'], p=[p_yes, 1 - p_yes])

    def momentum(previous, bonus):
        # Extra probability when the habit was completed the day before.
        return bonus if previous == 'Yes' else 0

    # Change applied to habit_strength for 0-3 habits completed in a day.
    strength_step = {0: -0.2, 1: 0.15, 2: 0.35, 3: 0.6}
    records = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        pid = start_participant_id + offset
        # The order of the random draws is part of the seeded output; keep it.
        tendency = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)
        calendar_ease = tendency + np.random.normal(0.05, 0.08)
        clean_ease = tendency + np.random.normal(-0.02, 0.08)
        ontime_ease = tendency + np.random.normal(0.02, 0.08)
        baseline = np.random.normal(4.0, 1.0)  # personal starting happiness
        habit_strength = 0.0  # running measure of consistent completion, kept in [0, 5]
        last_calendar = last_clean = last_ontime = 'No'
        for day in DAYS:
            # Days where day % 7 is 0 or 6 are treated as weekend days (harder).
            weekend_scale = 0.75 if day % 7 in (0, 6) else 1.0
            # Study phase: slow start, easier middle, slight fatigue at the end.
            if day >= 20:
                phase_scale = 0.98
            elif day >= 7:
                phase_scale = 1.1
            else:
                phase_scale = 0.85
            calendar = draw(clip_yes_prob(
                calendar_ease * weekend_scale * phase_scale + momentum(last_calendar, 0.15), 0.95))
            clean = draw(clip_yes_prob(
                clean_ease * weekend_scale * phase_scale + momentum(last_clean, 0.15), 0.90))
            ontime = draw(clip_yes_prob(
                ontime_ease * weekend_scale * phase_scale + momentum(last_ontime, 0.12), 0.93))
            completed = sum(answer == 'Yes' for answer in (calendar, clean, ontime))
            habit_strength = np.clip(habit_strength + strength_step[int(completed)], 0, 5)
            progress = day / 30.0  # fraction of the study elapsed
            noise = np.random.normal(0, 0.4)
            # The habit term is at most 5 * (0.9 + 0.3) = 6.0 on the last day
            # at full strength; the final score is still clipped to 1-10.
            score = baseline + habit_strength * (0.9 + progress * 0.3) + noise
            happiness = int(np.clip(np.round(score), 1, 10))
            records.append([pid, 'Intervention', day, calendar, clean, ontime, happiness])
            last_calendar, last_clean, last_ontime = calendar, clean, ontime
    return records
def generate_control_group(start_participant_id):
    """Simulate daily happiness for the control group, which tracks no habits.

    Each participant gets a natural organisation level and a personal
    happiness baseline drawn from N(4.0, 1.0). Habits are still simulated
    internally each day, but they are always reported as 'No' and add only
    0.1 happiness points each. Happiness is the baseline plus that small
    boost plus Gaussian noise, rounded and clipped to 1-10; there is no
    cumulative term, so there is no systematic trend over the study.

    Args:
        start_participant_id: ID of the first control participant; the rest
            are numbered consecutively.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]``.
    """
    # (multiplier on natural_org, probability ceiling) for the untracked
    # calendar, cleanliness and punctuality habits, in draw order.
    habit_specs = ((0.8, 0.4), (0.75, 0.35), (0.85, 0.45))
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset
        # Some people are naturally more organised even without tracking.
        natural_org = np.clip(np.random.normal(0.3, 0.15), 0.05, 0.7)
        person_happiness_baseline = np.random.normal(4.0, 1.0)
        for day in DAYS:
            # Days where day % 7 is 0 or 6 are treated as weekend days.
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9
            # Very weak habituation: grows linearly to 1.03 on day 30.
            time_factor = 1.0 + (day / 100) * 0.1
            untracked_count = 0
            for scale, ceiling in habit_specs:
                # Compute each probability once and reuse it for both outcomes.
                yes_prob = clip_yes_prob(natural_org * scale * week_factor * time_factor, ceiling)
                if np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]) == 'Yes':
                    untracked_count += 1
            subtle_boost = untracked_count * 0.1  # tiny effect of untracked habits
            daily_noise = np.random.normal(0, 1.2)
            happiness_value = person_happiness_baseline + subtle_boost + daily_noise
            happiness = int(np.clip(np.round(happiness_value), 1, 10))
            # Habit columns are always reported as 'No' for the control group.
            rows.append([participant_id, 'Control', day, 'No', 'No', 'No', happiness])
    return rows
# Assemble the combined dataset. The intervention rows are generated first so
# that the seeded random stream is consumed in the same order on every run.
data = [
    *generate_intervention_group(start_participant_id=1),
    *generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1),
]
df = pd.DataFrame(
    data,
    columns=[
        'Participant_ID',
        'Group',
        'Day',
        'Calendar_Adherence',
        'Cleanliness_Adherence',
        'Punctuality_Adherence',
        'Happiness',
    ],
)
# Write the dataset next to the script (no index column) and show a preview.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,197 @@
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so every run produces the same dataset.
np.random.seed(64) # ensures you get exactly the same data every time
# Number of participants generated for each group (intervention and control).
N_PARTICIPANTS_PER_GROUP = 20
# Study days, numbered 1 through 30 inclusive.
DAYS = list(range(1, 31))
def clip_yes_prob(prob, ceiling):
    """Clamp a "Yes" probability to the range [0.05, ceiling].

    The 0.05 floor is applied first, so a ceiling below 0.05 wins over it.
    """
    floored = prob if prob > 0.05 else 0.05
    return floored if floored < ceiling else ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit adherence and happiness for the intervention group.

    Every participant gets a persistent organisation tendency, a per-habit
    ease derived from it, and a personal happiness baseline. For each day in
    DAYS the three habits are drawn as 'Yes'/'No'; a habit done the previous
    day raises that habit's "Yes" probability. Happiness is the baseline plus
    a cumulative habit-strength term whose weight grows over the study, plus
    Gaussian noise, rounded and clipped to 1-10.

    Args:
        start_participant_id: ID of the first participant; the rest are
            numbered consecutively.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]``.
    """
    # Change applied to habit_strength for 0-3 habits completed in a day.
    strength_step = {0: -0.2, 1: 0.15, 2: 0.35, 3: 0.6}
    records = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        pid = start_participant_id + offset
        # The order of the random draws is part of the seeded output; keep it.
        tendency = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)
        # (ease, momentum bonus, probability ceiling) for the calendar,
        # cleanliness and punctuality habits, in that order.
        habits = (
            (tendency + np.random.normal(0.05, 0.08), 0.15, 0.95),
            (tendency + np.random.normal(-0.02, 0.08), 0.15, 0.90),
            (tendency + np.random.normal(0.02, 0.08), 0.12, 0.93),
        )
        baseline = np.random.normal(4.8, 1.1)  # personal starting happiness
        habit_strength = 0.0  # running measure of consistent completion, kept in [0, 4]
        yesterday = ['No', 'No', 'No']
        for day in DAYS:
            # Days where day % 7 is 0 or 6 are treated as weekend days (harder).
            weekend_scale = 0.75 if day % 7 in (0, 6) else 1.0
            # Study phase: slow start, easier middle, slight fatigue at the end.
            if day >= 20:
                phase_scale = 0.98
            elif day >= 7:
                phase_scale = 1.1
            else:
                phase_scale = 0.85
            today = []
            for (ease, momentum, ceiling), previous in zip(habits, yesterday):
                carry_over = momentum if previous == 'Yes' else 0
                p_yes = clip_yes_prob(ease * weekend_scale * phase_scale + carry_over, ceiling)
                today.append(np.random.choice(['Yes', 'No'], p=[p_yes, 1 - p_yes]))
            completed = sum(answer == 'Yes' for answer in today)
            habit_strength = np.clip(habit_strength + strength_step[int(completed)], 0, 4)
            progress = day / 30.0  # fraction of the study elapsed
            noise = np.random.normal(0, 0.7)
            # The habit term is at most 4 * (0.5 + 1.0) = 6.0 on the last day
            # at full strength; the final score is still clipped to 1-10.
            score = baseline + habit_strength * (0.5 + progress) + noise
            happiness = int(np.clip(np.round(score), 1, 10))
            records.append([pid, 'Intervention', day, *today, happiness])
            yesterday = today
    return records
def generate_control_group(start_participant_id):
    """Simulate daily happiness for the control group, which tracks no habits.

    Each participant gets a natural organisation level and a personal
    happiness baseline drawn from N(5.0, 1.3). Habits are still simulated
    internally each day, but they are always reported as 'No' and add 0.5
    happiness points each. Happiness carries over from day to day: half of
    yesterday's value, half of the baseline, the untracked-habit boost and
    Gaussian noise, clipped to 1-10.

    Args:
        start_participant_id: ID of the first control participant; the rest
            are numbered consecutively.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]``.
    """
    # (multiplier on natural_org, probability ceiling) for the untracked
    # calendar, cleanliness and punctuality habits, in draw order.
    habit_specs = ((0.8, 0.4), (0.75, 0.35), (0.85, 0.45))
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset
        # Some people are naturally more organised even without tracking.
        natural_org = np.clip(np.random.normal(0.3, 0.15), 0.05, 0.7)
        person_happiness_baseline = np.random.normal(5.0, 1.3)
        current_happiness = person_happiness_baseline
        for day in DAYS:
            # Days where day % 7 is 0 or 6 are treated as weekend days.
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9
            # Very weak habituation: grows linearly to 1.03 on day 30.
            time_factor = 1.0 + (day / 100) * 0.1
            untracked_count = 0
            for scale, ceiling in habit_specs:
                # Compute each probability once and reuse it for both outcomes.
                yes_prob = clip_yes_prob(natural_org * scale * week_factor * time_factor, ceiling)
                if np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]) == 'Yes':
                    untracked_count += 1
            # Untracked habits still affect happiness, but less than tracked ones.
            subtle_boost = untracked_count * 0.5
            happiness_noise = np.random.normal(0, 1.6)
            current_happiness = np.clip(
                current_happiness * 0.5 +
                person_happiness_baseline * 0.5 +
                subtle_boost +
                happiness_noise,
                1, 10
            )
            happiness = int(np.round(current_happiness))
            # Habit columns are always reported as 'No' for the control group.
            rows.append([participant_id, 'Control', day, 'No', 'No', 'No', happiness])
    return rows
# Build the combined dataset: intervention rows first, then control rows, so
# the seeded random stream is consumed in a fixed order.
data = generate_intervention_group(start_participant_id=1)
data += generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1)
df = pd.DataFrame(
    data,
    columns=[
        'Participant_ID',
        'Group',
        'Day',
        'Calendar_Adherence',
        'Cleanliness_Adherence',
        'Punctuality_Adherence',
        'Happiness',
    ],
)
# Persist the dataset as CSV (without the index column) and print a preview.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,144 @@
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so every run produces the same dataset.
np.random.seed(42) # ensures you get exactly the same data every time
# Number of participants generated for each group (intervention and control).
N_PARTICIPANTS_PER_GROUP = 20
# Study days, numbered 1 through 30 inclusive.
DAYS = list(range(1, 31))
def clip_yes_prob(prob, ceiling):
    """Return ``prob`` limited to at least 0.05 and at most ``ceiling``.

    The floor is applied before the ceiling, so a ceiling below 0.05 is
    returned unchanged.
    """
    lower_bounded = max(0.05, prob)
    if lower_bounded < ceiling:
        return lower_bounded
    return ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit adherence and happiness for the intervention group.

    Every participant gets a persistent organisation tendency, a per-habit
    ease derived from it, and a personal happiness baseline. For each day in
    DAYS the three habits are drawn as 'Yes'/'No'; a habit done the previous
    day raises that habit's "Yes" probability. Happiness carries over from
    day to day: 40% of yesterday's value, 40% of the baseline, a boost of
    1.2 * 0.9 per completed habit and Gaussian noise, clipped to 1-10.

    Args:
        start_participant_id: ID of the first participant; the rest are
            numbered consecutively.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]``.
    """
    records = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        pid = start_participant_id + offset
        # The order of the random draws is part of the seeded output; keep it.
        tendency = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)
        # (ease, momentum bonus, probability ceiling) for the calendar,
        # cleanliness and punctuality habits, in that order.
        habits = (
            (tendency + np.random.normal(0.05, 0.08), 0.15, 0.95),
            (tendency + np.random.normal(-0.02, 0.08), 0.15, 0.90),
            (tendency + np.random.normal(0.02, 0.08), 0.12, 0.93),
        )
        baseline = np.random.normal(5.5, 1.2)
        mood = baseline  # running happiness, starts at the personal baseline
        yesterday = ['No', 'No', 'No']
        for day in DAYS:
            # Days where day % 7 is 0 or 6 are treated as weekend days (harder).
            weekend_scale = 0.75 if day % 7 in (0, 6) else 1.0
            # Study phase: slow start, easier middle, slight fatigue at the end.
            if day >= 20:
                phase_scale = 0.98
            elif day >= 7:
                phase_scale = 1.1
            else:
                phase_scale = 0.85
            today = []
            for (ease, momentum, ceiling), previous in zip(habits, yesterday):
                carry_over = momentum if previous == 'Yes' else 0
                p_yes = clip_yes_prob(ease * weekend_scale * phase_scale + carry_over, ceiling)
                today.append(np.random.choice(['Yes', 'No'], p=[p_yes, 1 - p_yes]))
            completed = sum(answer == 'Yes' for answer in today)
            noise = np.random.normal(0, 1.3)
            # The 0.4 + 0.4 weights sum to 0.8, so without habits the running
            # value settles below the baseline.
            mood = np.clip(
                mood * 0.4 + baseline * 0.4 + completed * 1.2 * 0.9 + noise,
                1, 10,
            )
            records.append([pid, 'Intervention', day, *today, int(np.round(mood))])
            yesterday = today
    return records
def generate_control_group(start_participant_id):
    """Simulate daily happiness for the control group.

    Control participants record only happiness, so the three habit columns
    are always 'No'. Each day's score is a fresh baseline draw plus a noise
    draw, rounded to the nearest integer and clipped to 1-10.

    Args:
        start_participant_id: ID of the first control participant; the rest
            are numbered consecutively.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]``.
    """
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset
        for day in DAYS:
            baseline_happiness = np.random.normal(5.5, 1.0)
            control_noise = np.random.normal(0.0, 1.1)
            # Round before converting: int() alone truncates toward zero, which
            # would bias control scores about half a point low relative to the
            # intervention group, whose scores are rounded.
            happiness = int(np.clip(np.round(baseline_happiness + control_noise), 1, 10))
            # Habit columns are always reported as 'No' for the control group.
            rows.append([participant_id, 'Control', day, 'No', 'No', 'No', happiness])
    return rows
# Assemble the combined dataset. The intervention rows are generated first so
# that the seeded random stream is consumed in the same order on every run.
data = [
    *generate_intervention_group(start_participant_id=1),
    *generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1),
]
df = pd.DataFrame(
    data,
    columns=[
        'Participant_ID',
        'Group',
        'Day',
        'Calendar_Adherence',
        'Cleanliness_Adherence',
        'Punctuality_Adherence',
        'Happiness',
    ],
)
# Write the dataset next to the script (no index column) and show a preview.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1 @@
{"version":1,"resource":"file:///home/breadway/Documents/Year%2010/Year%2010/Psychology/Data%20Gen.py","entries":[{"id":"54EK.py","source":"Chat Edit: 'improve data gen.py to add a second dataset as a control. for context, the study tracks the affects of being organised on how happy participants feel. there needs to be a control group that is only recording their happiness daily. the main group will try to record their happiness, will add all events to their calendar, be on time to every event, and clean their bedroom everyday. they report if they do any of these in the study data as a yes or no. the control group will not do any of these.'","timestamp":1774345349390},{"id":"PpFf.py","source":"Chat Edit: 'improve data gen.py to add a second dataset as a control. for context, the study tracks the affects of being organised on how happy participants feel. there needs to be a control group that is only recording their happiness daily. the main group will try to record their happiness, will add all events to their calendar, be on time to every event, and clean their bedroom everyday. they report if they do any of these in the study data as a yes or no. the control group will not do any of these.'","timestamp":1774345378739},{"id":"cTNf.py","source":"Chat Edit: 'improve data gen to create more natural data'","timestamp":1774347044805},{"id":"WSl3.py","source":"Chat Edit: 'improve data gen to create more natural data'","timestamp":1774347057825},{"id":"9dqp.py","timestamp":1774347206509},{"id":"blt8.py","source":"Chat Edit: 'can you ensure the data shows an upward trend in happiness as the study goes on, and in direct correlation with the habits completed by that participant? at the moment, the intervention group is happier after a single day.'","timestamp":1774347345483},{"id":"3jGE.py","source":"Chat Edit: 'can you ensure the data shows an upward trend in happiness as the study goes on, and in direct correlation with the habits completed by that participant? 
at the moment, the intervention group is happier after a single day.'","timestamp":1774347365731},{"id":"sBVR.py","source":"Chat Edit: 'can you ensure the data shows an upward trend in happiness as the study goes on, and in direct correlation with the habits completed by that participant? at the moment, the intervention group is happier after a single day.'","timestamp":1774347432858},{"id":"j9Wc.py","source":"Chat Edit: 'the happiness results after 30 days seem a little too high, and this dose-response graph shows low happiness with all 3 completed. unusual. you could also increase participants to 40 control 40 intervention'","timestamp":1774347711480},{"id":"bLJN.py","source":"Chat Edit: 'the happiness results after 30 days seem a little too high, and this dose-response graph shows low happiness with all 3 completed. unusual. you could also increase participants to 40 control 40 intervention'","timestamp":1774347763541},{"id":"MJ5p.py","source":"Chat Edit: 'the happiness results after 30 days seem a little too high, and this dose-response graph shows low happiness with all 3 completed. unusual. you could also increase participants to 40 control 40 intervention'","timestamp":1774347783690},{"id":"EkUx.py","source":"Chat Edit: 'control mean is too low. realistically the control group should be around an average of 5.'","timestamp":1774347933805},{"id":"u91r.py","source":"Chat Edit: 'control mean is too low. realistically the control group should be around an average of 5.'","timestamp":1774347955983},{"id":"QTk6.py","timestamp":1774348022105},{"id":"o2Y7.py","timestamp":1774348397371},{"id":"46oA.py","timestamp":1774352345991}]}

View file

@ -0,0 +1,197 @@
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so every run regenerates exactly the same dataset.
np.random.seed(42)
# Number of simulated participants in each group (intervention and control).
N_PARTICIPANTS_PER_GROUP = 40
# Study days, numbered 1 through 30 inclusive.
DAYS = list(range(1, 31))
def clip_yes_prob(prob, ceiling):
    """Bound a 'Yes' probability to at least 0.05 and at most ``ceiling``.

    The ceiling wins if it is below the 0.05 floor.
    """
    floored = prob if prob > 0.05 else 0.05
    return floored if floored < ceiling else ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit reports and happiness for the intervention group.

    Each participant draws a persistent organisation tendency, a per-habit
    ease and a happiness baseline. For every day in DAYS the three habits are
    drawn as 'Yes'/'No', a running habit_strength (kept within 0..3) is
    updated from the number of habits done, and happiness is the baseline plus
    a habit_strength term whose weight grows with the day, plus noise.

    Args:
        start_participant_id: ID of the first participant; the others are
            numbered consecutively after it.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]`` with happiness an int from 1 to 10.
    """
    # Change in habit_strength by habits completed today; zero habits decays it.
    strength_step = {3: 0.6, 2: 0.35, 1: 0.15}
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset

        # Persistent per-person organisation tendency.
        org_bias = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)
        # Per-habit ease, drawn in the order calendar, cleaning, punctuality.
        calendar_ease = org_bias + np.random.normal(0.05, 0.08)
        clean_ease = org_bias + np.random.normal(-0.02, 0.08)
        ontime_ease = org_bias + np.random.normal(0.02, 0.08)
        person_happiness_baseline = np.random.normal(4.0, 1.0)

        # (ease, bonus if done yesterday, probability ceiling) for each habit.
        habit_specs = (
            (calendar_ease, 0.15, 0.95),
            (clean_ease, 0.15, 0.90),
            (ontime_ease, 0.12, 0.93),
        )
        habit_strength = 0.0
        yesterday = ('No', 'No', 'No')

        for day in DAYS:
            # Days whose remainder mod 7 is 0 or 6 are treated as weekend days.
            week_difficulty = 0.75 if (day % 7) in (0, 6) else 1.0
            # Slow start, a stronger middle stretch, slight fatigue at the end.
            if day >= 20:
                time_factor = 0.98
            elif day >= 7:
                time_factor = 1.1
            else:
                time_factor = 0.85

            today = []
            for (ease, momentum, ceiling), previous in zip(habit_specs, yesterday):
                yes_prob = clip_yes_prob(
                    ease * week_difficulty * time_factor
                    + (momentum if previous == 'Yes' else 0),
                    ceiling,
                )
                today.append(np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]))
            calendar, clean, ontime = today

            adherence_count = sum(x == 'Yes' for x in today)
            habit_strength = np.clip(
                habit_strength + strength_step.get(int(adherence_count), -0.2), 0, 3
            )

            study_progress = day / 30.0  # 1/30 on day 1 up to 1.0 on day 30
            daily_noise = np.random.normal(0, 0.5)
            # Weight on habit_strength rises from about 0.42 to 0.9,
            # so the term tops out at 3 * 0.9 = 2.7.
            happiness_value = (
                person_happiness_baseline
                + habit_strength * (0.4 + study_progress * 0.5)
                + daily_noise
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            rows.append([participant_id, 'Intervention', day, calendar, clean, ontime, happiness])
            # Today's answers feed tomorrow's momentum bonus.
            yesterday = (calendar, clean, ontime)
    return rows
def generate_control_group(start_participant_id):
    """Simulate daily happiness for the control group.

    Control participants do not track habits, so all three habit columns are
    recorded as 'No'. Hidden "untracked" habits are still drawn each day and
    add 0.1 happiness apiece, but never accumulate across days.

    Args:
        start_participant_id: ID of the first control participant; the others
            are numbered consecutively after it.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]`` with happiness an int from 1 to 10.
    """
    # (multiplier on natural_org, probability ceiling) in draw order:
    # calendar, cleaning, punctuality.
    untracked_specs = ((0.8, 0.4), (0.75, 0.35), (0.85, 0.45))
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset
        # Natural organisation tendency, centred lower than the intervention group's.
        natural_org = np.clip(np.random.normal(0.3, 0.15), 0.05, 0.7)
        # Same baseline distribution as the intervention group in this script.
        person_happiness_baseline = np.random.normal(4.0, 1.0)
        for day in DAYS:
            # Milder weekend dip than the intervention group (0.9 vs 0.75).
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9
            # Very weak drift: 1.001 on day 1 up to 1.03 on day 30.
            time_factor = 1.0 + (day / 100) * 0.1

            untracked = []
            for scale, ceiling in untracked_specs:
                # Compute the clipped probability once and reuse it for both outcomes.
                yes_prob = clip_yes_prob(natural_org * scale * week_factor * time_factor, ceiling)
                untracked.append(np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]))
            untracked_count = sum(x == 'Yes' for x in untracked)
            subtle_boost = untracked_count * 0.1  # small, same-day only

            # Noisier than the intervention group and with no cumulative term.
            daily_noise = np.random.normal(0, 1.2)
            happiness_value = person_happiness_baseline + subtle_boost + daily_noise
            happiness = int(np.clip(np.round(happiness_value), 1, 10))
            rows.append([
                participant_id,
                'Control',
                day,
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                happiness,
            ])
    return rows
# Intervention participants come first, controls are numbered right after them.
data = [
    *generate_intervention_group(start_participant_id=1),
    *generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1),
]
df = pd.DataFrame(
    data=data,
    columns=[
        'Participant_ID', 'Group', 'Day',
        'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence',
        'Happiness',
    ],
)
# Write both groups to a single CSV (no index column) and preview the result.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,200 @@
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so every run regenerates exactly the same dataset.
np.random.seed(42)
# Number of simulated participants in each group (intervention and control).
N_PARTICIPANTS_PER_GROUP = 40
# Study days, numbered 1 through 30 inclusive.
DAYS = list(range(1, 31))
def clip_yes_prob(prob, ceiling):
    """Bound a 'Yes' probability to at least 0.05 and at most ``ceiling``.

    The ceiling wins if it is below the 0.05 floor.
    """
    floored = prob if prob > 0.05 else 0.05
    return floored if floored < ceiling else ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit reports and happiness for the intervention group.

    Each participant draws a persistent organisation tendency, a per-habit
    ease and a happiness baseline. For every day in DAYS the three habits are
    drawn as 'Yes'/'No' and a running habit_strength (kept within 0..5) is
    updated. Happiness is the baseline plus an immediate bonus for today's
    habits, plus a cumulative habit_strength bonus, plus noise.

    Args:
        start_participant_id: ID of the first participant; the others are
            numbered consecutively after it.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]`` with happiness an int from 1 to 10.
    """
    # Change in habit_strength by habits completed today; zero habits decays it.
    strength_step = {3: 0.6, 2: 0.35, 1: 0.15}
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset

        # Persistent per-person organisation tendency.
        org_bias = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)
        # Per-habit ease, drawn in the order calendar, cleaning, punctuality.
        calendar_ease = org_bias + np.random.normal(0.05, 0.08)
        clean_ease = org_bias + np.random.normal(-0.02, 0.08)
        ontime_ease = org_bias + np.random.normal(0.02, 0.08)
        person_happiness_baseline = np.random.normal(4.0, 1.0)

        # (ease, bonus if done yesterday, probability ceiling) for each habit.
        habit_specs = (
            (calendar_ease, 0.15, 0.95),
            (clean_ease, 0.15, 0.90),
            (ontime_ease, 0.12, 0.93),
        )
        habit_strength = 0.0
        yesterday = ('No', 'No', 'No')

        for day in DAYS:
            # Days whose remainder mod 7 is 0 or 6 are treated as weekend days.
            week_difficulty = 0.75 if (day % 7) in (0, 6) else 1.0
            # Slow start, a stronger middle stretch, slight fatigue at the end.
            if day >= 20:
                time_factor = 0.98
            elif day >= 7:
                time_factor = 1.1
            else:
                time_factor = 0.85

            today = []
            for (ease, momentum, ceiling), previous in zip(habit_specs, yesterday):
                yes_prob = clip_yes_prob(
                    ease * week_difficulty * time_factor
                    + (momentum if previous == 'Yes' else 0),
                    ceiling,
                )
                today.append(np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]))
            calendar, clean, ontime = today

            adherence_count = sum(x == 'Yes' for x in today)
            habit_strength = np.clip(
                habit_strength + strength_step.get(int(adherence_count), -0.2), 0, 5
            )

            study_progress = day / 30.0  # 1/30 on day 1 up to 1.0 on day 30
            daily_noise = np.random.normal(0, 0.35)
            # Same-day effect: 0.6 per habit done today, so 0 to 1.8.
            daily_habit_bonus = adherence_count * 0.6
            # Weight rises from about 0.41 to 0.6; with strength capped at 5
            # this bonus tops out at 3.0.
            cumulative_bonus = habit_strength * (0.4 + study_progress * 0.2)
            happiness_value = (
                person_happiness_baseline
                + daily_habit_bonus
                + cumulative_bonus
                + daily_noise
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            rows.append([participant_id, 'Intervention', day, calendar, clean, ontime, happiness])
            # Today's answers feed tomorrow's momentum bonus.
            yesterday = (calendar, clean, ontime)
    return rows
def generate_control_group(start_participant_id):
    """Simulate daily happiness for the control group.

    Control participants do not track habits, so all three habit columns are
    recorded as 'No'. Hidden "untracked" habits are still drawn each day and
    add 0.1 happiness apiece, but never accumulate across days.

    Args:
        start_participant_id: ID of the first control participant; the others
            are numbered consecutively after it.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]`` with happiness an int from 1 to 10.
    """
    # (multiplier on natural_org, probability ceiling) in draw order:
    # calendar, cleaning, punctuality.
    untracked_specs = ((0.8, 0.4), (0.75, 0.35), (0.85, 0.45))
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset
        # Natural organisation tendency, centred lower than the intervention group's.
        natural_org = np.clip(np.random.normal(0.3, 0.15), 0.05, 0.7)
        # Control baseline is centred near 5; it is NOT the intervention baseline.
        person_happiness_baseline = np.random.normal(5.1, 0.9)
        for day in DAYS:
            # Milder weekend dip than the intervention group (0.9 vs 0.75).
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9
            # Very weak drift: 1.001 on day 1 up to 1.03 on day 30.
            time_factor = 1.0 + (day / 100) * 0.1

            untracked = []
            for scale, ceiling in untracked_specs:
                # Compute the clipped probability once and reuse it for both outcomes.
                yes_prob = clip_yes_prob(natural_org * scale * week_factor * time_factor, ceiling)
                untracked.append(np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]))
            untracked_count = sum(x == 'Yes' for x in untracked)
            subtle_boost = untracked_count * 0.1  # small, same-day only

            # Day-to-day variability with no cumulative term.
            daily_noise = np.random.normal(0, 1.0)
            happiness_value = person_happiness_baseline + subtle_boost + daily_noise
            happiness = int(np.clip(np.round(happiness_value), 1, 10))
            rows.append([
                participant_id,
                'Control',
                day,
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                happiness,
            ])
    return rows
# Intervention participants come first, controls are numbered right after them.
data = [
    *generate_intervention_group(start_participant_id=1),
    *generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1),
]
df = pd.DataFrame(
    data=data,
    columns=[
        'Participant_ID', 'Group', 'Day',
        'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence',
        'Happiness',
    ],
)
# Write both groups to a single CSV (no index column) and preview the result.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,196 @@
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so every run regenerates exactly the same dataset.
np.random.seed(42)
# Number of simulated participants in each group (intervention and control).
N_PARTICIPANTS_PER_GROUP = 20
# Study days, numbered 1 through 30 inclusive.
DAYS = list(range(1, 31))
def clip_yes_prob(prob, ceiling):
    """Bound a 'Yes' probability to at least 0.05 and at most ``ceiling``.

    The ceiling wins if it is below the 0.05 floor.
    """
    floored = prob if prob > 0.05 else 0.05
    return floored if floored < ceiling else ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit reports and happiness for the intervention group.

    Each participant draws a persistent organisation tendency, a per-habit
    ease and a happiness baseline. For every day in DAYS the three habits are
    drawn as 'Yes'/'No', a running habit_strength (kept within 0..4) is
    updated from the number of habits done, and happiness is the baseline plus
    a habit_strength term whose weight grows with the day, plus noise.

    Args:
        start_participant_id: ID of the first participant; the others are
            numbered consecutively after it.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]`` with happiness an int from 1 to 10.
    """
    # Change in habit_strength by habits completed today; zero habits decays it.
    strength_step = {3: 0.6, 2: 0.35, 1: 0.15}
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset

        # Persistent per-person organisation tendency.
        org_bias = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)
        # Per-habit ease, drawn in the order calendar, cleaning, punctuality.
        calendar_ease = org_bias + np.random.normal(0.05, 0.08)
        clean_ease = org_bias + np.random.normal(-0.02, 0.08)
        ontime_ease = org_bias + np.random.normal(0.02, 0.08)
        person_happiness_baseline = np.random.normal(4.8, 1.1)

        # (ease, bonus if done yesterday, probability ceiling) for each habit.
        habit_specs = (
            (calendar_ease, 0.15, 0.95),
            (clean_ease, 0.15, 0.90),
            (ontime_ease, 0.12, 0.93),
        )
        habit_strength = 0.0
        yesterday = ('No', 'No', 'No')

        for day in DAYS:
            # Days whose remainder mod 7 is 0 or 6 are treated as weekend days.
            week_difficulty = 0.75 if (day % 7) in (0, 6) else 1.0
            # Slow start, a stronger middle stretch, slight fatigue at the end.
            if day >= 20:
                time_factor = 0.98
            elif day >= 7:
                time_factor = 1.1
            else:
                time_factor = 0.85

            today = []
            for (ease, momentum, ceiling), previous in zip(habit_specs, yesterday):
                yes_prob = clip_yes_prob(
                    ease * week_difficulty * time_factor
                    + (momentum if previous == 'Yes' else 0),
                    ceiling,
                )
                today.append(np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]))
            calendar, clean, ontime = today

            adherence_count = sum(x == 'Yes' for x in today)
            habit_strength = np.clip(
                habit_strength + strength_step.get(int(adherence_count), -0.2), 0, 4
            )

            study_progress = day / 30.0  # 1/30 on day 1 up to 1.0 on day 30
            daily_noise = np.random.normal(0, 0.5)
            # Weight on habit_strength rises from about 0.53 to 1.5 over the study.
            happiness_value = (
                person_happiness_baseline
                + habit_strength * (0.5 + study_progress)
                + daily_noise
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            rows.append([participant_id, 'Intervention', day, calendar, clean, ontime, happiness])
            # Today's answers feed tomorrow's momentum bonus.
            yesterday = (calendar, clean, ontime)
    return rows
def generate_control_group(start_participant_id):
    """Simulate daily happiness for the control group.

    Control participants do not track habits, so all three habit columns are
    recorded as 'No'. Hidden "untracked" habits are still drawn each day and
    add 0.1 happiness apiece, but never accumulate across days.

    Args:
        start_participant_id: ID of the first control participant; the others
            are numbered consecutively after it.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]`` with happiness an int from 1 to 10.
    """
    # (multiplier on natural_org, probability ceiling) in draw order:
    # calendar, cleaning, punctuality.
    untracked_specs = ((0.8, 0.4), (0.75, 0.35), (0.85, 0.45))
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset
        # Natural organisation tendency, centred lower than the intervention group's.
        natural_org = np.clip(np.random.normal(0.3, 0.15), 0.05, 0.7)
        # Same mean as the intervention baseline (4.8) but a wider spread (1.3 vs 1.1).
        person_happiness_baseline = np.random.normal(4.8, 1.3)
        for day in DAYS:
            # Milder weekend dip than the intervention group (0.9 vs 0.75).
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9
            # Very weak drift: 1.001 on day 1 up to 1.03 on day 30.
            time_factor = 1.0 + (day / 100) * 0.1

            untracked = []
            for scale, ceiling in untracked_specs:
                # Compute the clipped probability once and reuse it for both outcomes.
                yes_prob = clip_yes_prob(natural_org * scale * week_factor * time_factor, ceiling)
                untracked.append(np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]))
            untracked_count = sum(x == 'Yes' for x in untracked)
            subtle_boost = untracked_count * 0.1  # small, same-day only

            # Noisier than the intervention group and with no cumulative term.
            daily_noise = np.random.normal(0, 1.2)
            happiness_value = person_happiness_baseline + subtle_boost + daily_noise
            happiness = int(np.clip(np.round(happiness_value), 1, 10))
            rows.append([
                participant_id,
                'Control',
                day,
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                happiness,
            ])
    return rows
# Intervention participants come first, controls are numbered right after them.
data = [
    *generate_intervention_group(start_participant_id=1),
    *generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1),
]
df = pd.DataFrame(
    data=data,
    columns=[
        'Participant_ID', 'Group', 'Day',
        'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence',
        'Happiness',
    ],
)
# Write both groups to a single CSV (no index column) and preview the result.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,200 @@
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so every run regenerates exactly the same dataset.
np.random.seed(42)
# Number of simulated participants in each group (intervention and control).
N_PARTICIPANTS_PER_GROUP = 40
# Study days, numbered 1 through 30 inclusive.
DAYS = list(range(1, 31))
def clip_yes_prob(prob, ceiling):
    """Bound a 'Yes' probability to at least 0.05 and at most ``ceiling``.

    The ceiling wins if it is below the 0.05 floor.
    """
    floored = prob if prob > 0.05 else 0.05
    return floored if floored < ceiling else ceiling
def generate_intervention_group(start_participant_id=1):
    """Simulate daily habit reports and happiness for the intervention group.

    Each participant draws a persistent organisation tendency, a per-habit
    ease and a happiness baseline. For every day in DAYS the three habits are
    drawn as 'Yes'/'No' and a running habit_strength (kept within 0..5) is
    updated. Happiness is the baseline plus an immediate bonus for today's
    habits, plus a cumulative habit_strength bonus, plus noise.

    Args:
        start_participant_id: ID of the first participant; the others are
            numbered consecutively after it.

    Returns:
        A list of rows ``[participant_id, 'Intervention', day, calendar,
        clean, ontime, happiness]`` with happiness an int from 1 to 10.
    """
    # Change in habit_strength by habits completed today; zero habits decays it.
    strength_step = {3: 0.6, 2: 0.35, 1: 0.15}
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset

        # Persistent per-person organisation tendency.
        org_bias = np.clip(np.random.normal(0.65, 0.18), 0.1, 0.95)
        # Per-habit ease, drawn in the order calendar, cleaning, punctuality.
        calendar_ease = org_bias + np.random.normal(0.05, 0.08)
        clean_ease = org_bias + np.random.normal(-0.02, 0.08)
        ontime_ease = org_bias + np.random.normal(0.02, 0.08)
        person_happiness_baseline = np.random.normal(4.0, 1.0)

        # (ease, bonus if done yesterday, probability ceiling) for each habit.
        habit_specs = (
            (calendar_ease, 0.15, 0.95),
            (clean_ease, 0.15, 0.90),
            (ontime_ease, 0.12, 0.93),
        )
        habit_strength = 0.0
        yesterday = ('No', 'No', 'No')

        for day in DAYS:
            # Days whose remainder mod 7 is 0 or 6 are treated as weekend days.
            week_difficulty = 0.75 if (day % 7) in (0, 6) else 1.0
            # Slow start, a stronger middle stretch, slight fatigue at the end.
            if day >= 20:
                time_factor = 0.98
            elif day >= 7:
                time_factor = 1.1
            else:
                time_factor = 0.85

            today = []
            for (ease, momentum, ceiling), previous in zip(habit_specs, yesterday):
                yes_prob = clip_yes_prob(
                    ease * week_difficulty * time_factor
                    + (momentum if previous == 'Yes' else 0),
                    ceiling,
                )
                today.append(np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]))
            calendar, clean, ontime = today

            adherence_count = sum(x == 'Yes' for x in today)
            habit_strength = np.clip(
                habit_strength + strength_step.get(int(adherence_count), -0.2), 0, 5
            )

            study_progress = day / 30.0  # 1/30 on day 1 up to 1.0 on day 30
            daily_noise = np.random.normal(0, 0.35)
            # Same-day effect: 0.6 per habit done today, so 0 to 1.8.
            daily_habit_bonus = adherence_count * 0.6
            # Weight rises from about 0.41 to 0.6; with strength capped at 5
            # this bonus tops out at 3.0.
            cumulative_bonus = habit_strength * (0.4 + study_progress * 0.2)
            happiness_value = (
                person_happiness_baseline
                + daily_habit_bonus
                + cumulative_bonus
                + daily_noise
            )
            happiness = int(np.clip(np.round(happiness_value), 1, 10))

            rows.append([participant_id, 'Intervention', day, calendar, clean, ontime, happiness])
            # Today's answers feed tomorrow's momentum bonus.
            yesterday = (calendar, clean, ontime)
    return rows
def generate_control_group(start_participant_id):
    """Simulate daily happiness for the control group.

    Control participants do not track habits, so all three habit columns are
    recorded as 'No'. Hidden "untracked" habits are still drawn each day and
    add 0.1 happiness apiece, but never accumulate across days.

    Args:
        start_participant_id: ID of the first control participant; the others
            are numbered consecutively after it.

    Returns:
        A list of rows ``[participant_id, 'Control', day, 'No', 'No', 'No',
        happiness]`` with happiness an int from 1 to 10.
    """
    # (multiplier on natural_org, probability ceiling) in draw order:
    # calendar, cleaning, punctuality.
    untracked_specs = ((0.8, 0.4), (0.75, 0.35), (0.85, 0.45))
    rows = []
    for offset in range(N_PARTICIPANTS_PER_GROUP):
        participant_id = start_participant_id + offset
        # Natural organisation tendency, centred lower than the intervention group's.
        natural_org = np.clip(np.random.normal(0.3, 0.15), 0.05, 0.7)
        # Control baseline is centred near 5; it is NOT the intervention baseline.
        person_happiness_baseline = np.random.normal(5.1, 0.9)
        for day in DAYS:
            # Milder weekend dip than the intervention group (0.9 vs 0.75).
            week_factor = 1.0 if (day % 7) not in [0, 6] else 0.9
            # Very weak drift: 1.001 on day 1 up to 1.03 on day 30.
            time_factor = 1.0 + (day / 100) * 0.1

            untracked = []
            for scale, ceiling in untracked_specs:
                # Compute the clipped probability once and reuse it for both outcomes.
                yes_prob = clip_yes_prob(natural_org * scale * week_factor * time_factor, ceiling)
                untracked.append(np.random.choice(['Yes', 'No'], p=[yes_prob, 1 - yes_prob]))
            untracked_count = sum(x == 'Yes' for x in untracked)
            subtle_boost = untracked_count * 0.1  # small, same-day only

            # Day-to-day variability with no cumulative term.
            daily_noise = np.random.normal(0, 1.0)
            happiness_value = person_happiness_baseline + subtle_boost + daily_noise
            happiness = int(np.clip(np.round(happiness_value), 1, 10))
            rows.append([
                participant_id,
                'Control',
                day,
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                'No',  # Reported as "No" - not tracking
                happiness,
            ])
    return rows
# Intervention participants come first, controls are numbered right after them.
data = [
    *generate_intervention_group(start_participant_id=1),
    *generate_control_group(start_participant_id=N_PARTICIPANTS_PER_GROUP + 1),
]
df = pd.DataFrame(
    data=data,
    columns=[
        'Participant_ID', 'Group', 'Day',
        'Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence',
        'Happiness',
    ],
)
# Write both groups to a single CSV (no index column) and preview the result.
df.to_csv('organization_happiness_study_data.csv', index=False)
print("✅ Full dataset saved as 'organization_happiness_study_data.csv' — open it in Excel!")
print(df.head(10))  # first 10 rows

View file

@ -0,0 +1,6 @@
pandas
numpy
matplotlib
seaborn
scipy
statsmodels

View file

@ -0,0 +1 @@
{"version":1,"resource":"file:///home/breadway/Documents/Year%2010/Year%2010/Psychology/requirements.txt","entries":[{"id":"1nKM.txt","source":"Chat Edit: 'improve on this analysis script'","timestamp":1774345121245}]}

View file

@ -0,0 +1,284 @@
import argparse
import os
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Configure the root logger once at import time: INFO and above, shown as "LEVEL: message".
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def load_data(path):
    """Read the study CSV at ``path`` into a DataFrame and log its row count."""
    frame = pd.read_csv(path)
    row_count = len(frame)
    logging.info("Loaded %d rows from %s", row_count, path)
    return frame
def prepare_data(df):
    """Validate and normalise the raw study table for analysis.

    Works on a copy, so the caller's DataFrame is left untouched.

    Args:
        df: DataFrame with at least the columns Participant_ID, Happiness,
            Calendar_Adherence, Cleanliness_Adherence and
            Punctuality_Adherence. A missing Group column is filled with
            'Intervention'.

    Returns:
        A new DataFrame with Group stripped and title-cased, the adherence
        columns mapped to True/False (unrecognised values become NaN), an
        integer Habits_Count column, numeric Happiness, and rows without a
        numeric Happiness removed.

    Raises:
        KeyError: if any required column is absent.
    """
    adherence_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *adherence_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Copy first: every assignment below would otherwise write into the caller's
    # frame (and trigger chained-assignment warnings when given a slice).
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False, any case or padding).
    truthy = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in adherence_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truthy)

    # Count habits per row. Comparing against True treats NaN as "not done"
    # without fillna() on object columns, whose silent downcast is deprecated.
    df['Habits_Count'] = df[adherence_cols].eq(True).astype(int).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness.
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))
    return df
def descriptive_stats(df):
    """Print descriptive summaries and correlations for the prepared data.

    Output covers the overall Happiness summary, per-group and per-Habits_Count
    aggregates, the Pearson correlation of Habits_Count with Happiness, and a
    point-biserial correlation of each adherence flag with Happiness (restricted
    to the Intervention rows when a Group column exists).
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())
        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    by_habits = df.groupby('Habits_Count')['Happiness']
    print('\nAverage happiness by number of habits completed:')
    print(by_habits.agg(['mean', 'count', 'std']).round(3))
    print('\nMedian happiness by habits:')
    print(by_habits.median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    subset = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in habit_cols:
        present = subset[habit].notna()
        if not present.any():
            print(f'{habit:22} (no data)')
            continue
        flags = subset.loc[present, habit].astype(int)
        scores = subset.loc[present, 'Happiness']
        r, p = stats.pointbiserialr(flags, scores)
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
def cohen_d(x, y):
    """Return Cohen's d for two independent samples using the pooled standard deviation.

    Args:
        x: Array-like of numeric observations (list, NumPy array or pandas Series).
        y: Array-like of numeric observations for the comparison sample.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd`` as a float. NaN when either sample has
        fewer than two non-missing observations; +/-inf or NaN when the pooled
        standard deviation is zero.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # Missing values must not inflate the sample sizes used for pooling.
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]
    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # A sample variance (ddof=1) is undefined for fewer than two observations.
        return float('nan')
    dof = nx + ny - 2
    pooled_sd = np.sqrt(((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof)
    # Zero pooled SD yields inf/NaN; silence the NumPy warning, keep the value.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float((x.mean() - y.mean()) / pooled_sd)
def run_ols(df):
    """Fit an OLS regression of Happiness on Habits_Count, print it, and return the fit.

    When a Group column exists it is added as a categorical covariate through the
    formula interface; otherwise a plain OLS with an intercept is fitted.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        response = df['Happiness']
        fitted = sm.OLS(response, design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    print(heading)
    print(fitted.summary())
    return fitted
def run_mixedlm(df):
    """Fit a mixed-effects model with a random intercept per Participant_ID.

    Prints the model summary and returns the fitted result. Any failure while
    building, fitting or printing the model is logged as a warning and ``None``
    is returned instead.
    """
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        result = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(result.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return result
def make_plots(df, outdir, show_plots=False):
    """Render the study figures and save them as PNG files in ``outdir``.

    Figures written (the group-based ones only when a Group column exists):
        01_primary_outcome_group_comparison.png, 02_happiness_distribution_by_group.png,
        03_longitudinal_trends.png (also needs a Day column), 04_habit_dose_response.png,
        05_habit_completion_rates.png, 06_individual_participant_avgs.png.

    Args:
        df: Prepared data containing Happiness, Habits_Count, Participant_ID and the
            three boolean ``*_Adherence`` columns.
        outdir: Output directory; created (with parents) if missing.
        show_plots: When True, each figure is also shown interactively before closing.
    """
    # Imported explicitly instead of being reached through ``plt.matplotlib.ticker``,
    # which relies on how pyplot happens to import matplotlib internally.
    from matplotlib.ticker import PercentFormatter

    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_theme(style='whitegrid', context='talk')

    def finish_plot(filename):
        # Shared epilogue: lay out, save, optionally show, then release the figure.
        plt.tight_layout()
        plt.savefig(outdir / filename, dpi=200, bbox_inches='tight')
        if show_plots:
            plt.show()
        plt.close()

    # 1) PRIMARY OUTCOME: Mean happiness by group with error bars and value labels
    if 'Group' in df.columns:
        plt.figure(figsize=(8, 6))
        summary = df.groupby('Group')['Happiness'].agg(['mean', 'std', 'count']).reindex(['Control', 'Intervention'])
        # Normal-approximation 95% interval half-width: 1.96 * standard error.
        ci95 = 1.96 * (summary['std'] / np.sqrt(summary['count']))
        bars = plt.bar(
            np.arange(len(summary)),
            summary['mean'].values,
            yerr=ci95.values,
            color=['#A9B2C3', '#4E79A7'],
            capsize=8,
            edgecolor='black',
            linewidth=1.2,
            alpha=0.9
        )
        plt.xticks(np.arange(len(summary)), ['Control Group\n(No habits tracked)', 'Intervention Group\n(Daily habits tracked)'])
        plt.title('Effect of Tracked Organization Habits on Happiness', pad=15, fontsize=14, fontweight='bold')
        plt.ylabel('Mean Daily Happiness Score (1-10)', fontsize=12)
        plt.ylim(1, 10)
        for bar in bars:
            # Value label drawn just inside the top of each bar.
            yval = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2, yval - 0.8, f'{yval:.1f}', ha='center', va='center', color='white', fontweight='bold', fontsize=11)
        finish_plot('01_primary_outcome_group_comparison.png')

    # 2) DISTRIBUTIONS: Show overlap and variability in happiness scores
    if 'Group' in df.columns:
        plt.figure(figsize=(9, 6))
        order = ['Control', 'Intervention']
        sns.violinplot(
            data=df, x='Group', y='Happiness', order=order,
            inner='quartile', palette={'Control': '#E0E0E0', 'Intervention': '#B3CDE3'}, cut=0
        )
        sns.stripplot(
            data=df, x='Group', y='Happiness', order=order,
            color='black', alpha=0.12, jitter=0.25, size=3
        )
        plt.title('Distribution of Happiness Reports Over 30 Days', pad=15, fontsize=14, fontweight='bold')
        plt.xlabel('Study Group', fontsize=12)
        plt.ylabel('Happiness Score', fontsize=12)
        plt.ylim(1, 10)
        finish_plot('02_happiness_distribution_by_group.png')

    # 3) LONGITUDINAL: Daily happiness trend across 30 days
    if 'Group' in df.columns and 'Day' in df.columns:
        plt.figure(figsize=(10, 6))
        daily_mean = df.groupby(['Group', 'Day'])['Happiness'].mean().reset_index()
        sns.lineplot(
            data=daily_mean, x='Day', y='Happiness', hue='Group',
            hue_order=['Control', 'Intervention'],
            palette={'Control': '#7F7F7F', 'Intervention': '#D62728'},
            marker='o', linewidth=2.5, markersize=6
        )
        plt.title('Longitudinal Daily Happiness Throughout the Study', pad=15, fontsize=14, fontweight='bold')
        plt.xlabel('Day of Study (1-30)', fontsize=12)
        plt.ylabel('Average Happiness', fontsize=12)
        plt.ylim(1, 10)
        plt.xticks(range(1, 31, 2))
        plt.legend(title='', frameon=True, facecolor='white', fontsize=10)
        finish_plot('03_longitudinal_trends.png')

    # 4) DOSE-RESPONSE: In intervention group, does MORE habits = MORE happiness?
    intervention_df = df[df['Group'] == 'Intervention'] if 'Group' in df.columns else df
    plt.figure(figsize=(9, 6))
    sns.boxplot(
        data=intervention_df, x='Habits_Count', y='Happiness',
        color='#9ECAE1', width=0.6, fliersize=0
    )
    sns.stripplot(
        data=intervention_df, x='Habits_Count', y='Happiness',
        color='#2B5B84', alpha=0.3, jitter=0.2, size=4
    )
    plt.title('Dose-Response: Happiness by Number of Habits Completed', pad=15, fontsize=14, fontweight='bold')
    plt.xlabel('Number of Requested Habits Completed That Day\n(Calendar + Clean Room + Punctual)', fontsize=11)
    plt.ylabel('Happiness Score', fontsize=12)
    plt.ylim(1, 10)
    finish_plot('04_habit_dose_response.png')

    # 5) HABIT COMPLETION RATES: Which habits were easiest to maintain?
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    # Column means of the adherence flags, reported as completion rates.
    adherence_rates = intervention_df[habit_cols].mean().sort_values(ascending=False).reset_index()
    adherence_rates.columns = ['Habit', 'Rate']
    adherence_rates['Habit'] = adherence_rates['Habit'].str.replace('_Adherence', '', regex=False)
    plt.figure(figsize=(8, 6))
    bars = sns.barplot(data=adherence_rates, x='Habit', y='Rate', color='#E76F51')
    plt.title('Which Habits Were Easiest to Keep?', pad=15, fontsize=14, fontweight='bold')
    plt.xlabel('', fontsize=12)
    plt.ylabel('Percentage of Days Completed', fontsize=12)
    plt.ylim(0, 1.05)
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1.0))
    for bar in bars.patches:
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
                 f"{bar.get_height()*100:.0f}%", ha='center', va='bottom', fontweight='bold', fontsize=10)
    finish_plot('05_habit_completion_rates.png')

    # 6) INDIVIDUAL VARIATION: Participant-level averages show broad effect
    if 'Group' in df.columns:
        plt.figure(figsize=(12, 6))
        participant_avg = df.groupby(['Group', 'Participant_ID'])['Happiness'].mean().reset_index()
        participant_avg = participant_avg.sort_values(['Group', 'Happiness'])
        # Bar position = rank after sorting by group, then by mean happiness.
        participant_avg['Order_Index'] = range(len(participant_avg))
        for group, color in zip(['Control', 'Intervention'], ['#BDBDBD', '#4E79A7']):
            group_data = participant_avg[participant_avg['Group'] == group]
            plt.bar(group_data['Order_Index'], group_data['Happiness'], color=color, label=group, alpha=0.85, width=0.8)
        plt.axhline(df[df['Group']=='Control']['Happiness'].mean(), color='#7F7F7F', linestyle='--', linewidth=2, label='Control Mean')
        plt.axhline(df[df['Group']=='Intervention']['Happiness'].mean(), color='#2B5B84', linestyle='--', linewidth=2, label='Intervention Mean')
        plt.title('Individual Average Happiness Across Study Participants', pad=15, fontsize=14, fontweight='bold')
        plt.xlabel('Individual Participants (Sorted by Happiness Level)', fontsize=12)
        plt.ylabel('Average Happiness Score', fontsize=12)
        plt.xticks([])
        plt.ylim(1, 10)
        plt.legend(frameon=True, facecolor='white', fontsize=10, loc='upper left')
        finish_plot('06_individual_participant_avgs.png')

    logging.info('Saved study plots to %s', outdir)
def main(args):
    """Run the whole analysis: load, prepare, describe, effect sizes, models, plots.

    ``args`` must provide ``data`` (CSV path), ``outdir`` (plot directory) and
    ``show`` (whether to display plots interactively).
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    # Effect sizes
    # NOTE(review): prepare_data counts unrecognised adherence values as "not done",
    # so rows without adherence data fall into the 0-habit sample - confirm intended.
    happiness = df['Happiness']
    none_done = happiness[df['Habits_Count'] == 0]
    all_done = happiness[df['Habits_Count'] == 3]
    if min(len(none_done), len(all_done)) > 1:
        d = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    if 'Group' in df.columns:
        control = happiness[df['Group'] == 'Control']
        intervention = happiness[df['Group'] == 'Intervention']
        if min(len(control), len(intervention)) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Models
    run_ols(df)
    run_mixedlm(df)

    # Plots
    make_plots(df, args.outdir, show_plots=args.show)
if __name__ == '__main__':
    # Command-line entry point: parse the three options and run the pipeline.
    cli = argparse.ArgumentParser(description='Improved data analysis for organization_happiness_study_data.csv')
    string_options = (
        ('--data', 'organization_happiness_study_data.csv', 'CSV data path'),
        ('--outdir', 'plots', 'Directory to save plots'),
    )
    for flag, default_value, help_text in string_options:
        cli.add_argument(flag, type=str, default=default_value, help=help_text)
    cli.add_argument('--show', action='store_true', help='Show plots interactively')
    args = cli.parse_args()
    main(args)

View file

@ -0,0 +1,227 @@
import argparse
import os
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def load_data(path):
    """Read the CSV file at ``path`` into a DataFrame and log its row count."""
    frame = pd.read_csv(path)
    row_count = len(frame)
    logging.info("Loaded %d rows from %s", row_count, path)
    return frame
def prepare_data(df):
    """Validate and normalise the raw study data for analysis.

    The input frame is copied first, so the caller's DataFrame is never modified.

    Args:
        df: Raw data with at least ``Participant_ID``, ``Happiness`` and the three
            ``*_Adherence`` columns. ``Group`` is optional.

    Returns:
        A new DataFrame in which ``Group`` is stripped and title-cased (defaulting to
        ``'Intervention'`` when the column is absent), adherence columns hold
        True/False (NaN for unrecognised values), ``Habits_Count`` is the number of
        True adherence flags per row, ``Happiness`` is numeric, and rows without a
        numeric ``Happiness`` are removed.

    Raises:
        KeyError: If any required column is missing.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Work on a copy: everything below assigns columns in place.
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False); anything else becomes NaN.
    truthy = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truthy)

    # Count habits per row. Comparing against True treats NaN as "not completed"
    # without the object-dtype fillna() downcast that pandas warns about.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness.
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))
    return df
def descriptive_stats(df):
    """Print descriptive summaries and correlations for the prepared data.

    Output covers the overall Happiness summary, per-group and per-Habits_Count
    aggregates, the Pearson correlation of Habits_Count with Happiness, and a
    point-biserial correlation of each adherence flag with Happiness (restricted
    to the Intervention rows when a Group column exists).
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())
        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    by_habits = df.groupby('Habits_Count')['Happiness']
    print('\nAverage happiness by number of habits completed:')
    print(by_habits.agg(['mean', 'count', 'std']).round(3))
    print('\nMedian happiness by habits:')
    print(by_habits.median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    subset = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in habit_cols:
        present = subset[habit].notna()
        if not present.any():
            print(f'{habit:22} (no data)')
            continue
        flags = subset.loc[present, habit].astype(int)
        scores = subset.loc[present, 'Happiness']
        r, p = stats.pointbiserialr(flags, scores)
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
def cohen_d(x, y):
    """Return Cohen's d for two independent samples using the pooled standard deviation.

    Args:
        x: Array-like of numeric observations (list, NumPy array or pandas Series).
        y: Array-like of numeric observations for the comparison sample.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd`` as a float. NaN when either sample has
        fewer than two non-missing observations; +/-inf or NaN when the pooled
        standard deviation is zero.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # Missing values must not inflate the sample sizes used for pooling.
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]
    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # A sample variance (ddof=1) is undefined for fewer than two observations.
        return float('nan')
    dof = nx + ny - 2
    pooled_sd = np.sqrt(((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof)
    # Zero pooled SD yields inf/NaN; silence the NumPy warning, keep the value.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float((x.mean() - y.mean()) / pooled_sd)
def run_ols(df):
    """Fit an OLS regression of Happiness on Habits_Count, print it, and return the fit.

    When a Group column exists it is added as a categorical covariate through the
    formula interface; otherwise a plain OLS with an intercept is fitted.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        response = df['Happiness']
        fitted = sm.OLS(response, design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    print(heading)
    print(fitted.summary())
    return fitted
def run_mixedlm(df):
    """Fit a mixed-effects model with a random intercept per Participant_ID.

    Prints the model summary and returns the fitted result. Any failure while
    building, fitting or printing the model is logged as a warning and ``None``
    is returned instead.
    """
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        result = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(result.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return result
def make_plots(df, outdir, show_plots=False):
    """Render the exploratory figures and save them as PNG files in ``outdir``.

    Figures written: happiness_by_habits_box.png, happiness_by_habits_violin.png,
    participant_avg_happiness.png, happiness_by_group.png (only when a Group
    column exists) and happiness_vs_habits_regression.png.

    Args:
        df: Prepared data containing Happiness, Habits_Count and Participant_ID.
        outdir: Output directory; created (with parents) if missing.
        show_plots: When True, each figure is also shown interactively before closing.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_style('whitegrid')

    def _save(filename):
        # Shared epilogue: lay out, save, optionally show, then release the figure.
        plt.tight_layout()
        plt.savefig(outdir / filename)
        if show_plots:
            plt.show()
        plt.close()

    # Boxplot by Habits_Count
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=df, x='Habits_Count', y='Happiness', color='#4C72B0')
    plt.title('Daily Happiness by Number of Habits Completed')
    # The range separators were missing from these labels ("(03)", "(110)").
    plt.xlabel('Number of habits followed (0-3)')
    plt.ylabel('Happiness (1-10)')
    _save('happiness_by_habits_box.png')

    # Violin with jittered raw points on top
    plt.figure(figsize=(9, 6))
    sns.violinplot(data=df, x='Habits_Count', y='Happiness', inner=None, color='#55A868')
    sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
    plt.title('Happiness distribution by Habits Completed')
    _save('happiness_by_habits_violin.png')

    # Participant average bar
    participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
    positions = range(len(participant_avg))
    plt.figure(figsize=(12, 5))
    sns.barplot(x=positions, y=participant_avg.values, color='#C44E52')
    plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
    # Tick labels are passed as strings so non-string participant IDs render as text.
    plt.xticks(positions, participant_avg.index.astype(str), rotation=45)
    plt.title('Average Happiness per Participant (sorted)')
    _save('participant_avg_happiness.png')

    if 'Group' in df.columns:
        plt.figure(figsize=(7, 5))
        sns.barplot(data=df, x='Group', y='Happiness', estimator='mean', errorbar='sd', color='#8172B2')
        plt.title('Mean Happiness by Group')
        plt.ylabel('Average happiness')
        _save('happiness_by_group.png')

    # Scatter with linear fit
    plt.figure(figsize=(9, 6))
    if 'Group' in df.columns:
        sns.scatterplot(data=df, x='Habits_Count', y='Happiness', hue='Group', alpha=0.35)
        # The title promises a linear fit; overlay the fitted line without
        # re-drawing the points.
        sns.regplot(data=df, x='Habits_Count', y='Happiness', scatter=False, color='black')
    else:
        sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
    plt.title('Happiness vs Number of Habits Completed (with linear fit)')
    _save('happiness_vs_habits_regression.png')

    logging.info('Saved plots to %s', outdir)
def main(args):
    """Run the whole analysis: load, prepare, describe, effect sizes, models, plots.

    ``args`` must provide ``data`` (CSV path), ``outdir`` (plot directory) and
    ``show`` (whether to display plots interactively).
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    # Effect sizes
    # NOTE(review): prepare_data counts unrecognised adherence values as "not done",
    # so rows without adherence data fall into the 0-habit sample - confirm intended.
    happiness = df['Happiness']
    none_done = happiness[df['Habits_Count'] == 0]
    all_done = happiness[df['Habits_Count'] == 3]
    if min(len(none_done), len(all_done)) > 1:
        d = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    if 'Group' in df.columns:
        control = happiness[df['Group'] == 'Control']
        intervention = happiness[df['Group'] == 'Intervention']
        if min(len(control), len(intervention)) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Models
    run_ols(df)
    run_mixedlm(df)

    # Plots
    make_plots(df, args.outdir, show_plots=args.show)
if __name__ == '__main__':
    # Command-line entry point: parse the three options and run the pipeline.
    cli = argparse.ArgumentParser(description='Improved data analysis for organization_happiness_study_data.csv')
    string_options = (
        ('--data', 'organization_happiness_study_data.csv', 'CSV data path'),
        ('--outdir', 'plots', 'Directory to save plots'),
    )
    for flag, default_value, help_text in string_options:
        cli.add_argument(flag, type=str, default=default_value, help=help_text)
    cli.add_argument('--show', action='store_true', help='Show plots interactively')
    args = cli.parse_args()
    main(args)

View file

@ -0,0 +1,227 @@
import argparse
import os
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def load_data(path):
    """Read the CSV file at ``path`` into a DataFrame and log its row count."""
    frame = pd.read_csv(path)
    row_count = len(frame)
    logging.info("Loaded %d rows from %s", row_count, path)
    return frame
def prepare_data(df):
    """Validate and normalise the raw study data for analysis.

    The input frame is copied first, so the caller's DataFrame is never modified.

    Args:
        df: Raw data with at least ``Participant_ID``, ``Happiness`` and the three
            ``*_Adherence`` columns. ``Group`` is optional.

    Returns:
        A new DataFrame in which ``Group`` is stripped and title-cased (defaulting to
        ``'Intervention'`` when the column is absent), adherence columns hold
        True/False (NaN for unrecognised values), ``Habits_Count`` is the number of
        True adherence flags per row, ``Happiness`` is numeric, and rows without a
        numeric ``Happiness`` are removed.

    Raises:
        KeyError: If any required column is missing.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Work on a copy: everything below assigns columns in place.
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False); anything else becomes NaN.
    truthy = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truthy)

    # Count habits per row. Comparing against True treats NaN as "not completed"
    # without the object-dtype fillna() downcast that pandas warns about.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness.
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))
    return df
def descriptive_stats(df):
    """Print descriptive summaries and correlations for the prepared data.

    Output covers the overall Happiness summary, per-group and per-Habits_Count
    aggregates, the Pearson correlation of Habits_Count with Happiness, and a
    point-biserial correlation of each adherence flag with Happiness (restricted
    to the Intervention rows when a Group column exists).
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())
        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    by_habits = df.groupby('Habits_Count')['Happiness']
    print('\nAverage happiness by number of habits completed:')
    print(by_habits.agg(['mean', 'count', 'std']).round(3))
    print('\nMedian happiness by habits:')
    print(by_habits.median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    subset = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in habit_cols:
        present = subset[habit].notna()
        if not present.any():
            print(f'{habit:22} (no data)')
            continue
        flags = subset.loc[present, habit].astype(int)
        scores = subset.loc[present, 'Happiness']
        r, p = stats.pointbiserialr(flags, scores)
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
def cohen_d(x, y):
    """Return Cohen's d for two independent samples using the pooled standard deviation.

    Args:
        x: Array-like of numeric observations (list, NumPy array or pandas Series).
        y: Array-like of numeric observations for the comparison sample.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd`` as a float. NaN when either sample has
        fewer than two non-missing observations; +/-inf or NaN when the pooled
        standard deviation is zero.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # Missing values must not inflate the sample sizes used for pooling.
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]
    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # A sample variance (ddof=1) is undefined for fewer than two observations.
        return float('nan')
    dof = nx + ny - 2
    pooled_sd = np.sqrt(((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof)
    # Zero pooled SD yields inf/NaN; silence the NumPy warning, keep the value.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float((x.mean() - y.mean()) / pooled_sd)
def run_ols(df):
    """Fit an OLS regression of Happiness on Habits_Count, print it, and return the fit.

    When a Group column exists it is added as a categorical covariate through the
    formula interface; otherwise a plain OLS with an intercept is fitted.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        response = df['Happiness']
        fitted = sm.OLS(response, design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    print(heading)
    print(fitted.summary())
    return fitted
def run_mixedlm(df):
    """Fit a mixed-effects model with a random intercept per Participant_ID.

    Prints the model summary and returns the fitted result. Any failure while
    building, fitting or printing the model is logged as a warning and ``None``
    is returned instead.
    """
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        result = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(result.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return result
def make_plots(df, outdir, show_plots=False):
    """Render the exploratory figures and save them as PNG files in ``outdir``.

    Figures written: happiness_by_habits_box.png, happiness_by_habits_violin.png,
    participant_avg_happiness.png, happiness_by_group.png (only when a Group
    column exists) and happiness_vs_habits_regression.png.

    Args:
        df: Prepared data containing Happiness, Habits_Count and Participant_ID.
        outdir: Output directory; created (with parents) if missing.
        show_plots: When True, each figure is also shown interactively before closing.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_style('whitegrid')

    def _save(filename):
        # Shared epilogue: lay out, save, optionally show, then release the figure.
        plt.tight_layout()
        plt.savefig(outdir / filename)
        if show_plots:
            plt.show()
        plt.close()

    # Boxplot by Habits_Count
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=df, x='Habits_Count', y='Happiness', color='#4C72B0')
    plt.title('Daily Happiness by Number of Habits Completed')
    # The range separators were missing from these labels ("(03)", "(110)").
    plt.xlabel('Number of habits followed (0-3)')
    plt.ylabel('Happiness (1-10)')
    _save('happiness_by_habits_box.png')

    # Violin with jittered raw points on top
    plt.figure(figsize=(9, 6))
    sns.violinplot(data=df, x='Habits_Count', y='Happiness', inner=None, color='#55A868')
    sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
    plt.title('Happiness distribution by Habits Completed')
    _save('happiness_by_habits_violin.png')

    # Participant average bar
    participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
    positions = range(len(participant_avg))
    plt.figure(figsize=(12, 5))
    sns.barplot(x=positions, y=participant_avg.values, color='#C44E52')
    plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
    plt.xticks(positions, participant_avg.index.astype(str), rotation=45)
    plt.title('Average Happiness per Participant (sorted)')
    _save('participant_avg_happiness.png')

    if 'Group' in df.columns:
        plt.figure(figsize=(7, 5))
        sns.barplot(data=df, x='Group', y='Happiness', estimator='mean', errorbar='sd', color='#8172B2')
        plt.title('Mean Happiness by Group')
        plt.ylabel('Average happiness')
        _save('happiness_by_group.png')

    # Scatter with linear fit
    plt.figure(figsize=(9, 6))
    if 'Group' in df.columns:
        sns.scatterplot(data=df, x='Habits_Count', y='Happiness', hue='Group', alpha=0.35)
        # The title promises a linear fit; overlay the fitted line without
        # re-drawing the points.
        sns.regplot(data=df, x='Habits_Count', y='Happiness', scatter=False, color='black')
    else:
        sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
    plt.title('Happiness vs Number of Habits Completed (with linear fit)')
    _save('happiness_vs_habits_regression.png')

    logging.info('Saved plots to %s', outdir)
def main(args):
    """Run the whole analysis: load, prepare, describe, effect sizes, models, plots.

    ``args`` must provide ``data`` (CSV path), ``outdir`` (plot directory) and
    ``show`` (whether to display plots interactively).
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    # Effect sizes
    # NOTE(review): prepare_data counts unrecognised adherence values as "not done",
    # so rows without adherence data fall into the 0-habit sample - confirm intended.
    happiness = df['Happiness']
    none_done = happiness[df['Habits_Count'] == 0]
    all_done = happiness[df['Habits_Count'] == 3]
    if min(len(none_done), len(all_done)) > 1:
        d = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    if 'Group' in df.columns:
        control = happiness[df['Group'] == 'Control']
        intervention = happiness[df['Group'] == 'Intervention']
        if min(len(control), len(intervention)) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Models
    run_ols(df)
    run_mixedlm(df)

    # Plots
    make_plots(df, args.outdir, show_plots=args.show)
if __name__ == '__main__':
    # Command-line entry point: parse the three options and run the pipeline.
    cli = argparse.ArgumentParser(description='Improved data analysis for organization_happiness_study_data.csv')
    string_options = (
        ('--data', 'organization_happiness_study_data.csv', 'CSV data path'),
        ('--outdir', 'plots', 'Directory to save plots'),
    )
    for flag, default_value, help_text in string_options:
        cli.add_argument(flag, type=str, default=default_value, help=help_text)
    cli.add_argument('--show', action='store_true', help='Show plots interactively')
    args = cli.parse_args()
    main(args)

View file

@ -0,0 +1,253 @@
import argparse
import os
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def load_data(path):
    """Read the CSV file at ``path`` into a DataFrame and log its row count."""
    frame = pd.read_csv(path)
    row_count = len(frame)
    logging.info("Loaded %d rows from %s", row_count, path)
    return frame
def prepare_data(df):
    """Validate and normalise the raw study data for analysis.

    The input frame is copied first, so the caller's DataFrame is never modified.

    Args:
        df: Raw data with at least ``Participant_ID``, ``Happiness`` and the three
            ``*_Adherence`` columns. ``Group`` is optional.

    Returns:
        A new DataFrame in which ``Group`` is stripped and title-cased (defaulting to
        ``'Intervention'`` when the column is absent), adherence columns hold
        True/False (NaN for unrecognised values), ``Habits_Count`` is the number of
        True adherence flags per row, ``Happiness`` is numeric, and rows without a
        numeric ``Happiness`` are removed.

    Raises:
        KeyError: If any required column is missing.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Work on a copy: everything below assigns columns in place.
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False); anything else becomes NaN.
    truthy = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truthy)

    # Count habits per row. Comparing against True treats NaN as "not completed"
    # without the object-dtype fillna() downcast that pandas warns about.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness.
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))
    return df
def descriptive_stats(df):
    """Print descriptive summaries and correlations for the prepared data.

    Output covers the overall Happiness summary, per-group and per-Habits_Count
    aggregates, the Pearson correlation of Habits_Count with Happiness, and a
    point-biserial correlation of each adherence flag with Happiness (restricted
    to the Intervention rows when a Group column exists).
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())

    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())
        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))

    by_habits = df.groupby('Habits_Count')['Happiness']
    print('\nAverage happiness by number of habits completed:')
    print(by_habits.agg(['mean', 'count', 'std']).round(3))
    print('\nMedian happiness by habits:')
    print(by_habits.median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))

    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    subset = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in habit_cols:
        present = subset[habit].notna()
        if not present.any():
            print(f'{habit:22} (no data)')
            continue
        flags = subset.loc[present, habit].astype(int)
        scores = subset.loc[present, 'Happiness']
        r, p = stats.pointbiserialr(flags, scores)
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
def cohen_d(x, y):
    """Return Cohen's d for two independent samples using the pooled standard deviation.

    Args:
        x: Array-like of numeric observations (list, NumPy array or pandas Series).
        y: Array-like of numeric observations for the comparison sample.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd`` as a float. NaN when either sample has
        fewer than two non-missing observations; +/-inf or NaN when the pooled
        standard deviation is zero.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # Missing values must not inflate the sample sizes used for pooling.
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]
    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # A sample variance (ddof=1) is undefined for fewer than two observations.
        return float('nan')
    dof = nx + ny - 2
    pooled_sd = np.sqrt(((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof)
    # Zero pooled SD yields inf/NaN; silence the NumPy warning, keep the value.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float((x.mean() - y.mean()) / pooled_sd)
def run_ols(df):
    """Fit an OLS regression of Happiness on Habits_Count, print it, and return the fit.

    When a Group column exists it is added as a categorical covariate through the
    formula interface; otherwise a plain OLS with an intercept is fitted.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        response = df['Happiness']
        fitted = sm.OLS(response, design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    print(heading)
    print(fitted.summary())
    return fitted
def run_mixedlm(df):
    """Fit a mixed-effects model with a random intercept per Participant_ID.

    Prints the model summary and returns the fitted result. Any failure while
    building, fitting or printing the model is logged as a warning and ``None``
    is returned instead.
    """
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        result = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(result.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return result
def make_plots(df, outdir, show_plots=False):
    """Render the study figures and save them as PNG files in ``outdir``.

    Figures written (the group-based ones only when a Group column exists):
        01_mean_happiness_by_group.png, 02_happiness_distribution_by_group.png,
        03_daily_happiness_trend.png (also needs a Day column),
        04_happiness_by_habits_intervention.png, 05_mean_happiness_by_habits.png,
        06_habit_completion_rate.png, 07_participant_average_happiness.png.

    Args:
        df: Prepared data containing Happiness, Habits_Count, Participant_ID and the
            three boolean ``*_Adherence`` columns.
        outdir: Output directory; created (with parents) if missing.
        show_plots: When True, each figure is also shown interactively before closing.
    """
    # Imported explicitly instead of being reached through ``plt.matplotlib.ticker``,
    # which relies on how pyplot happens to import matplotlib internally.
    from matplotlib.ticker import PercentFormatter

    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_theme(style='whitegrid', context='talk')

    def finish_plot(filename):
        # Shared epilogue: lay out, save, optionally show, then release the figure.
        plt.tight_layout()
        plt.savefig(outdir / filename, dpi=200, bbox_inches='tight')
        if show_plots:
            plt.show()
        plt.close()

    # 1) Mean happiness by group with error bars
    if 'Group' in df.columns:
        summary = df.groupby('Group')['Happiness'].agg(['mean', 'std', 'count']).reindex(['Control', 'Intervention'])
        # Normal-approximation 95% interval half-width: 1.96 * standard error.
        ci95 = 1.96 * (summary['std'] / np.sqrt(summary['count']))
        plt.figure(figsize=(8, 6))
        plt.bar(summary.index, summary['mean'], color=['#7A7A7A', '#2A9D8F'], yerr=ci95, capsize=6)
        plt.title('Average Happiness by Group')
        plt.xlabel('Study group')
        plt.ylabel('Mean happiness score')
        plt.ylim(0, 10)
        finish_plot('01_mean_happiness_by_group.png')

    # 2) Distribution of happiness by group
    if 'Group' in df.columns:
        plt.figure(figsize=(9, 6))
        order = ['Control', 'Intervention']
        sns.boxplot(data=df, x='Group', y='Happiness', order=order, color='#C9D1D9')
        sns.stripplot(data=df, x='Group', y='Happiness', order=order, color='black', alpha=0.18, jitter=0.22, size=2)
        plt.title('Happiness Distribution by Group')
        plt.xlabel('Study group')
        plt.ylabel('Happiness score')
        plt.ylim(0, 10)
        finish_plot('02_happiness_distribution_by_group.png')

    # 3) Daily happiness trend by group
    if 'Group' in df.columns and 'Day' in df.columns:
        daily = df.groupby(['Group', 'Day'], as_index=False)['Happiness'].mean()
        plt.figure(figsize=(10, 6))
        sns.lineplot(data=daily, x='Day', y='Happiness', hue='Group', hue_order=['Control', 'Intervention'], marker='o')
        plt.title('Mean Daily Happiness Across the Study')
        plt.xlabel('Day of study')
        plt.ylabel('Average happiness')
        plt.ylim(0, 10)
        plt.xticks(range(1, 31, 2))
        finish_plot('03_daily_happiness_trend.png')

    # 4) Happiness by number of habits in intervention group only
    intervention_df = df[df['Group'] == 'Intervention'] if 'Group' in df.columns else df
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=intervention_df, x='Habits_Count', y='Happiness', color='#4C72B0')
    sns.stripplot(data=intervention_df, x='Habits_Count', y='Happiness', color='black', alpha=0.20, jitter=0.18, size=2)
    plt.title('Intervention Group: Happiness by Number of Habits Completed')
    plt.xlabel('Habits completed that day')
    plt.ylabel('Happiness score')
    plt.ylim(0, 10)
    finish_plot('04_happiness_by_habits_intervention.png')

    # 5) Mean happiness by habits count in intervention group
    habits_mean = intervention_df.groupby('Habits_Count', as_index=False)['Happiness'].mean()
    plt.figure(figsize=(8, 6))
    sns.lineplot(data=habits_mean, x='Habits_Count', y='Happiness', marker='o', color='#1F77B4')
    plt.title('Intervention Group: Mean Happiness vs Habits Completed')
    plt.xlabel('Number of habits completed')
    plt.ylabel('Mean happiness')
    plt.xticks([0, 1, 2, 3])
    plt.ylim(0, 10)
    finish_plot('05_mean_happiness_by_habits.png')

    # 6) Habit adherence rates in the intervention group
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    # Column means of the adherence flags, reported as completion rates.
    adherence_rates = intervention_df[habit_cols].mean().sort_values(ascending=False).reset_index()
    adherence_rates.columns = ['Habit', 'Rate']
    adherence_rates['Habit'] = adherence_rates['Habit'].str.replace('_Adherence', '', regex=False)
    plt.figure(figsize=(9, 6))
    sns.barplot(data=adherence_rates, x='Habit', y='Rate', color='#E76F51')
    plt.title('Intervention Group: Habit Completion Rate')
    plt.xlabel('Habit')
    plt.ylabel('Proportion completed')
    plt.ylim(0, 1)
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1.0))
    finish_plot('06_habit_completion_rate.png')

    # 7) Participant average happiness by group
    if 'Group' in df.columns:
        plt.figure(figsize=(12, 6))
        participant_avg = df.groupby(['Group', 'Participant_ID'], as_index=False)['Happiness'].mean()
        sns.boxplot(data=participant_avg, x='Group', y='Happiness', order=['Control', 'Intervention'], color='#D6D6D6')
        sns.stripplot(data=participant_avg, x='Group', y='Happiness', order=['Control', 'Intervention'], color='black', alpha=0.45, jitter=0.12, size=5)
        plt.title('Average Happiness per Participant')
        plt.xlabel('Study group')
        plt.ylabel('Participant mean happiness')
        plt.ylim(0, 10)
        finish_plot('07_participant_average_happiness.png')

    logging.info('Saved plots to %s', outdir)
def main(args):
    """Run the full analysis: load, clean, summarise, model and plot.

    Args:
        args: Parsed command-line namespace providing ``data`` (CSV path),
            ``outdir`` (plot directory) and ``show`` (display plots flag).
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    # Effect size between the two extremes of habit completion; skipped
    # unless both samples have more than one row.
    habit_counts = df['Habits_Count']
    none_done = df.loc[habit_counts == 0, 'Happiness']
    all_done = df.loc[habit_counts == 3, 'Happiness']
    if len(none_done) > 1 and len(all_done) > 1:
        d = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    # Effect size between study arms, when the data records them.
    if 'Group' in df.columns:
        arm = df['Group']
        control = df.loc[arm == 'Control', 'Happiness']
        intervention = df.loc[arm == 'Intervention', 'Happiness']
        if len(control) > 1 and len(intervention) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Regression models first, figures last.
    run_ols(df)
    run_mixedlm(df)
    make_plots(df, args.outdir, show_plots=args.show)
if __name__ == '__main__':
    # Command-line entry point: collect the options and hand them to main().
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv',
    )
    parser.add_argument(
        '--data',
        type=str,
        default='organization_happiness_study_data.csv',
        help='CSV data path',
    )
    parser.add_argument(
        '--outdir',
        type=str,
        default='plots',
        help='Directory to save plots',
    )
    parser.add_argument(
        '--show',
        action='store_true',
        help='Show plots interactively',
    )
    args = parser.parse_args()
    main(args)

View file

@ -0,0 +1,272 @@
import argparse
import os
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def load_data(path):
    """Read the study CSV into a DataFrame and log how many rows it has.

    Args:
        path: Passed straight to ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
def prepare_data(df):
    """Validate and normalise the raw study data for analysis.

    Works on a copy, so the caller's DataFrame is left untouched.

    Steps:
      * require the participant, happiness and three adherence columns;
      * default a missing ``Group`` column to ``'Intervention'`` and tidy
        its labels (trimmed, title-cased);
      * map adherence answers (yes/no/true/false, any case, surrounding
        whitespace ignored) to booleans, leaving anything else missing;
      * add ``Habits_Count``, the number of habits marked done per row
        (missing answers count as not done);
      * coerce ``Happiness`` to numeric and drop rows where that fails.

    Args:
        df: Raw DataFrame as loaded from the CSV.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: If any required column is absent.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Copy first: every step below assigns columns, and the caller's frame
    # must not change as a side effect of cleaning.
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    answer_map = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(answer_map)

    # Count habits per row. Comparing with True treats unmapped/missing
    # answers as "not done" without calling fillna() on object columns.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))
    return df
def descriptive_stats(df):
    """Print summary statistics and habit/happiness correlations to stdout.

    Prints, in order: the frame shape, a ``describe()`` of ``Happiness``,
    per-group counts and means (only if a ``Group`` column exists),
    mean/count/std and median happiness per ``Habits_Count``, the Pearson
    correlation matrix of ``Habits_Count`` and ``Happiness``, and one
    point-biserial correlation per habit. The per-habit correlations use
    only the ``'Intervention'`` rows when ``Group`` exists.

    Args:
        df: Prepared data containing ``Happiness``, ``Habits_Count`` and
            the three ``*_Adherence`` columns (booleans with missing
            values allowed).
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())
    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())
        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))
    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))
    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))
    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in habit_cols:
        mask = ~habit_df[habit].isna()
        n_obs = int(mask.sum())
        if n_obs == 0:
            print(f'{habit:22} (no data)')
            continue
        if n_obs < 2:
            # A correlation needs at least two paired observations; report
            # the gap instead of letting the whole summary abort.
            print(f'{habit:22} (insufficient data)')
            continue
        r, p = stats.pointbiserialr(habit_df.loc[mask, habit].astype(int), habit_df.loc[mask, 'Happiness'])
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
def cohen_d(x, y):
    """Return Cohen's d for two independent samples using the pooled SD.

    Args:
        x: First sample; must offer ``mean()`` and ``std(ddof=1)``.
        y: Second sample, same requirements.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd``.
    """
    size_x = len(x)
    size_y = len(y)
    spread_x = x.std(ddof=1)
    spread_y = y.std(ddof=1)
    # Sample variances weighted by their own degrees of freedom.
    weighted_ss = (size_x - 1) * spread_x ** 2 + (size_y - 1) * spread_y ** 2
    pooled_sd = np.sqrt(weighted_ss / (size_x + size_y - 2))
    mean_gap = x.mean() - y.mean()
    return mean_gap / pooled_sd
def run_ols(df):
    """Fit an OLS regression of happiness on habit count and print it.

    When ``df`` has a ``Group`` column the formula also includes it as a
    categorical term; otherwise a constant-plus-``Habits_Count`` design is
    built by hand.

    Args:
        df: Prepared data with ``Happiness`` and ``Habits_Count``.

    Returns:
        The fitted statsmodels results object.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        outcome = df['Happiness']
        fitted = sm.OLS(outcome, design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    # The heading is printed only after a successful fit.
    print(heading)
    print(fitted.summary())
    return fitted
def run_mixedlm(df):
    """Fit and print a mixed model with a random intercept per participant.

    Best-effort: any failure while building, fitting or printing the model
    is logged as a warning and ``None`` is returned instead of raising.

    Args:
        df: Prepared data with ``Happiness``, ``Habits_Count`` and
            ``Participant_ID``.

    Returns:
        The fitted results object, or ``None`` on failure.
    """
    fitted = None
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        fitted = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(fitted.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return fitted
def make_plots(df, outdir, show_plots=False):
    """Render the study figures as PNG files in ``outdir``.

    Figures 1, 2 and 7 are drawn only when ``df`` has a ``Group`` column
    (figure 3 additionally needs ``Day``). Figures 4-6 use the
    ``'Intervention'`` rows when ``Group`` exists, otherwise all rows.

    Args:
        df: Prepared data with ``Happiness``, ``Habits_Count``,
            ``Participant_ID`` and the three ``*_Adherence`` columns.
        outdir: Output directory; created (with parents) if missing.
        show_plots: If true, also display each figure before closing it.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_theme(style='whitegrid', context='talk')

    def finish_plot(filename):
        """Tighten, save the current figure under ``outdir``, optionally show, then close."""
        plt.tight_layout()
        plt.savefig(outdir / filename, dpi=200, bbox_inches='tight')
        if show_plots:
            plt.show()
        plt.close()

    # NOTE(review): figures 1, 2 and 7 pass `palette` without a matching
    # `hue` on figures 1-2; recent seaborn releases appear to warn about
    # that - confirm against the installed version.

    # 1) Mean happiness by group with error bars
    if 'Group' in df.columns:
        plt.figure(figsize=(8, 6))
        order = ['Control', 'Intervention']
        sns.barplot(
            data=df,
            x='Group',
            y='Happiness',
            order=order,
            estimator='mean',
            errorbar=('ci', 95),
            palette=['#7A7A7A', '#2A9D8F'],
        )
        plt.title('Average Happiness by Group')
        plt.xlabel('Study group')
        plt.ylabel('Mean happiness score')
        plt.ylim(0, 10)
        finish_plot('01_mean_happiness_by_group.png')

    # 2) Distribution of happiness by group
    if 'Group' in df.columns:
        plt.figure(figsize=(9, 6))
        order = ['Control', 'Intervention']
        sns.boxplot(data=df, x='Group', y='Happiness', order=order, palette=['#B0B0B0', '#73C6B6'])
        sns.stripplot(data=df, x='Group', y='Happiness', order=order, color='black', alpha=0.18, jitter=0.22, size=2)
        plt.title('Happiness Distribution by Group')
        plt.xlabel('Study group')
        plt.ylabel('Happiness score')
        plt.ylim(0, 10)
        finish_plot('02_happiness_distribution_by_group.png')

    # 3) Daily happiness trend by group
    if 'Group' in df.columns and 'Day' in df.columns:
        daily = df.groupby(['Group', 'Day'], as_index=False)['Happiness'].mean()
        plt.figure(figsize=(10, 6))
        sns.lineplot(data=daily, x='Day', y='Happiness', hue='Group', hue_order=['Control', 'Intervention'], marker='o')
        plt.title('Mean Daily Happiness Across the Study')
        plt.xlabel('Day of study')
        plt.ylabel('Average happiness')
        plt.ylim(0, 10)
        plt.xticks(range(1, 31, 2))
        finish_plot('03_daily_happiness_trend.png')

    # 4) Happiness by number of habits in intervention group only
    intervention_df = df[df['Group'] == 'Intervention'] if 'Group' in df.columns else df
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=intervention_df, x='Habits_Count', y='Happiness', color='#4C72B0')
    sns.stripplot(data=intervention_df, x='Habits_Count', y='Happiness', color='black', alpha=0.20, jitter=0.18, size=2)
    plt.title('Intervention Group: Happiness by Number of Habits Completed')
    plt.xlabel('Habits completed that day')
    plt.ylabel('Happiness score')
    plt.ylim(0, 10)
    finish_plot('04_happiness_by_habits_intervention.png')

    # 5) Mean happiness by habits count in intervention group
    habits_mean = intervention_df.groupby('Habits_Count', as_index=False)['Happiness'].mean()
    plt.figure(figsize=(8, 6))
    sns.lineplot(data=habits_mean, x='Habits_Count', y='Happiness', marker='o', color='#1F77B4')
    plt.title('Intervention Group: Mean Happiness vs Habits Completed')
    plt.xlabel('Number of habits completed')
    plt.ylabel('Mean happiness')
    plt.xticks([0, 1, 2, 3])
    plt.ylim(0, 10)
    finish_plot('05_mean_happiness_by_habits.png')

    # 6) Habit adherence rates in the intervention group
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    adherence_rates = (
        intervention_df[habit_cols]
        .mean()
        .sort_values(ascending=False)
        .reset_index()
    )
    # reset_index() leaves the columns named 'index' and 0; name them and
    # plot by the new name (the bar height must be the 'Rate' column).
    adherence_rates.columns = ['Habit', 'Rate']
    adherence_rates['Habit'] = adherence_rates['Habit'].str.replace('_Adherence', '', regex=False)
    plt.figure(figsize=(9, 6))
    sns.barplot(data=adherence_rates, x='Habit', y='Rate', color='#E76F51')
    plt.title('Intervention Group: Habit Completion Rate')
    plt.xlabel('Habit')
    plt.ylabel('Proportion completed')
    plt.ylim(0, 1)
    plt.gca().yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1.0))
    finish_plot('06_habit_completion_rate.png')

    # 7) Participant averages, grouped by study group
    if 'Group' in df.columns:
        participant_avg = df.groupby(['Group', 'Participant_ID'])['Happiness'].mean().reset_index()
        plt.figure(figsize=(12, 6))
        sns.barplot(
            data=participant_avg,
            x='Participant_ID',
            y='Happiness',
            hue='Group',
            dodge=True,
            palette=['#7A7A7A', '#2A9D8F'],
        )
        plt.title('Average Happiness per Participant')
        plt.xlabel('Participant ID')
        plt.ylabel('Mean happiness')
        plt.ylim(0, 10)
        plt.xticks(rotation=45)
        finish_plot('07_participant_average_happiness.png')

    logging.info('Saved plots to %s', outdir)
def main(args):
    """Run the full analysis: load, clean, summarise, model and plot.

    Args:
        args: Parsed command-line namespace providing ``data`` (CSV path),
            ``outdir`` (plot directory) and ``show`` (display plots flag).
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    # Effect size between the two extremes of habit completion; skipped
    # unless both samples have more than one row.
    habit_counts = df['Habits_Count']
    none_done = df.loc[habit_counts == 0, 'Happiness']
    all_done = df.loc[habit_counts == 3, 'Happiness']
    if len(none_done) > 1 and len(all_done) > 1:
        d = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    # Effect size between study arms, when the data records them.
    if 'Group' in df.columns:
        arm = df['Group']
        control = df.loc[arm == 'Control', 'Happiness']
        intervention = df.loc[arm == 'Intervention', 'Happiness']
        if len(control) > 1 and len(intervention) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Regression models first, figures last.
    run_ols(df)
    run_mixedlm(df)
    make_plots(df, args.outdir, show_plots=args.show)
if __name__ == '__main__':
    # Command-line entry point: collect the options and hand them to main().
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv',
    )
    parser.add_argument(
        '--data',
        type=str,
        default='organization_happiness_study_data.csv',
        help='CSV data path',
    )
    parser.add_argument(
        '--outdir',
        type=str,
        default='plots',
        help='Directory to save plots',
    )
    parser.add_argument(
        '--show',
        action='store_true',
        help='Show plots interactively',
    )
    args = parser.parse_args()
    main(args)

View file

@ -0,0 +1,270 @@
import argparse
import os
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def load_data(path):
    """Read the study CSV into a DataFrame and log how many rows it has.

    Args:
        path: Passed straight to ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
def prepare_data(df):
    """Validate and normalise the raw study data for analysis.

    Works on a copy, so the caller's DataFrame is left untouched.

    Steps:
      * require the participant, happiness and three adherence columns;
      * default a missing ``Group`` column to ``'Intervention'`` and tidy
        its labels (trimmed, title-cased);
      * map adherence answers (yes/no/true/false, any case, surrounding
        whitespace ignored) to booleans, leaving anything else missing;
      * add ``Habits_Count``, the number of habits marked done per row
        (missing answers count as not done);
      * coerce ``Happiness`` to numeric and drop rows where that fails.

    Args:
        df: Raw DataFrame as loaded from the CSV.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: If any required column is absent.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Copy first: every step below assigns columns, and the caller's frame
    # must not change as a side effect of cleaning.
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    answer_map = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(answer_map)

    # Count habits per row. Comparing with True treats unmapped/missing
    # answers as "not done" without calling fillna() on object columns.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))
    return df
def descriptive_stats(df):
    """Print summary statistics and habit/happiness correlations to stdout.

    Prints, in order: the frame shape, a ``describe()`` of ``Happiness``,
    per-group counts and means (only if a ``Group`` column exists),
    mean/count/std and median happiness per ``Habits_Count``, the Pearson
    correlation matrix of ``Habits_Count`` and ``Happiness``, and one
    point-biserial correlation per habit. The per-habit correlations use
    only the ``'Intervention'`` rows when ``Group`` exists.

    Args:
        df: Prepared data containing ``Happiness``, ``Habits_Count`` and
            the three ``*_Adherence`` columns (booleans with missing
            values allowed).
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())
    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())
        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))
    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))
    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))
    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in habit_cols:
        mask = ~habit_df[habit].isna()
        n_obs = int(mask.sum())
        if n_obs == 0:
            print(f'{habit:22} (no data)')
            continue
        if n_obs < 2:
            # A correlation needs at least two paired observations; report
            # the gap instead of letting the whole summary abort.
            print(f'{habit:22} (insufficient data)')
            continue
        r, p = stats.pointbiserialr(habit_df.loc[mask, habit].astype(int), habit_df.loc[mask, 'Happiness'])
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
def cohen_d(x, y):
    """Return Cohen's d for two independent samples using the pooled SD.

    Args:
        x: First sample; must offer ``mean()`` and ``std(ddof=1)``.
        y: Second sample, same requirements.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd``.
    """
    size_x = len(x)
    size_y = len(y)
    spread_x = x.std(ddof=1)
    spread_y = y.std(ddof=1)
    # Sample variances weighted by their own degrees of freedom.
    weighted_ss = (size_x - 1) * spread_x ** 2 + (size_y - 1) * spread_y ** 2
    pooled_sd = np.sqrt(weighted_ss / (size_x + size_y - 2))
    mean_gap = x.mean() - y.mean()
    return mean_gap / pooled_sd
def run_ols(df):
    """Fit an OLS regression of happiness on habit count and print it.

    When ``df`` has a ``Group`` column the formula also includes it as a
    categorical term; otherwise a constant-plus-``Habits_Count`` design is
    built by hand.

    Args:
        df: Prepared data with ``Happiness`` and ``Habits_Count``.

    Returns:
        The fitted statsmodels results object.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        outcome = df['Happiness']
        fitted = sm.OLS(outcome, design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    # The heading is printed only after a successful fit.
    print(heading)
    print(fitted.summary())
    return fitted
def run_mixedlm(df):
    """Fit and print a mixed model with a random intercept per participant.

    Best-effort: any failure while building, fitting or printing the model
    is logged as a warning and ``None`` is returned instead of raising.

    Args:
        df: Prepared data with ``Happiness``, ``Habits_Count`` and
            ``Participant_ID``.

    Returns:
        The fitted results object, or ``None`` on failure.
    """
    fitted = None
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        fitted = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(fitted.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return fitted
def make_plots(df, outdir, show_plots=False):
    """Render the study figures as PNG files in ``outdir``.

    Figures 1, 2 and 7 are drawn only when ``df`` has a ``Group`` column
    (figure 3 additionally needs ``Day``). Figures 4-6 use the
    ``'Intervention'`` rows when ``Group`` exists, otherwise all rows.

    Args:
        df: Prepared data with ``Happiness``, ``Habits_Count``,
            ``Participant_ID`` and the three ``*_Adherence`` columns.
        outdir: Output directory; created (with parents) if missing.
        show_plots: If true, also display each figure before closing it.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_theme(style='whitegrid', context='talk')
    group_order = ['Control', 'Intervention']
    # Fixed seed: the jitter is purely decorative, so re-running the script
    # should regenerate the same point positions.
    rng = np.random.default_rng(0)

    def finish_plot(filename):
        """Tighten, save the current figure under ``outdir``, optionally show, then close."""
        plt.tight_layout()
        plt.savefig(outdir / filename, dpi=200, bbox_inches='tight')
        if show_plots:
            plt.show()
        plt.close()

    def box_with_points(frame, face_color, point_alpha, point_size):
        """Draw one happiness box per study group with jittered raw points on top."""
        samples = [frame.loc[frame['Group'] == group, 'Happiness'].values for group in group_order]
        plt.boxplot(samples, patch_artist=True,
                    boxprops=dict(facecolor=face_color, color='#4C4C4C'),
                    medianprops=dict(color='#2A9D8F', linewidth=2),
                    whiskerprops=dict(color='#4C4C4C'), capprops=dict(color='#4C4C4C'))
        positions = list(range(1, len(group_order) + 1))
        # Label through xticks instead of boxplot(labels=...), which newer
        # Matplotlib releases deprecate.
        plt.xticks(positions, group_order)
        for xpos, ys in zip(positions, samples):
            xs = rng.normal(xpos, 0.06, size=len(ys))
            plt.scatter(xs, ys, color='black', alpha=point_alpha, s=point_size)

    # 1) Mean happiness by group with error bars
    if 'Group' in df.columns:
        summary = df.groupby('Group')['Happiness'].agg(['mean', 'std', 'count']).reindex(group_order)
        # Normal-approximation 95% interval around each group mean.
        ci95 = 1.96 * (summary['std'] / np.sqrt(summary['count']))
        plt.figure(figsize=(8, 6))
        xpos = np.arange(len(summary))
        plt.bar(xpos, summary['mean'].values, color=['#7A7A7A', '#2A9D8F'], yerr=ci95.values, capsize=6)
        plt.xticks(xpos, summary.index)
        plt.title('Average Happiness by Group')
        plt.xlabel('Study group')
        plt.ylabel('Mean happiness score')
        plt.ylim(0, 10)
        finish_plot('01_mean_happiness_by_group.png')

    # 2) Distribution of happiness by group
    if 'Group' in df.columns:
        plt.figure(figsize=(9, 6))
        box_with_points(df, face_color='#C9D1D9', point_alpha=0.15, point_size=10)
        plt.title('Happiness Distribution by Group')
        plt.xlabel('Study group')
        plt.ylabel('Happiness score')
        plt.ylim(0, 10)
        finish_plot('02_happiness_distribution_by_group.png')

    # 3) Daily happiness trend by group
    if 'Group' in df.columns and 'Day' in df.columns:
        daily = df.groupby(['Group', 'Day'], as_index=False)['Happiness'].mean()
        plt.figure(figsize=(10, 6))
        sns.lineplot(data=daily, x='Day', y='Happiness', hue='Group', hue_order=group_order, marker='o')
        plt.title('Mean Daily Happiness Across the Study')
        plt.xlabel('Day of study')
        plt.ylabel('Average happiness')
        plt.ylim(0, 10)
        plt.xticks(range(1, 31, 2))
        finish_plot('03_daily_happiness_trend.png')

    # 4) Happiness by number of habits in intervention group only
    intervention_df = df[df['Group'] == 'Intervention'] if 'Group' in df.columns else df
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=intervention_df, x='Habits_Count', y='Happiness', color='#4C72B0')
    sns.stripplot(data=intervention_df, x='Habits_Count', y='Happiness', color='black', alpha=0.20, jitter=0.18, size=2)
    plt.title('Intervention Group: Happiness by Number of Habits Completed')
    plt.xlabel('Habits completed that day')
    plt.ylabel('Happiness score')
    plt.ylim(0, 10)
    finish_plot('04_happiness_by_habits_intervention.png')

    # 5) Mean happiness by habits count in intervention group
    habits_mean = intervention_df.groupby('Habits_Count', as_index=False)['Happiness'].mean()
    plt.figure(figsize=(8, 6))
    sns.lineplot(data=habits_mean, x='Habits_Count', y='Happiness', marker='o', color='#1F77B4')
    plt.title('Intervention Group: Mean Happiness vs Habits Completed')
    plt.xlabel('Number of habits completed')
    plt.ylabel('Mean happiness')
    plt.xticks([0, 1, 2, 3])
    plt.ylim(0, 10)
    finish_plot('05_mean_happiness_by_habits.png')

    # 6) Habit adherence rates in the intervention group
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    adherence_rates = intervention_df[habit_cols].mean().sort_values(ascending=False).reset_index()
    adherence_rates.columns = ['Habit', 'Rate']
    adherence_rates['Habit'] = adherence_rates['Habit'].str.replace('_Adherence', '', regex=False)
    plt.figure(figsize=(9, 6))
    sns.barplot(data=adherence_rates, x='Habit', y='Rate', color='#E76F51')
    plt.title('Intervention Group: Habit Completion Rate')
    plt.xlabel('Habit')
    plt.ylabel('Proportion completed')
    plt.ylim(0, 1)
    plt.gca().yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1.0))
    finish_plot('06_habit_completion_rate.png')

    # 7) Participant average happiness by group
    if 'Group' in df.columns:
        plt.figure(figsize=(12, 6))
        participant_avg = df.groupby(['Group', 'Participant_ID'], as_index=False)['Happiness'].mean()
        box_with_points(participant_avg, face_color='#D6D6D6', point_alpha=0.45, point_size=22)
        plt.title('Average Happiness per Participant')
        plt.xlabel('Study group')
        plt.ylabel('Participant mean happiness')
        plt.ylim(0, 10)
        finish_plot('07_participant_average_happiness.png')

    logging.info('Saved plots to %s', outdir)
def main(args):
    """Run the full analysis: load, clean, summarise, model and plot.

    Args:
        args: Parsed command-line namespace providing ``data`` (CSV path),
            ``outdir`` (plot directory) and ``show`` (display plots flag).
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    # Effect size between the two extremes of habit completion; skipped
    # unless both samples have more than one row.
    habit_counts = df['Habits_Count']
    none_done = df.loc[habit_counts == 0, 'Happiness']
    all_done = df.loc[habit_counts == 3, 'Happiness']
    if len(none_done) > 1 and len(all_done) > 1:
        d = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    # Effect size between study arms, when the data records them.
    if 'Group' in df.columns:
        arm = df['Group']
        control = df.loc[arm == 'Control', 'Happiness']
        intervention = df.loc[arm == 'Intervention', 'Happiness']
        if len(control) > 1 and len(intervention) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Regression models first, figures last.
    run_ols(df)
    run_mixedlm(df)
    make_plots(df, args.outdir, show_plots=args.show)
if __name__ == '__main__':
    # Command-line entry point: collect the options and hand them to main().
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv',
    )
    parser.add_argument(
        '--data',
        type=str,
        default='organization_happiness_study_data.csv',
        help='CSV data path',
    )
    parser.add_argument(
        '--outdir',
        type=str,
        default='plots',
        help='Directory to save plots',
    )
    parser.add_argument(
        '--show',
        action='store_true',
        help='Show plots interactively',
    )
    args = parser.parse_args()
    main(args)

View file

@ -0,0 +1,189 @@
import argparse
import os
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def load_data(path):
    """Read the study CSV into a DataFrame and log how many rows it has.

    Args:
        path: Passed straight to ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
def prepare_data(df):
    """Validate and normalise the raw study data for analysis.

    Works on a copy, so the caller's DataFrame is left untouched.

    Steps:
      * require the participant, happiness and three adherence columns;
      * map adherence answers (yes/no/true/false, any case, surrounding
        whitespace ignored) to booleans, leaving anything else missing;
      * add ``Habits_Count``, the number of habits marked done per row
        (missing answers count as not done);
      * coerce ``Happiness`` to numeric and drop rows where that fails.

    Args:
        df: Raw DataFrame as loaded from the CSV.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: If any required column is absent.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Copy first: every step below assigns columns, and the caller's frame
    # must not change as a side effect of cleaning.
    df = df.copy()

    # Normalize adherence to boolean (Yes/No or True/False)
    answer_map = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(answer_map)

    # Count habits per row. Comparing with True treats unmapped/missing
    # answers as "not done" without calling fillna() on object columns.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))
    return df
def descriptive_stats(df):
    """Print summary statistics and habit/happiness correlations to stdout.

    Prints, in order: the frame shape, a ``describe()`` of ``Happiness``,
    mean/count/std and median happiness per ``Habits_Count``, the Pearson
    correlation matrix of ``Habits_Count`` and ``Happiness``, and one
    point-biserial correlation per habit.

    Args:
        df: Prepared data containing ``Happiness``, ``Habits_Count`` and
            the three ``*_Adherence`` columns (booleans with missing
            values allowed).
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())
    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))
    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))
    print('\nPoint-biserial correlation (each habit vs happiness):')
    for habit in habit_cols:
        mask = ~df[habit].isna()
        n_obs = int(mask.sum())
        if n_obs == 0:
            print(f'{habit:22} (no data)')
            continue
        if n_obs < 2:
            # A correlation needs at least two paired observations; report
            # the gap instead of letting the whole summary abort.
            print(f'{habit:22} (insufficient data)')
            continue
        r, p = stats.pointbiserialr(df.loc[mask, habit].astype(int), df.loc[mask, 'Happiness'])
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
def cohen_d(x, y):
    """Return Cohen's d for two independent samples using the pooled SD.

    Args:
        x: First sample; must offer ``mean()`` and ``std(ddof=1)``.
        y: Second sample, same requirements.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd``.
    """
    size_x = len(x)
    size_y = len(y)
    spread_x = x.std(ddof=1)
    spread_y = y.std(ddof=1)
    # Sample variances weighted by their own degrees of freedom.
    weighted_ss = (size_x - 1) * spread_x ** 2 + (size_y - 1) * spread_y ** 2
    pooled_sd = np.sqrt(weighted_ss / (size_x + size_y - 2))
    mean_gap = x.mean() - y.mean()
    return mean_gap / pooled_sd
def run_ols(df):
    """Fit and print a simple OLS regression of happiness on habit count.

    Args:
        df: Prepared data with ``Happiness`` and ``Habits_Count``.

    Returns:
        The fitted statsmodels results object.
    """
    # Explicit intercept column alongside the single predictor.
    design = sm.add_constant(df['Habits_Count'])
    outcome = df['Happiness']
    fitted = sm.OLS(outcome, design).fit()
    print('\nSimple OLS regression: Happiness ~ Habits_Count')
    print(fitted.summary())
    return fitted
def run_mixedlm(df):
    """Fit and print a mixed model with a random intercept per participant.

    Best-effort: any failure while building, fitting or printing the model
    is logged as a warning and ``None`` is returned instead of raising.

    Args:
        df: Prepared data with ``Happiness``, ``Habits_Count`` and
            ``Participant_ID``.

    Returns:
        The fitted results object, or ``None`` on failure.
    """
    fitted = None
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        fitted = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(fitted.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return fitted
def make_plots(df, outdir, show_plots=False):
    """Render four happiness-vs-habits figures as PNG files in ``outdir``.

    Figures: box plot and violin/strip plot of happiness per habit count,
    sorted per-participant mean happiness bars, and a jittered scatter with
    a linear fit.

    Args:
        df: Prepared data with ``Happiness``, ``Habits_Count`` and
            ``Participant_ID``.
        outdir: Output directory; created (with parents) if missing.
        show_plots: If true, also display each figure before closing it.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_style('whitegrid')

    def finish_plot(filename):
        """Tighten, save the current figure under ``outdir``, optionally show, then close."""
        plt.tight_layout()
        plt.savefig(outdir / filename)
        if show_plots:
            plt.show()
        plt.close()

    # NOTE(review): `palette` is passed without `hue` below; recent seaborn
    # releases appear to warn about that - confirm against the installed
    # version before changing the calls.

    # Boxplot by Habits_Count
    plt.figure(figsize=(9, 6))
    sns.boxplot(x='Habits_Count', y='Happiness', data=df, palette='viridis')
    plt.title('Daily Happiness by Number of Habits Completed')
    # Axis ranges spelled with a plain hyphen; the separators had been lost.
    plt.xlabel('Number of habits followed (0-3)')
    plt.ylabel('Happiness (1-10)')
    finish_plot('happiness_by_habits_box.png')

    # Violin / jitter + regression
    plt.figure(figsize=(9, 6))
    sns.violinplot(x='Habits_Count', y='Happiness', data=df, inner=None, palette='muted')
    sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
    plt.title('Happiness distribution by Habits Completed')
    finish_plot('happiness_by_habits_violin.png')

    # Participant average bar
    participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
    plt.figure(figsize=(12, 5))
    sns.barplot(x=participant_avg.index.astype(str), y=participant_avg.values, palette='coolwarm')
    # Dashed reference line at the overall mean happiness.
    plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
    plt.xticks(rotation=45)
    plt.title('Average Happiness per Participant (sorted)')
    finish_plot('participant_avg_happiness.png')

    # Scatter with linear fit
    plt.figure(figsize=(9, 6))
    sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
    plt.title('Happiness vs Number of Habits Completed (with linear fit)')
    finish_plot('happiness_vs_habits_regression.png')

    logging.info('Saved plots to %s', outdir)
def main(args):
    """Run the full analysis: load, clean, summarise, model and plot.

    Args:
        args: Parsed command-line namespace providing ``data`` (CSV path),
            ``outdir`` (plot directory) and ``show`` (display plots flag).
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    # Effect size example: days with all three habits vs days with none;
    # skipped unless both samples have more than one row.
    habit_counts = df['Habits_Count']
    none_done = df.loc[habit_counts == 0, 'Happiness']
    all_done = df.loc[habit_counts == 3, 'Happiness']
    if len(none_done) > 1 and len(all_done) > 1:
        d = cohen_d(all_done, none_done)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    # Regression models first, figures last.
    run_ols(df)
    run_mixedlm(df)
    make_plots(df, args.outdir, show_plots=args.show)
if __name__ == '__main__':
    # Command-line entry point: collect the options and hand them to main().
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv',
    )
    parser.add_argument(
        '--data',
        type=str,
        default='organization_happiness_study_data.csv',
        help='CSV data path',
    )
    parser.add_argument(
        '--outdir',
        type=str,
        default='plots',
        help='Directory to save plots',
    )
    parser.add_argument(
        '--show',
        action='store_true',
        help='Show plots interactively',
    )
    args = parser.parse_args()
    main(args)

View file

@ -0,0 +1,231 @@
import argparse
import os
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def load_data(path):
    """Read the study CSV into a DataFrame and log how many rows it has.

    Args:
        path: Passed straight to ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path)
    row_total = len(frame)
    logging.info("Loaded %d rows from %s", row_total, path)
    return frame
def prepare_data(df):
    """Validate and normalise the raw study data for analysis.

    Works on a copy, so the caller's DataFrame is left untouched.

    Steps:
      * require the participant, happiness and three adherence columns;
      * default a missing ``Group`` column to ``'Intervention'`` and tidy
        its labels (trimmed, title-cased);
      * map adherence answers (yes/no/true/false, any case, surrounding
        whitespace ignored) to booleans, leaving anything else missing;
      * add ``Habits_Count``, the number of habits marked done per row
        (missing answers count as not done);
      * coerce ``Happiness`` to numeric and drop rows where that fails.

    Args:
        df: Raw DataFrame as loaded from the CSV.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: If any required column is absent.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Copy first: every step below assigns columns, and the caller's frame
    # must not change as a side effect of cleaning.
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    answer_map = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(answer_map)

    # Count habits per row. Comparing with True treats unmapped/missing
    # answers as "not done" without calling fillna() on object columns.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))
    return df
def descriptive_stats(df):
    """Print summary statistics and habit/happiness correlations to stdout.

    Prints, in order: the frame shape, a ``describe()`` of ``Happiness``,
    per-group counts and means (only if a ``Group`` column exists),
    mean/count/std and median happiness per ``Habits_Count``, the Pearson
    correlation matrix of ``Habits_Count`` and ``Happiness``, and one
    point-biserial correlation per habit. The per-habit correlations use
    only the ``'Intervention'`` rows when ``Group`` exists.

    Args:
        df: Prepared data containing ``Happiness``, ``Habits_Count`` and
            the three ``*_Adherence`` columns (booleans with missing
            values allowed).
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())
    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())
        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))
    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))
    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))
    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in habit_cols:
        mask = ~habit_df[habit].isna()
        n_obs = int(mask.sum())
        if n_obs == 0:
            print(f'{habit:22} (no data)')
            continue
        if n_obs < 2:
            # A correlation needs at least two paired observations; report
            # the gap instead of letting the whole summary abort.
            print(f'{habit:22} (insufficient data)')
            continue
        r, p = stats.pointbiserialr(habit_df.loc[mask, habit].astype(int), habit_df.loc[mask, 'Happiness'])
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
def cohen_d(x, y):
    """Return Cohen's d for two independent samples using the pooled SD.

    Args:
        x: First sample; must offer ``mean()`` and ``std(ddof=1)``.
        y: Second sample, same requirements.

    Returns:
        ``(mean(x) - mean(y)) / pooled_sd``.
    """
    size_x = len(x)
    size_y = len(y)
    spread_x = x.std(ddof=1)
    spread_y = y.std(ddof=1)
    # Sample variances weighted by their own degrees of freedom.
    weighted_ss = (size_x - 1) * spread_x ** 2 + (size_y - 1) * spread_y ** 2
    pooled_sd = np.sqrt(weighted_ss / (size_x + size_y - 2))
    mean_gap = x.mean() - y.mean()
    return mean_gap / pooled_sd
def run_ols(df):
    """Fit an OLS regression of happiness on habit count and print it.

    When ``df`` has a ``Group`` column the formula also includes it as a
    categorical term; otherwise a constant-plus-``Habits_Count`` design is
    built by hand.

    Args:
        df: Prepared data with ``Happiness`` and ``Habits_Count``.

    Returns:
        The fitted statsmodels results object.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        outcome = df['Happiness']
        fitted = sm.OLS(outcome, design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    # The heading is printed only after a successful fit.
    print(heading)
    print(fitted.summary())
    return fitted
def run_mixedlm(df):
    """Fit and print a mixed model with a random intercept per participant.

    Best-effort: any failure while building, fitting or printing the model
    is logged as a warning and ``None`` is returned instead of raising.

    Args:
        df: Prepared data with ``Happiness``, ``Habits_Count`` and
            ``Participant_ID``.

    Returns:
        The fitted results object, or ``None`` on failure.
    """
    fitted = None
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        fitted = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(fitted.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return fitted
def make_plots(df, outdir, show_plots=False):
    """Render the happiness-vs-habits figures as PNG files in ``outdir``.

    Figures: box plot and violin/strip plot of happiness per habit count,
    sorted per-participant mean happiness bars, mean happiness per study
    group (only when ``Group`` exists), and a scatter of happiness against
    habit count with a linear fit.

    Args:
        df: Prepared data with ``Happiness``, ``Habits_Count`` and
            ``Participant_ID``; ``Group`` is optional.
        outdir: Output directory; created (with parents) if missing.
        show_plots: If true, also display each figure before closing it.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_style('whitegrid')
    has_group = 'Group' in df.columns

    def finish_plot(filename):
        """Tighten, save the current figure under ``outdir``, optionally show, then close."""
        plt.tight_layout()
        plt.savefig(outdir / filename)
        if show_plots:
            plt.show()
        plt.close()

    def hide_legend():
        """Replace the legend with an empty one; hue only carries the colours here."""
        plt.legend([], [], frameon=False)

    # Boxplot by Habits_Count
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=df, x='Habits_Count', y='Happiness', hue='Habits_Count', palette='viridis', dodge=False)
    hide_legend()
    plt.title('Daily Happiness by Number of Habits Completed')
    # Axis ranges spelled with a plain hyphen; the separators had been lost.
    plt.xlabel('Number of habits followed (0-3)')
    plt.ylabel('Happiness (1-10)')
    finish_plot('happiness_by_habits_box.png')

    # Violin / jitter + regression
    plt.figure(figsize=(9, 6))
    sns.violinplot(data=df, x='Habits_Count', y='Happiness', hue='Habits_Count', inner=None, palette='muted', dodge=False)
    hide_legend()
    sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
    plt.title('Happiness distribution by Habits Completed')
    finish_plot('happiness_by_habits_violin.png')

    # Participant average bar
    participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
    slots = list(range(len(participant_avg)))
    plt.figure(figsize=(12, 5))
    sns.barplot(x=slots, y=participant_avg.values, hue=slots, palette='coolwarm', dodge=False)
    hide_legend()
    # Dashed reference line at the overall mean happiness.
    plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
    plt.xticks(slots, participant_avg.index.astype(str), rotation=45)
    plt.title('Average Happiness per Participant (sorted)')
    finish_plot('participant_avg_happiness.png')

    if has_group:
        plt.figure(figsize=(7, 5))
        sns.barplot(data=df, x='Group', y='Happiness', hue='Group', estimator='mean', errorbar='sd', palette='Set2', dodge=False)
        hide_legend()
        plt.title('Mean Happiness by Group')
        plt.ylabel('Average happiness')
        finish_plot('happiness_by_group.png')

    # Scatter with linear fit
    plt.figure(figsize=(9, 6))
    if has_group:
        sns.scatterplot(data=df, x='Habits_Count', y='Happiness', hue='Group', alpha=0.35)
        # The title promises a linear fit, so overlay the regression line
        # on the group-coloured points (line only, no second scatter).
        sns.regplot(x='Habits_Count', y='Happiness', data=df, scatter=False, color='black')
    else:
        sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
    plt.title('Happiness vs Number of Habits Completed (with linear fit)')
    finish_plot('happiness_vs_habits_regression.png')

    logging.info('Saved plots to %s', outdir)
def main(args):
    """Run the whole analysis: load, clean, describe, model, and plot.

    Args:
        args: Parsed command-line namespace providing ``data`` (CSV path),
            ``outdir`` (plot directory) and ``show`` (display plots flag).
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    # Effect sizes
    happiness = df['Happiness']
    no_habits = happiness[df['Habits_Count'] == 0]
    all_habits = happiness[df['Habits_Count'] == 3]
    # NOTE(review): this comparison uses every row, so any control-group rows
    # with a habit count of 0 are pooled in too - confirm that is intended.
    if min(len(no_habits), len(all_habits)) > 1:
        d = cohen_d(all_habits, no_habits)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    if 'Group' in df.columns:
        control = happiness[df['Group'] == 'Control']
        intervention = happiness[df['Group'] == 'Intervention']
        if min(len(control), len(intervention)) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Models
    run_ols(df)
    run_mixedlm(df)

    # Plots
    make_plots(df, args.outdir, show_plots=args.show)
if __name__ == '__main__':
    # Script entry point: declare the command-line options, parse them and
    # hand the resulting namespace to main().
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv'
    )
    for flag, spec in (
        ('--data', {'type': str, 'default': 'organization_happiness_study_data.csv', 'help': 'CSV data path'}),
        ('--outdir', {'type': str, 'default': 'plots', 'help': 'Directory to save plots'}),
        ('--show', {'action': 'store_true', 'help': 'Show plots interactively'}),
    ):
        parser.add_argument(flag, **spec)
    args = parser.parse_args()
    main(args)

View file

@ -0,0 +1,270 @@
import argparse
import os
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Root logger setup: INFO and above, printed as "LEVEL: message".
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def load_data(path):
    """Read the study CSV at ``path`` into a DataFrame and log its row count.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or buffer).

    Returns:
        pandas.DataFrame: The parsed table, unmodified.
    """
    frame = pd.read_csv(path)
    row_count = len(frame)
    logging.info("Loaded %d rows from %s", row_count, path)
    return frame
def prepare_data(df):
    """Validate and normalize the raw study table for analysis.

    Works on a copy, so the caller's DataFrame is left untouched.

    Steps:
        * verify the required columns are present;
        * add a ``Group`` column ('Intervention') when absent and normalize
          its spelling (stripped, title-cased);
        * map the three adherence columns to booleans (yes/no/true/false,
          case-insensitive); anything else becomes missing;
        * add ``Habits_Count`` = number of habits marked done on that row
          (missing values count as not done);
        * coerce ``Happiness`` to numeric and drop rows where that fails.

    Args:
        df: Raw DataFrame as loaded from the CSV.

    Returns:
        pandas.DataFrame: A cleaned copy of ``df``.

    Raises:
        KeyError: If any required column is missing.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        # Sorted so the message is stable from run to run
        raise KeyError(f"Missing required columns: {sorted(missing)}")

    # Never modify the caller's frame in place
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_table = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_table)

    # Count habits per row; comparing against True treats missing values as
    # "not done" without relying on fillna() downcasting object columns.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1).astype(int)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))
    return df
def descriptive_stats(df):
    """Print descriptive summaries and simple correlations for the study data.

    Prints, in order: dataset shape, overall Happiness summary, per-group
    counts and means (when a ``Group`` column exists), Happiness by
    ``Habits_Count``, the Pearson correlation between ``Habits_Count`` and
    ``Happiness``, and a point-biserial correlation of each habit with
    Happiness restricted to the intervention group.

    Args:
        df: Prepared DataFrame with ``Happiness``, ``Habits_Count`` and the
            three adherence columns.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())
    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())
        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))
    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))
    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))
    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in habit_cols:
        observed = habit_df[habit].notna()
        if not observed.any():
            print(f'{habit:22} (no data)')
            continue
        flags = habit_df.loc[observed, habit].astype(int)
        scores = habit_df.loc[observed, 'Happiness']
        # pointbiserialr raises on fewer than two observations and is
        # undefined when either variable is constant, so report that instead.
        if len(flags) < 2 or flags.nunique() < 2 or scores.nunique() < 2:
            print(f'{habit:22} (not enough variation to correlate)')
            continue
        r, p = stats.pointbiserialr(flags, scores)
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
def cohen_d(x, y):
    """Return Cohen's d for two independent samples (x minus y).

    The difference in means is standardized by the pooled sample standard
    deviation (ddof=1).

    Args:
        x: 1-D array-like of numeric observations (Series, ndarray or list).
        y: 1-D array-like of numeric observations.

    Returns:
        float: The effect size. NaN when either sample has fewer than two
        non-missing observations; +/-inf (or NaN when the means are also
        equal) when the pooled standard deviation is zero.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # Drop missing values first so the sample sizes agree with the variances
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # A sample variance needs at least two observations per group
        return float('nan')

    dof = nx + ny - 2
    pooled_sd = np.sqrt(((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof)
    mean_diff = x.mean() - y.mean()
    if pooled_sd == 0:
        # Same values a float division would give, without runtime warnings
        if mean_diff == 0:
            return float('nan')
        return float(np.copysign(np.inf, mean_diff))
    return float(mean_diff / pooled_sd)
def run_ols(df):
    """Fit an OLS regression of Happiness on Habits_Count and print its summary.

    With a ``Group`` column the model is fitted through the formula interface
    with Group as a categorical covariate; without one, a plain
    intercept-plus-slope model is fitted.

    Args:
        df: Prepared DataFrame containing ``Happiness`` and ``Habits_Count``.

    Returns:
        The fitted statsmodels results object.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        fitted = sm.OLS(df['Happiness'], design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    print(heading)
    print(fitted.summary())
    return fitted
def run_mixedlm(df):
    """Fit a mixed-effects model with a random intercept per participant.

    The model is ``Happiness ~ Habits_Count`` grouped by ``Participant_ID``
    and fitted by maximum likelihood (``reml=False``). Any failure while
    building, fitting or printing is logged as a warning, not raised.

    Args:
        df: Prepared DataFrame.

    Returns:
        The fitted results object, or ``None`` when the model failed.
    """
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        result = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(result.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return result
def make_plots(df, outdir, show_plots=False):
    """Render the study figures and save them as PNG files in ``outdir``.

    Figures 1-3 and 7 compare the Control and Intervention groups and are
    only drawn when a ``Group`` column exists (figure 3 also needs ``Day``).
    Figures 4-6 describe habit completion within the intervention rows and
    are skipped when there are none.

    Args:
        df: Prepared DataFrame (``Happiness``, ``Habits_Count``, adherence
            columns, optionally ``Group`` and ``Day``).
        outdir: Directory for the PNG files; created if missing.
        show_plots: When True, also display each figure interactively.
    """
    # Imported here so the formatter does not depend on pyplot happening to
    # expose ``plt.matplotlib.ticker``.
    from matplotlib.ticker import PercentFormatter

    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_theme(style='whitegrid', context='talk')

    group_order = ['Control', 'Intervention']
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns
    # Fixed seed: the jittered points land in the same place on every run,
    # so regenerated figures are identical.
    rng = np.random.default_rng(0)

    def finish_plot(filename):
        """Lay out, save, optionally show, and close the current figure."""
        plt.tight_layout()
        plt.savefig(outdir / filename, dpi=200, bbox_inches='tight')
        if show_plots:
            plt.show()
        plt.close()

    def boxplot_with_points(samples, box_color, point_alpha, point_size):
        """Draw one box per group with the raw values jittered on top."""
        plt.boxplot(samples, tick_labels=group_order, patch_artist=True,
                    boxprops=dict(facecolor=box_color, color='#4C4C4C'),
                    medianprops=dict(color='#2A9D8F', linewidth=2),
                    whiskerprops=dict(color='#4C4C4C'), capprops=dict(color='#4C4C4C'))
        for position, values in enumerate(samples, start=1):
            jittered_x = rng.normal(position, 0.06, size=len(values))
            plt.scatter(jittered_x, values, color='black', alpha=point_alpha, s=point_size)

    # 1) Mean happiness by group with error bars
    if has_group:
        summary = df.groupby('Group')['Happiness'].agg(['mean', 'std', 'count']).reindex(group_order)
        # Normal-approximation 95% confidence interval of the mean
        ci95 = 1.96 * (summary['std'] / np.sqrt(summary['count']))
        plt.figure(figsize=(8, 6))
        xpos = np.arange(len(summary))
        plt.bar(xpos, summary['mean'].values, color=['#7A7A7A', '#2A9D8F'], yerr=ci95.values, capsize=6)
        plt.xticks(xpos, summary.index)
        plt.title('Average Happiness by Group')
        plt.xlabel('Study group')
        plt.ylabel('Mean happiness score')
        plt.ylim(0, 10)
        finish_plot('01_mean_happiness_by_group.png')

    # 2) Distribution of happiness by group
    if has_group:
        plt.figure(figsize=(9, 6))
        by_group = [df.loc[df['Group'] == group, 'Happiness'].values for group in group_order]
        boxplot_with_points(by_group, box_color='#C9D1D9', point_alpha=0.15, point_size=10)
        plt.title('Happiness Distribution by Group')
        plt.xlabel('Study group')
        plt.ylabel('Happiness score')
        plt.ylim(0, 10)
        finish_plot('02_happiness_distribution_by_group.png')

    # 3) Daily happiness trend by group
    if has_group and 'Day' in df.columns:
        daily = df.groupby(['Group', 'Day'], as_index=False)['Happiness'].mean()
        plt.figure(figsize=(10, 6))
        sns.lineplot(data=daily, x='Day', y='Happiness', hue='Group', hue_order=group_order, marker='o')
        plt.title('Mean Daily Happiness Across the Study')
        plt.xlabel('Day of study')
        plt.ylabel('Average happiness')
        plt.ylim(0, 10)
        plt.xticks(range(1, 31, 2))
        finish_plot('03_daily_happiness_trend.png')

    intervention_df = df[df['Group'] == 'Intervention'] if has_group else df
    if intervention_df.empty:
        logging.warning('No intervention rows found; skipping habit-level plots')
    else:
        # 4) Happiness by number of habits in intervention group only
        plt.figure(figsize=(9, 6))
        sns.boxplot(data=intervention_df, x='Habits_Count', y='Happiness', color='#4C72B0')
        sns.stripplot(data=intervention_df, x='Habits_Count', y='Happiness',
                      color='black', alpha=0.20, jitter=0.18, size=2)
        plt.title('Intervention Group: Happiness by Number of Habits Completed')
        plt.xlabel('Habits completed that day')
        plt.ylabel('Happiness score')
        plt.ylim(0, 10)
        finish_plot('04_happiness_by_habits_intervention.png')

        # 5) Mean happiness by habits count in intervention group
        habits_mean = intervention_df.groupby('Habits_Count', as_index=False)['Happiness'].mean()
        plt.figure(figsize=(8, 6))
        sns.lineplot(data=habits_mean, x='Habits_Count', y='Happiness', marker='o', color='#1F77B4')
        plt.title('Intervention Group: Mean Happiness vs Habits Completed')
        plt.xlabel('Number of habits completed')
        plt.ylabel('Mean happiness')
        plt.xticks([0, 1, 2, 3])
        plt.ylim(0, 10)
        finish_plot('05_mean_happiness_by_habits.png')

        # 6) Habit adherence rates in the intervention group
        # Cast to float first: the adherence columns may be object dtype
        # (True/False/NaN), which does not average reliably.
        adherence_rates = (
            intervention_df[habit_cols].astype(float).mean()
            .sort_values(ascending=False)
            .reset_index()
        )
        adherence_rates.columns = ['Habit', 'Rate']
        adherence_rates['Habit'] = adherence_rates['Habit'].str.replace('_Adherence', '', regex=False)
        plt.figure(figsize=(9, 6))
        sns.barplot(data=adherence_rates, x='Habit', y='Rate', color='#E76F51')
        plt.title('Intervention Group: Habit Completion Rate')
        plt.xlabel('Habit')
        plt.ylabel('Proportion completed')
        plt.ylim(0, 1)
        plt.gca().yaxis.set_major_formatter(PercentFormatter(1.0))
        finish_plot('06_habit_completion_rate.png')

    # 7) Participant average happiness by group
    if has_group:
        plt.figure(figsize=(12, 6))
        participant_avg = df.groupby(['Group', 'Participant_ID'], as_index=False)['Happiness'].mean()
        avgs_by_group = [
            participant_avg.loc[participant_avg['Group'] == group, 'Happiness'].values
            for group in group_order
        ]
        boxplot_with_points(avgs_by_group, box_color='#D6D6D6', point_alpha=0.45, point_size=22)
        plt.title('Average Happiness per Participant')
        plt.xlabel('Study group')
        plt.ylabel('Participant mean happiness')
        plt.ylim(0, 10)
        finish_plot('07_participant_average_happiness.png')

    logging.info('Saved plots to %s', outdir)
def main(args):
    """Run the whole analysis: load, clean, describe, model, and plot.

    Args:
        args: Parsed command-line namespace providing ``data`` (CSV path),
            ``outdir`` (plot directory) and ``show`` (display plots flag).
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    # Effect sizes
    happiness = df['Happiness']
    no_habits = happiness[df['Habits_Count'] == 0]
    all_habits = happiness[df['Habits_Count'] == 3]
    # NOTE(review): this comparison uses every row, so any control-group rows
    # with a habit count of 0 are pooled in too - confirm that is intended.
    if min(len(no_habits), len(all_habits)) > 1:
        d = cohen_d(all_habits, no_habits)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    if 'Group' in df.columns:
        control = happiness[df['Group'] == 'Control']
        intervention = happiness[df['Group'] == 'Intervention']
        if min(len(control), len(intervention)) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Models
    run_ols(df)
    run_mixedlm(df)

    # Plots
    make_plots(df, args.outdir, show_plots=args.show)
if __name__ == '__main__':
    # Script entry point: declare the command-line options, parse them and
    # hand the resulting namespace to main().
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv'
    )
    for flag, spec in (
        ('--data', {'type': str, 'default': 'organization_happiness_study_data.csv', 'help': 'CSV data path'}),
        ('--outdir', {'type': str, 'default': 'plots', 'help': 'Directory to save plots'}),
        ('--show', {'action': 'store_true', 'help': 'Show plots interactively'}),
    ):
        parser.add_argument(flag, **spec)
    args = parser.parse_args()
    main(args)

View file

@ -0,0 +1 @@
{"version":1,"resource":"file:///home/breadway/Documents/Year%2010/Year%2010/Psychology/Data%20Analysis.py","entries":[{"id":"SA9R.py","source":"Chat Edit: 'improve on this analysis script'","timestamp":1774345116327},{"id":"ycv3.py","source":"Chat Edit: 'improve data gen.py to add a second dataset as a control. for context, the study tracks the affects of being organised on how happy participants feel. there needs to be a control group that is only recording their happiness daily. the main group will try to record their happiness, will add all events to their calendar, be on time to every event, and clean their bedroom everyday. they report if they do any of these in the study data as a yes or no. the control group will not do any of these.'","timestamp":1774345356264},{"id":"bwYb.py","source":"Chat Edit: 'improve data gen.py to add a second dataset as a control. for context, the study tracks the affects of being organised on how happy participants feel. there needs to be a control group that is only recording their happiness daily. the main group will try to record their happiness, will add all events to their calendar, be on time to every event, and clean their bedroom everyday. they report if they do any of these in the study data as a yes or no. the control group will not do any of these.'","timestamp":1774345411358},{"id":"Gx76.py","source":"Chat Edit: 'improve data gen.py to add a second dataset as a control. for context, the study tracks the affects of being organised on how happy participants feel. there needs to be a control group that is only recording their happiness daily. the main group will try to record their happiness, will add all events to their calendar, be on time to every event, and clean their bedroom everyday. they report if they do any of these in the study data as a yes or no. 
the control group will not do any of these.'","timestamp":1774345436946},{"id":"FOyN.py","source":"Chat Edit: 'improve data gen.py to add a second dataset as a control. for context, the study tracks the affects of being organised on how happy participants feel. there needs to be a control group that is only recording their happiness daily. the main group will try to record their happiness, will add all events to their calendar, be on time to every event, and clean their bedroom everyday. they report if they do any of these in the study data as a yes or no. the control group will not do any of these.'","timestamp":1774345501736},{"id":"MtI5.py","source":"Chat Edit: 'make the graphs better suited to the study, easier to read, and more graphs.'","timestamp":1774346145201},{"id":"Ldgu.py","source":"Chat Edit: 'make the graphs better suited to the study, easier to read, and more graphs.'","timestamp":1774346200970},{"id":"NtsI.py","source":"Chat Edit: 'make the graphs better suited to the study, easier to read, and more graphs.'","timestamp":1774346222014},{"id":"enQE.py","source":"Chat Edit: 'make the graphs better suited to the study, easier to read, and more graphs.'","timestamp":1774346258056},{"id":"yfjL.py","timestamp":1774346751804},{"id":"9KVj.py","source":"Chat Edit: 'ensure the graphs being used are appropriate for the study'","timestamp":1774346803522}]}

View file

@ -0,0 +1,227 @@
import argparse
import os
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Root logger setup: INFO and above, printed as "LEVEL: message".
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def load_data(path):
    """Read the study CSV at ``path`` into a DataFrame and log its row count.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or buffer).

    Returns:
        pandas.DataFrame: The parsed table, unmodified.
    """
    frame = pd.read_csv(path)
    row_count = len(frame)
    logging.info("Loaded %d rows from %s", row_count, path)
    return frame
def prepare_data(df):
    """Validate and normalize the raw study table for analysis.

    Works on a copy, so the caller's DataFrame is left untouched.

    Steps:
        * verify the required columns are present;
        * add a ``Group`` column ('Intervention') when absent and normalize
          its spelling (stripped, title-cased);
        * map the three adherence columns to booleans (yes/no/true/false,
          case-insensitive); anything else becomes missing;
        * add ``Habits_Count`` = number of habits marked done on that row
          (missing values count as not done);
        * coerce ``Happiness`` to numeric and drop rows where that fails.

    Args:
        df: Raw DataFrame as loaded from the CSV.

    Returns:
        pandas.DataFrame: A cleaned copy of ``df``.

    Raises:
        KeyError: If any required column is missing.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        # Sorted so the message is stable from run to run
        raise KeyError(f"Missing required columns: {sorted(missing)}")

    # Never modify the caller's frame in place
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_table = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_table)

    # Count habits per row; comparing against True treats missing values as
    # "not done" without relying on fillna() downcasting object columns.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1).astype(int)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))
    return df
def descriptive_stats(df):
    """Print descriptive summaries and simple correlations for the study data.

    Prints, in order: dataset shape, overall Happiness summary, per-group
    counts and means (when a ``Group`` column exists), Happiness by
    ``Habits_Count``, the Pearson correlation between ``Habits_Count`` and
    ``Happiness``, and a point-biserial correlation of each habit with
    Happiness restricted to the intervention group.

    Args:
        df: Prepared DataFrame with ``Happiness``, ``Habits_Count`` and the
            three adherence columns.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())
    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())
        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))
    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))
    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))
    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in habit_cols:
        observed = habit_df[habit].notna()
        if not observed.any():
            print(f'{habit:22} (no data)')
            continue
        flags = habit_df.loc[observed, habit].astype(int)
        scores = habit_df.loc[observed, 'Happiness']
        # pointbiserialr raises on fewer than two observations and is
        # undefined when either variable is constant, so report that instead.
        if len(flags) < 2 or flags.nunique() < 2 or scores.nunique() < 2:
            print(f'{habit:22} (not enough variation to correlate)')
            continue
        r, p = stats.pointbiserialr(flags, scores)
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
def cohen_d(x, y):
    """Return Cohen's d for two independent samples (x minus y).

    The difference in means is standardized by the pooled sample standard
    deviation (ddof=1).

    Args:
        x: 1-D array-like of numeric observations (Series, ndarray or list).
        y: 1-D array-like of numeric observations.

    Returns:
        float: The effect size. NaN when either sample has fewer than two
        non-missing observations; +/-inf (or NaN when the means are also
        equal) when the pooled standard deviation is zero.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # Drop missing values first so the sample sizes agree with the variances
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # A sample variance needs at least two observations per group
        return float('nan')

    dof = nx + ny - 2
    pooled_sd = np.sqrt(((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof)
    mean_diff = x.mean() - y.mean()
    if pooled_sd == 0:
        # Same values a float division would give, without runtime warnings
        if mean_diff == 0:
            return float('nan')
        return float(np.copysign(np.inf, mean_diff))
    return float(mean_diff / pooled_sd)
def run_ols(df):
    """Fit an OLS regression of Happiness on Habits_Count and print its summary.

    With a ``Group`` column the model is fitted through the formula interface
    with Group as a categorical covariate; without one, a plain
    intercept-plus-slope model is fitted.

    Args:
        df: Prepared DataFrame containing ``Happiness`` and ``Habits_Count``.

    Returns:
        The fitted statsmodels results object.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        fitted = sm.OLS(df['Happiness'], design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    print(heading)
    print(fitted.summary())
    return fitted
def run_mixedlm(df):
    """Fit a mixed-effects model with a random intercept per participant.

    The model is ``Happiness ~ Habits_Count`` grouped by ``Participant_ID``
    and fitted by maximum likelihood (``reml=False``). Any failure while
    building, fitting or printing is logged as a warning, not raised.

    Args:
        df: Prepared DataFrame.

    Returns:
        The fitted results object, or ``None`` when the model failed.
    """
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        result = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(result.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return result
def make_plots(df, outdir, show_plots=False):
    """Save the exploratory figures for the study as PNG files in ``outdir``.

    Draws a boxplot and a violin plot of Happiness by ``Habits_Count``, a
    sorted bar chart of each participant's mean Happiness, a per-group mean
    bar chart (when ``Group`` exists) and a scatter plot with a linear fit.

    Args:
        df: Prepared DataFrame (``Participant_ID``, ``Happiness``,
            ``Habits_Count``, optionally ``Group``).
        outdir: Directory for the PNG files; created if missing.
        show_plots: When True, also display each figure interactively.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_style('whitegrid')

    def finish_plot(filename):
        """Lay out, save, optionally show, and close the current figure."""
        plt.tight_layout()
        plt.savefig(outdir / filename)
        if show_plots:
            plt.show()
        plt.close()

    def hide_legend():
        """Remove the legend created by mapping hue to the x variable."""
        plt.legend([], [], frameon=False)

    # seaborn deprecates ``palette`` without ``hue``; each palette-coloured
    # plot therefore maps hue to its x variable and hides the extra legend.

    # Boxplot by Habits_Count
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=df, x='Habits_Count', y='Happiness', hue='Habits_Count',
                palette='viridis', dodge=False)
    hide_legend()
    plt.title('Daily Happiness by Number of Habits Completed')
    plt.xlabel('Number of habits followed (0-3)')
    plt.ylabel('Happiness (1-10)')
    finish_plot('happiness_by_habits_box.png')

    # Violin with jittered raw points
    plt.figure(figsize=(9, 6))
    sns.violinplot(data=df, x='Habits_Count', y='Happiness', hue='Habits_Count',
                   inner=None, palette='muted', dodge=False)
    hide_legend()
    sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
    plt.title('Happiness distribution by Habits Completed')
    finish_plot('happiness_by_habits_violin.png')

    # Participant average bar
    participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
    positions = list(range(len(participant_avg)))
    plt.figure(figsize=(12, 5))
    sns.barplot(x=positions, y=participant_avg.values, hue=positions,
                palette='coolwarm', dodge=False)
    hide_legend()
    # Dashed line marks the overall mean for reference
    plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
    plt.xticks(positions, participant_avg.index.astype(str), rotation=45)
    plt.title('Average Happiness per Participant (sorted)')
    finish_plot('participant_avg_happiness.png')

    # Mean happiness per group
    if 'Group' in df.columns:
        plt.figure(figsize=(7, 5))
        sns.barplot(data=df, x='Group', y='Happiness', hue='Group', estimator='mean',
                    errorbar='sd', palette='Set2', dodge=False)
        hide_legend()
        plt.title('Mean Happiness by Group')
        plt.ylabel('Average happiness')
        finish_plot('happiness_by_group.png')

    # Scatter with linear fit
    plt.figure(figsize=(9, 6))
    if 'Group' in df.columns:
        sns.scatterplot(data=df, x='Habits_Count', y='Happiness', hue='Group', alpha=0.35)
        # Overlay the fit the title promises.
        # NOTE(review): the line is fitted on all rows pooled across groups.
        sns.regplot(data=df, x='Habits_Count', y='Happiness', scatter=False, color='black')
    else:
        sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
    plt.title('Happiness vs Number of Habits Completed (with linear fit)')
    finish_plot('happiness_vs_habits_regression.png')

    logging.info('Saved plots to %s', outdir)
def main(args):
    """Run the whole analysis: load, clean, describe, model, and plot.

    Args:
        args: Parsed command-line namespace providing ``data`` (CSV path),
            ``outdir`` (plot directory) and ``show`` (display plots flag).
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    # Effect sizes
    happiness = df['Happiness']
    no_habits = happiness[df['Habits_Count'] == 0]
    all_habits = happiness[df['Habits_Count'] == 3]
    # NOTE(review): this comparison uses every row, so any control-group rows
    # with a habit count of 0 are pooled in too - confirm that is intended.
    if min(len(no_habits), len(all_habits)) > 1:
        d = cohen_d(all_habits, no_habits)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    if 'Group' in df.columns:
        control = happiness[df['Group'] == 'Control']
        intervention = happiness[df['Group'] == 'Intervention']
        if min(len(control), len(intervention)) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Models
    run_ols(df)
    run_mixedlm(df)

    # Plots
    make_plots(df, args.outdir, show_plots=args.show)
if __name__ == '__main__':
    # Script entry point: declare the command-line options, parse them and
    # hand the resulting namespace to main().
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv'
    )
    for flag, spec in (
        ('--data', {'type': str, 'default': 'organization_happiness_study_data.csv', 'help': 'CSV data path'}),
        ('--outdir', {'type': str, 'default': 'plots', 'help': 'Directory to save plots'}),
        ('--show', {'action': 'store_true', 'help': 'Show plots interactively'}),
    ):
        parser.add_argument(flag, **spec)
    args = parser.parse_args()
    main(args)

View file

@ -0,0 +1,227 @@
import argparse
import os
from pathlib import Path
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Root logger setup: INFO and above, printed as "LEVEL: message".
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def load_data(path):
    """Read the study CSV at ``path`` into a DataFrame and log its row count.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or buffer).

    Returns:
        pandas.DataFrame: The parsed table, unmodified.
    """
    frame = pd.read_csv(path)
    row_count = len(frame)
    logging.info("Loaded %d rows from %s", row_count, path)
    return frame
def prepare_data(df):
    """Validate and normalize the raw study table for analysis.

    Works on a copy, so the caller's DataFrame is left untouched.

    Steps:
        * verify the required columns are present;
        * add a ``Group`` column ('Intervention') when absent and normalize
          its spelling (stripped, title-cased);
        * map the three adherence columns to booleans (yes/no/true/false,
          case-insensitive); anything else becomes missing;
        * add ``Habits_Count`` = number of habits marked done on that row
          (missing values count as not done);
        * coerce ``Happiness`` to numeric and drop rows where that fails.

    Args:
        df: Raw DataFrame as loaded from the CSV.

    Returns:
        pandas.DataFrame: A cleaned copy of ``df``.

    Raises:
        KeyError: If any required column is missing.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    required = {'Participant_ID', 'Happiness', *habit_cols}
    missing = required - set(df.columns)
    if missing:
        # Sorted so the message is stable from run to run
        raise KeyError(f"Missing required columns: {sorted(missing)}")

    # Never modify the caller's frame in place
    df = df.copy()

    if 'Group' not in df.columns:
        df['Group'] = 'Intervention'
    df['Group'] = df['Group'].astype(str).str.strip().str.title()

    # Normalize adherence to boolean (Yes/No or True/False)
    truth_table = {'yes': True, 'no': False, 'true': True, 'false': False}
    for col in habit_cols:
        df[col] = df[col].astype(str).str.strip().str.lower().map(truth_table)

    # Count habits per row; comparing against True treats missing values as
    # "not done" without relying on fillna() downcasting object columns.
    df['Habits_Count'] = df[habit_cols].eq(True).sum(axis=1).astype(int)

    # Coerce Happiness to numeric and drop rows without Happiness
    df['Happiness'] = pd.to_numeric(df['Happiness'], errors='coerce')
    before = len(df)
    df = df.dropna(subset=['Happiness'])
    logging.info('Dropped %d rows without numeric Happiness', before - len(df))
    return df
def descriptive_stats(df):
    """Print descriptive summaries and simple correlations for the study data.

    Prints, in order: dataset shape, overall Happiness summary, per-group
    counts and means (when a ``Group`` column exists), Happiness by
    ``Habits_Count``, the Pearson correlation between ``Habits_Count`` and
    ``Happiness``, and a point-biserial correlation of each habit with
    Happiness restricted to the intervention group.

    Args:
        df: Prepared DataFrame with ``Happiness``, ``Habits_Count`` and the
            three adherence columns.
    """
    habit_cols = ['Calendar_Adherence', 'Cleanliness_Adherence', 'Punctuality_Adherence']
    has_group = 'Group' in df.columns

    print('Dataset shape:', df.shape)
    print('\nOverall summary:')
    print(df['Happiness'].describe())
    if has_group:
        print('\nRows by group:')
        print(df['Group'].value_counts())
        print('\nAverage happiness by group:')
        print(df.groupby('Group')['Happiness'].agg(['mean', 'count', 'std']).round(3))
    print('\nAverage happiness by number of habits completed:')
    print(df.groupby('Habits_Count')['Happiness'].agg(['mean', 'count', 'std']).round(3))
    print('\nMedian happiness by habits:')
    print(df.groupby('Habits_Count')['Happiness'].median())

    # Correlations
    print('\nPearson correlation between Habits_Count and Happiness:')
    print(df[['Habits_Count', 'Happiness']].corr().round(3))
    print('\nPoint-biserial correlation (each habit vs happiness, intervention group only):')
    habit_df = df[df['Group'] == 'Intervention'] if has_group else df
    for habit in habit_cols:
        observed = habit_df[habit].notna()
        if not observed.any():
            print(f'{habit:22} (no data)')
            continue
        flags = habit_df.loc[observed, habit].astype(int)
        scores = habit_df.loc[observed, 'Happiness']
        # pointbiserialr raises on fewer than two observations and is
        # undefined when either variable is constant, so report that instead.
        if len(flags) < 2 or flags.nunique() < 2 or scores.nunique() < 2:
            print(f'{habit:22} (not enough variation to correlate)')
            continue
        r, p = stats.pointbiserialr(flags, scores)
        print(f"{habit:22} r = {r:.3f} p = {p:.4f}")
def cohen_d(x, y):
    """Return Cohen's d for two independent samples (x minus y).

    The difference in means is standardized by the pooled sample standard
    deviation (ddof=1).

    Args:
        x: 1-D array-like of numeric observations (Series, ndarray or list).
        y: 1-D array-like of numeric observations.

    Returns:
        float: The effect size. NaN when either sample has fewer than two
        non-missing observations; +/-inf (or NaN when the means are also
        equal) when the pooled standard deviation is zero.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # Drop missing values first so the sample sizes agree with the variances
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        # A sample variance needs at least two observations per group
        return float('nan')

    dof = nx + ny - 2
    pooled_sd = np.sqrt(((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof)
    mean_diff = x.mean() - y.mean()
    if pooled_sd == 0:
        # Same values a float division would give, without runtime warnings
        if mean_diff == 0:
            return float('nan')
        return float(np.copysign(np.inf, mean_diff))
    return float(mean_diff / pooled_sd)
def run_ols(df):
    """Fit an OLS regression of Happiness on Habits_Count and print its summary.

    With a ``Group`` column the model is fitted through the formula interface
    with Group as a categorical covariate; without one, a plain
    intercept-plus-slope model is fitted.

    Args:
        df: Prepared DataFrame containing ``Happiness`` and ``Habits_Count``.

    Returns:
        The fitted statsmodels results object.
    """
    if 'Group' not in df.columns:
        design = sm.add_constant(df['Habits_Count'])
        fitted = sm.OLS(df['Happiness'], design).fit()
        heading = '\nSimple OLS regression: Happiness ~ Habits_Count'
    else:
        fitted = smf.ols('Happiness ~ Habits_Count + C(Group)', data=df).fit()
        heading = '\nOLS regression: Happiness ~ Habits_Count + Group'
    print(heading)
    print(fitted.summary())
    return fitted
def run_mixedlm(df):
    """Fit a mixed-effects model with a random intercept per participant.

    The model is ``Happiness ~ Habits_Count`` grouped by ``Participant_ID``
    and fitted by maximum likelihood (``reml=False``). Any failure while
    building, fitting or printing is logged as a warning, not raised.

    Args:
        df: Prepared DataFrame.

    Returns:
        The fitted results object, or ``None`` when the model failed.
    """
    try:
        spec = smf.mixedlm('Happiness ~ Habits_Count', data=df, groups=df['Participant_ID'])
        result = spec.fit(reml=False)
        print('\nMixed-effects model (random intercept by Participant_ID):')
        print(result.summary())
    except Exception as err:
        logging.warning('MixedLM failed: %s', err)
        return None
    return result
def make_plots(df, outdir, show_plots=False):
    """Save the exploratory figures for the study as PNG files in ``outdir``.

    Draws a boxplot and a violin plot of Happiness by ``Habits_Count``, a
    sorted bar chart of each participant's mean Happiness, a per-group mean
    bar chart (when ``Group`` exists) and a scatter plot with a linear fit.

    Args:
        df: Prepared DataFrame (``Participant_ID``, ``Happiness``,
            ``Habits_Count``, optionally ``Group``).
        outdir: Directory for the PNG files; created if missing.
        show_plots: When True, also display each figure interactively.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    sns.set_style('whitegrid')

    def finish_plot(filename):
        """Lay out, save, optionally show, and close the current figure."""
        plt.tight_layout()
        plt.savefig(outdir / filename)
        if show_plots:
            plt.show()
        plt.close()

    # Boxplot by Habits_Count
    plt.figure(figsize=(9, 6))
    sns.boxplot(data=df, x='Habits_Count', y='Happiness', color='#4C72B0')
    plt.title('Daily Happiness by Number of Habits Completed')
    plt.xlabel('Number of habits followed (0-3)')
    plt.ylabel('Happiness (1-10)')
    finish_plot('happiness_by_habits_box.png')

    # Violin with jittered raw points
    plt.figure(figsize=(9, 6))
    sns.violinplot(data=df, x='Habits_Count', y='Happiness', inner=None, color='#55A868')
    sns.stripplot(x='Habits_Count', y='Happiness', data=df, color='k', alpha=0.3, jitter=0.15)
    plt.title('Happiness distribution by Habits Completed')
    finish_plot('happiness_by_habits_violin.png')

    # Participant average bar
    participant_avg = df.groupby('Participant_ID')['Happiness'].mean().sort_values()
    positions = range(len(participant_avg))
    plt.figure(figsize=(12, 5))
    sns.barplot(x=positions, y=participant_avg.values, color='#C44E52')
    # Dashed line marks the overall mean for reference
    plt.axhline(df['Happiness'].mean(), color='black', linestyle='--', alpha=0.6)
    plt.xticks(positions, participant_avg.index, rotation=45)
    plt.title('Average Happiness per Participant (sorted)')
    finish_plot('participant_avg_happiness.png')

    # Mean happiness per group
    if 'Group' in df.columns:
        plt.figure(figsize=(7, 5))
        sns.barplot(data=df, x='Group', y='Happiness', estimator='mean', errorbar='sd', color='#8172B2')
        plt.title('Mean Happiness by Group')
        plt.ylabel('Average happiness')
        finish_plot('happiness_by_group.png')

    # Scatter with linear fit
    plt.figure(figsize=(9, 6))
    if 'Group' in df.columns:
        sns.scatterplot(data=df, x='Habits_Count', y='Happiness', hue='Group', alpha=0.35)
        # Overlay the fit the title promises.
        # NOTE(review): the line is fitted on all rows pooled across groups.
        sns.regplot(data=df, x='Habits_Count', y='Happiness', scatter=False, color='black')
    else:
        sns.regplot(x='Habits_Count', y='Happiness', data=df, x_jitter=0.18, scatter_kws={'alpha': 0.4})
    plt.title('Happiness vs Number of Habits Completed (with linear fit)')
    finish_plot('happiness_vs_habits_regression.png')

    logging.info('Saved plots to %s', outdir)
def main(args):
    """Run the whole analysis: load, clean, describe, model, and plot.

    Args:
        args: Parsed command-line namespace providing ``data`` (CSV path),
            ``outdir`` (plot directory) and ``show`` (display plots flag).
    """
    df = prepare_data(load_data(args.data))
    descriptive_stats(df)

    # Effect sizes
    happiness = df['Happiness']
    no_habits = happiness[df['Habits_Count'] == 0]
    all_habits = happiness[df['Habits_Count'] == 3]
    # NOTE(review): this comparison uses every row, so any control-group rows
    # with a habit count of 0 are pooled in too - confirm that is intended.
    if min(len(no_habits), len(all_habits)) > 1:
        d = cohen_d(all_habits, no_habits)
        print(f"\nCohen's d (3 habits vs 0 habits) = {d:.3f}")

    if 'Group' in df.columns:
        control = happiness[df['Group'] == 'Control']
        intervention = happiness[df['Group'] == 'Intervention']
        if min(len(control), len(intervention)) > 1:
            d_group = cohen_d(intervention, control)
            print(f"Cohen's d (Intervention vs Control happiness) = {d_group:.3f}")

    # Models
    run_ols(df)
    run_mixedlm(df)

    # Plots
    make_plots(df, args.outdir, show_plots=args.show)
if __name__ == '__main__':
    # Script entry point: declare the command-line options, parse them and
    # hand the resulting namespace to main().
    parser = argparse.ArgumentParser(
        description='Improved data analysis for organization_happiness_study_data.csv'
    )
    for flag, spec in (
        ('--data', {'type': str, 'default': 'organization_happiness_study_data.csv', 'help': 'CSV data path'}),
        ('--outdir', {'type': str, 'default': 'plots', 'help': 'Directory to save plots'}),
        ('--show', {'action': 'store_true', 'help': 'Show plots interactively'}),
    ):
        parser.add_argument(flag, **spec)
    args = parser.parse_args()
    main(args)