import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv(r"C:\Users\arici\OneDrive\Desktop\Data Science\ds-studio-ii\Project 1\instagram_usage_lifestyle.csv")

df.head()

df.columns

Index(['user_id', 'app_name', 'age', 'gender', 'country', 'urban_rural',
       'income_level', 'employment_status', 'education_level',
       'relationship_status', 'has_children', 'exercise_hours_per_week',
       'sleep_hours_per_night', 'diet_quality', 'smoking', 'alcohol_frequency',
       'perceived_stress_score', 'self_reported_happiness', 'body_mass_index',
       'blood_pressure_systolic', 'blood_pressure_diastolic',
       'daily_steps_count', 'weekly_work_hours', 'hobbies_count',
       'social_events_per_month', 'books_read_per_year',
       'volunteer_hours_per_month', 'travel_frequency_per_year',
       'daily_active_minutes_instagram', 'sessions_per_day',
       'posts_created_per_week', 'reels_watched_per_day',
       'stories_viewed_per_day', 'likes_given_per_day',
       'comments_written_per_day', 'dms_sent_per_week',
       'dms_received_per_week', 'ads_viewed_per_day', 'ads_clicked_per_day',
       'time_on_feed_per_day', 'time_on_explore_per_day',
       'time_on_messages_per_day', 'time_on_reels_per_day', 'followers_count',
       'following_count', 'uses_premium_features',
       'notification_response_rate', 'account_creation_year',
       'last_login_date', 'average_session_length_minutes',
       'content_type_preference', 'preferred_content_theme',
       'privacy_setting_level', 'two_factor_auth_enabled',
       'biometric_login_used', 'linked_accounts_count', 'subscription_status',
       'user_engagement_score'],
      dtype='object')

#Delete unneccsary columuns
df = df.drop(columns=["app_name","urban_rural", 'education_level', 'smoking', 'alcohol_frequency', 'body_mass_index',
       'blood_pressure_systolic', 'blood_pressure_diastolic', 'daily_steps_count', 
       'social_events_per_month', 'books_read_per_year','volunteer_hours_per_month', 
       'travel_frequency_per_year', 'sessions_per_day','posts_created_per_week', 
       'reels_watched_per_day', 'stories_viewed_per_day', 'likes_given_per_day',
       'comments_written_per_day', 'dms_sent_per_week','dms_received_per_week', 
       'ads_viewed_per_day', 'ads_clicked_per_day','time_on_feed_per_day', 
       'time_on_explore_per_day', 'time_on_messages_per_day', 'time_on_reels_per_day', 
       'followers_count', 'following_count', 'uses_premium_features',
       'notification_response_rate', 'account_creation_year', 'last_login_date', 
       'average_session_length_minutes', 'content_type_preference', 'preferred_content_theme',
       'privacy_setting_level', 'two_factor_auth_enabled', 'biometric_login_used', 
       'linked_accounts_count', 'subscription_status', 'user_engagement_score',
       'country', 'diet_quality', 'user_id'])

df.columns

Index(['age', 'gender', 'income_level', 'employment_status',
       'relationship_status', 'has_children', 'exercise_hours_per_week',
       'sleep_hours_per_night', 'perceived_stress_score',
       'self_reported_happiness', 'weekly_work_hours', 'hobbies_count',
       'daily_active_minutes_instagram'],
      dtype='object')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1547896 entries, 0 to 1547895
Data columns (total 13 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   age                             1547896 non-null  int64  
 1   gender                          1547896 non-null  object 
 2   income_level                    1547896 non-null  object 
 3   employment_status               1547896 non-null  object 
 4   relationship_status             1547896 non-null  object 
 5   has_children                    1547896 non-null  object 
 6   exercise_hours_per_week         1547896 non-null  float64
 7   sleep_hours_per_night           1547896 non-null  float64
 8   perceived_stress_score          1547896 non-null  int64  
 9   self_reported_happiness         1547896 non-null  int64  
 10  weekly_work_hours               1547896 non-null  float64
 11  hobbies_count                   1547896 non-null  int64  
 12  daily_active_minutes_instagram  1547896 non-null  float64
dtypes: float64(4), int64(4), object(5)
memory usage: 153.5+ MB

#Getting rid of duplicates
df[['age', 'gender', 'income_level', 'employment_status',
       'relationship_status', 'has_children', 'exercise_hours_per_week',
       'sleep_hours_per_night', 'perceived_stress_score',
       'self_reported_happiness', 'weekly_work_hours', 'hobbies_count',
       'daily_active_minutes_instagram']].duplicated(keep=False).sum()

np.int64(0)

#There are no duplicates!?
duplicates = df[df[['age', 'gender', 'income_level', 'employment_status',
       'relationship_status', 'has_children', 'exercise_hours_per_week',
       'sleep_hours_per_night', 'perceived_stress_score',
       'self_reported_happiness', 'weekly_work_hours', 'hobbies_count',
       'daily_active_minutes_instagram']].duplicated(keep=False)]
duplicates.sort_values(by='age')

#Check for missing data
df.isnull().sum()

age                               0
gender                            0
income_level                      0
employment_status                 0
relationship_status               0
has_children                      0
exercise_hours_per_week           0
sleep_hours_per_night             0
perceived_stress_score            0
self_reported_happiness           0
weekly_work_hours                 0
hobbies_count                     0
daily_active_minutes_instagram    0
dtype: int64

#Sort data based off how much 
dfs = df.sort_values(by="daily_active_minutes_instagram")
print(dfs)

         age  gender  income_level   employment_status relationship_status  \
12        36  Female        Middle  Full-time employed              Single   
434555    34    Male        Middle  Full-time employed            Divorced   
434539    61  Female          High          Freelancer             Married   
1547866   17    Male           Low  Full-time employed   In a relationship   
1547863   28  Female  Lower-middle          Freelancer   In a relationship   
...      ...     ...           ...                 ...                 ...   
993357    18    Male          High  Full-time employed             Married   
116353    15    Male        Middle             Student              Single   
1141354   13    Male          High          Freelancer              Single   
1281062   21  Female        Middle          Freelancer   In a relationship   
253795    22    Male  Lower-middle             Student   In a relationship   

        has_children  exercise_hours_per_week  sleep_hours_per_night  \
12               Yes                     10.1                    6.6   
434555           Yes                      3.1                    8.2   
434539            No                      4.3                    7.7   
1547866           No                      3.7                    8.1   
1547863           No                      7.2                    6.0   
...              ...                      ...                    ...   
993357            No                      6.8                    8.2   
116353            No                      7.3                    7.5   
1141354          Yes                      2.1                    9.4   
1281062           No                      6.5                    5.8   
253795           Yes                      8.2                    5.7   

         perceived_stress_score  self_reported_happiness  weekly_work_hours  \
12                            7                       10               34.7   
434555                        3                        8               43.9   
434539                        4                        6               27.0   
1547866                       5                       10               44.9   
1547863                       0                        9               21.5   
...                         ...                      ...                ...   
993357                       37                        1               37.2   
116353                       37                        2               19.7   
1141354                      40                        1               29.3   
1281062                      40                        2               30.9   
253795                       40                        1               45.1   

         hobbies_count  daily_active_minutes_instagram  
12                   4                             5.0  
434555               3                             5.0  
434539               4                             5.0  
1547866              3                             5.0  
1547863              4                             5.0  
...                ...                             ...  
993357               2                           565.0  
116353               3                           565.0  
1141354              4                           572.0  
1281062              2                           578.0  
253795               5                           580.0  

[1547896 rows x 13 columns]

#Sanity check
dfs.describe()

#Making a scatterplot
scatterplot = sns.scatterplot(
    data=df,
    y='daily_active_minutes_instagram',
    x='age'
)

##Makin a hexbin graph with code help from Google Gemini
plt.figure(figsize=(10, 7))

hb = plt.hexbin(
    df['age'], 
    df['daily_active_minutes_instagram'], 
    gridsize=(40, 25),     
    cmap='viridis', 
    bins='log',          
    mincnt=1 
)

plt.colorbar(hb, label='log10(count)')
plt.xlabel('Age')
plt.ylabel('Daily Active Minutes')
plt.title('Activity Density by Age')

plt.show()

corr = dfs[['daily_active_minutes_instagram',
'perceived_stress_score',
'self_reported_happiness',
'exercise_hours_per_week']].corr()

sns.heatmap(corr, annot=True)

<Axes: >

#Scatter plot with regression line. Dataset is large so we sample first
sample_df = dfs.sample(30000)

sns.regplot(
    x="daily_active_minutes_instagram",
    y='self_reported_happiness',
    data=sample_df,
    scatter_kws={'s':2, 'alpha':0.05},
    line_kws={'color':'red'}
)
plt.show

<function matplotlib.pyplot.show(close=None, block=None)>

#Repeat with stress score instead of self percieved happiness
#Scatter plot with regression line. Dataset is large so we sample first
sample_df = dfs.sample(30000)

sns.regplot(
    x="daily_active_minutes_instagram",
    y='perceived_stress_score',
    data=sample_df,
    scatter_kws={'s':2, 'alpha':0.05},
    line_kws={'color':'red'}
)
plt.show

<function matplotlib.pyplot.show(close=None, block=None)>

	age	exercise_hours_per_week	sleep_hours_per_night	perceived_stress_score	self_reported_happiness	weekly_work_hours	hobbies_count	daily_active_minutes_instagram
count	1.547896e+06	1.547896e+06	1.547896e+06	1.547896e+06	1.547896e+06	1.547896e+06	1.547896e+06	1.547896e+06
mean	3.898544e+01	7.143480e+00	6.999384e+00	1.998774e+01	5.499804e+00	4.000145e+01	2.996140e+00	1.882298e+02
std	1.529453e+01	3.994556e+00	1.097098e+00	1.183258e+01	2.871232e+00	9.997320e+00	1.730508e+00	1.101278e+02
min	1.300000e+01	0.000000e+00	3.000000e+00	0.000000e+00	1.000000e+00	0.000000e+00	0.000000e+00	5.000000e+00
25%	2.600000e+01	4.000000e+00	6.300000e+00	1.000000e+01	3.000000e+00	3.320000e+01	2.000000e+00	1.010000e+02
50%	3.900000e+01	6.600000e+00	7.000000e+00	2.000000e+01	5.000000e+00	4.000000e+01	3.000000e+00	1.860000e+02
75%	5.200000e+01	9.700000e+00	7.700000e+00	3.000000e+01	8.000000e+00	4.680000e+01	4.000000e+00	2.710000e+02
max	6.500000e+01	2.410000e+01	1.000000e+01	4.000000e+01	1.000000e+01	8.000000e+01	1.000000e+01	5.800000e+02

Cleaning:¶

Now for the Visualizations, starting with tracking trends in columns using time as the x variable¶

Let's find strong relationships using a correlation heatmap¶

	user_id	app_name	age	gender	country	urban_rural	income_level	employment_status	education_level	relationship_status	...	last_login_date	average_session_length_minutes	content_type_preference	preferred_content_theme	privacy_setting_level	two_factor_auth_enabled	biometric_login_used	linked_accounts_count	subscription_status	user_engagement_score
0	1	Instagram	51	Female	India	Rural	High	Retired	Bachelor’s	Single	...	2025-11-02	5.0	Mixed	Tech	Private	Yes	No	0	Free	7.83
1	2	Instagram	64	Female	United Kingdom	Urban	Middle	Full-time employed	Other	Divorced	...	2025-03-22	14.8	Photos	Fashion	Public	No	No	3	Free	1.43
2	3	Instagram	41	Female	Canada	Urban	Middle	Student	Bachelor’s	In a relationship	...	2025-08-10	5.0	Mixed	Other	Public	Yes	Yes	1	Free	9.67
3	4	Instagram	27	Non-binary	South Korea	Urban	Middle	Unemployed	Master’s	In a relationship	...	2025-03-31	25.9	Stories	Tech	Private	No	No	1	Free	0.94
4	5	Instagram	55	Male	India	Urban	Upper-middle	Full-time employed	Bachelor’s	Single	...	2025-03-19	13.1	Videos	Food	Public	Yes	No	0	Free	1.03