In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
In [4]:
df = pd.read_csv(r"C:\Users\arici\OneDrive\Desktop\Data Science\ds-studio-ii\Project 1\instagram_usage_lifestyle.csv")
In [5]:
df.head()
Out[5]:
| user_id | app_name | age | gender | country | urban_rural | income_level | employment_status | education_level | relationship_status | ... | last_login_date | average_session_length_minutes | content_type_preference | preferred_content_theme | privacy_setting_level | two_factor_auth_enabled | biometric_login_used | linked_accounts_count | subscription_status | user_engagement_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 51 | Female | India | Rural | High | Retired | Bachelor’s | Single | ... | 2025-11-02 | 5.0 | Mixed | Tech | Private | Yes | No | 0 | Free | 7.83 | |
| 1 | 2 | 64 | Female | United Kingdom | Urban | Middle | Full-time employed | Other | Divorced | ... | 2025-03-22 | 14.8 | Photos | Fashion | Public | No | No | 3 | Free | 1.43 | |
| 2 | 3 | 41 | Female | Canada | Urban | Middle | Student | Bachelor’s | In a relationship | ... | 2025-08-10 | 5.0 | Mixed | Other | Public | Yes | Yes | 1 | Free | 9.67 | |
| 3 | 4 | 27 | Non-binary | South Korea | Urban | Middle | Unemployed | Master’s | In a relationship | ... | 2025-03-31 | 25.9 | Stories | Tech | Private | No | No | 1 | Free | 0.94 | |
| 4 | 5 | 55 | Male | India | Urban | Upper-middle | Full-time employed | Bachelor’s | Single | ... | 2025-03-19 | 13.1 | Videos | Food | Public | Yes | No | 0 | Free | 1.03 |
5 rows × 58 columns
In [6]:
df.columns
Out[6]:
Index(['user_id', 'app_name', 'age', 'gender', 'country', 'urban_rural',
'income_level', 'employment_status', 'education_level',
'relationship_status', 'has_children', 'exercise_hours_per_week',
'sleep_hours_per_night', 'diet_quality', 'smoking', 'alcohol_frequency',
'perceived_stress_score', 'self_reported_happiness', 'body_mass_index',
'blood_pressure_systolic', 'blood_pressure_diastolic',
'daily_steps_count', 'weekly_work_hours', 'hobbies_count',
'social_events_per_month', 'books_read_per_year',
'volunteer_hours_per_month', 'travel_frequency_per_year',
'daily_active_minutes_instagram', 'sessions_per_day',
'posts_created_per_week', 'reels_watched_per_day',
'stories_viewed_per_day', 'likes_given_per_day',
'comments_written_per_day', 'dms_sent_per_week',
'dms_received_per_week', 'ads_viewed_per_day', 'ads_clicked_per_day',
'time_on_feed_per_day', 'time_on_explore_per_day',
'time_on_messages_per_day', 'time_on_reels_per_day', 'followers_count',
'following_count', 'uses_premium_features',
'notification_response_rate', 'account_creation_year',
'last_login_date', 'average_session_length_minutes',
'content_type_preference', 'preferred_content_theme',
'privacy_setting_level', 'two_factor_auth_enabled',
'biometric_login_used', 'linked_accounts_count', 'subscription_status',
'user_engagement_score'],
dtype='object')
In [7]:
#Delete unneccsary columuns
df = df.drop(columns=["app_name","urban_rural", 'education_level', 'smoking', 'alcohol_frequency', 'body_mass_index',
'blood_pressure_systolic', 'blood_pressure_diastolic', 'daily_steps_count',
'social_events_per_month', 'books_read_per_year','volunteer_hours_per_month',
'travel_frequency_per_year', 'sessions_per_day','posts_created_per_week',
'reels_watched_per_day', 'stories_viewed_per_day', 'likes_given_per_day',
'comments_written_per_day', 'dms_sent_per_week','dms_received_per_week',
'ads_viewed_per_day', 'ads_clicked_per_day','time_on_feed_per_day',
'time_on_explore_per_day', 'time_on_messages_per_day', 'time_on_reels_per_day',
'followers_count', 'following_count', 'uses_premium_features',
'notification_response_rate', 'account_creation_year', 'last_login_date',
'average_session_length_minutes', 'content_type_preference', 'preferred_content_theme',
'privacy_setting_level', 'two_factor_auth_enabled', 'biometric_login_used',
'linked_accounts_count', 'subscription_status', 'user_engagement_score',
'country', 'diet_quality', 'user_id'])
In [8]:
df.columns
Out[8]:
Index(['age', 'gender', 'income_level', 'employment_status',
'relationship_status', 'has_children', 'exercise_hours_per_week',
'sleep_hours_per_night', 'perceived_stress_score',
'self_reported_happiness', 'weekly_work_hours', 'hobbies_count',
'daily_active_minutes_instagram'],
dtype='object')
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1547896 entries, 0 to 1547895 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1547896 non-null int64 1 gender 1547896 non-null object 2 income_level 1547896 non-null object 3 employment_status 1547896 non-null object 4 relationship_status 1547896 non-null object 5 has_children 1547896 non-null object 6 exercise_hours_per_week 1547896 non-null float64 7 sleep_hours_per_night 1547896 non-null float64 8 perceived_stress_score 1547896 non-null int64 9 self_reported_happiness 1547896 non-null int64 10 weekly_work_hours 1547896 non-null float64 11 hobbies_count 1547896 non-null int64 12 daily_active_minutes_instagram 1547896 non-null float64 dtypes: float64(4), int64(4), object(5) memory usage: 153.5+ MB
Cleaning:¶
- Get rid of duplicate sets
- Check for null entries/missing data
- if no names, no identifier, whad do we do about that?
- Determine keys: What's the highest score per column, what's the lowest
In [10]:
#Getting rid of duplicates
df[['age', 'gender', 'income_level', 'employment_status',
'relationship_status', 'has_children', 'exercise_hours_per_week',
'sleep_hours_per_night', 'perceived_stress_score',
'self_reported_happiness', 'weekly_work_hours', 'hobbies_count',
'daily_active_minutes_instagram']].duplicated(keep=False).sum()
Out[10]:
np.int64(0)
In [11]:
#There are no duplicates!?
duplicates = df[df[['age', 'gender', 'income_level', 'employment_status',
'relationship_status', 'has_children', 'exercise_hours_per_week',
'sleep_hours_per_night', 'perceived_stress_score',
'self_reported_happiness', 'weekly_work_hours', 'hobbies_count',
'daily_active_minutes_instagram']].duplicated(keep=False)]
duplicates.sort_values(by='age')
Out[11]:
| age | gender | income_level | employment_status | relationship_status | has_children | exercise_hours_per_week | sleep_hours_per_night | perceived_stress_score | self_reported_happiness | weekly_work_hours | hobbies_count | daily_active_minutes_instagram |
|---|
In [12]:
#Check for missing data
df.isnull().sum()
Out[12]:
age 0 gender 0 income_level 0 employment_status 0 relationship_status 0 has_children 0 exercise_hours_per_week 0 sleep_hours_per_night 0 perceived_stress_score 0 self_reported_happiness 0 weekly_work_hours 0 hobbies_count 0 daily_active_minutes_instagram 0 dtype: int64
In [13]:
#Sort data based off how much
dfs = df.sort_values(by="daily_active_minutes_instagram")
print(dfs)
age gender income_level employment_status relationship_status \
12 36 Female Middle Full-time employed Single
434555 34 Male Middle Full-time employed Divorced
434539 61 Female High Freelancer Married
1547866 17 Male Low Full-time employed In a relationship
1547863 28 Female Lower-middle Freelancer In a relationship
... ... ... ... ... ...
993357 18 Male High Full-time employed Married
116353 15 Male Middle Student Single
1141354 13 Male High Freelancer Single
1281062 21 Female Middle Freelancer In a relationship
253795 22 Male Lower-middle Student In a relationship
has_children exercise_hours_per_week sleep_hours_per_night \
12 Yes 10.1 6.6
434555 Yes 3.1 8.2
434539 No 4.3 7.7
1547866 No 3.7 8.1
1547863 No 7.2 6.0
... ... ... ...
993357 No 6.8 8.2
116353 No 7.3 7.5
1141354 Yes 2.1 9.4
1281062 No 6.5 5.8
253795 Yes 8.2 5.7
perceived_stress_score self_reported_happiness weekly_work_hours \
12 7 10 34.7
434555 3 8 43.9
434539 4 6 27.0
1547866 5 10 44.9
1547863 0 9 21.5
... ... ... ...
993357 37 1 37.2
116353 37 2 19.7
1141354 40 1 29.3
1281062 40 2 30.9
253795 40 1 45.1
hobbies_count daily_active_minutes_instagram
12 4 5.0
434555 3 5.0
434539 4 5.0
1547866 3 5.0
1547863 4 5.0
... ... ...
993357 2 565.0
116353 3 565.0
1141354 4 572.0
1281062 2 578.0
253795 5 580.0
[1547896 rows x 13 columns]
In [14]:
#Sanity check
dfs.describe()
Out[14]:
| age | exercise_hours_per_week | sleep_hours_per_night | perceived_stress_score | self_reported_happiness | weekly_work_hours | hobbies_count | daily_active_minutes_instagram | |
|---|---|---|---|---|---|---|---|---|
| count | 1.547896e+06 | 1.547896e+06 | 1.547896e+06 | 1.547896e+06 | 1.547896e+06 | 1.547896e+06 | 1.547896e+06 | 1.547896e+06 |
| mean | 3.898544e+01 | 7.143480e+00 | 6.999384e+00 | 1.998774e+01 | 5.499804e+00 | 4.000145e+01 | 2.996140e+00 | 1.882298e+02 |
| std | 1.529453e+01 | 3.994556e+00 | 1.097098e+00 | 1.183258e+01 | 2.871232e+00 | 9.997320e+00 | 1.730508e+00 | 1.101278e+02 |
| min | 1.300000e+01 | 0.000000e+00 | 3.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 5.000000e+00 |
| 25% | 2.600000e+01 | 4.000000e+00 | 6.300000e+00 | 1.000000e+01 | 3.000000e+00 | 3.320000e+01 | 2.000000e+00 | 1.010000e+02 |
| 50% | 3.900000e+01 | 6.600000e+00 | 7.000000e+00 | 2.000000e+01 | 5.000000e+00 | 4.000000e+01 | 3.000000e+00 | 1.860000e+02 |
| 75% | 5.200000e+01 | 9.700000e+00 | 7.700000e+00 | 3.000000e+01 | 8.000000e+00 | 4.680000e+01 | 4.000000e+00 | 2.710000e+02 |
| max | 6.500000e+01 | 2.410000e+01 | 1.000000e+01 | 4.000000e+01 | 1.000000e+01 | 8.000000e+01 | 1.000000e+01 | 5.800000e+02 |
Now for the Visualizations, starting with tracking trends in columns using time as the x variable¶
In [15]:
#Making a scatterplot
scatterplot = sns.scatterplot(
data=df,
y='daily_active_minutes_instagram',
x='age'
)
the data set is too large. Use a Hexbin
In [20]:
##Makin a hexbin graph with code help from Google Gemini
plt.figure(figsize=(10, 7))
hb = plt.hexbin(
df['age'],
df['daily_active_minutes_instagram'],
gridsize=(40, 25),
cmap='viridis',
bins='log',
mincnt=1
)
plt.colorbar(hb, label='log10(count)')
plt.xlabel('Age')
plt.ylabel('Daily Active Minutes')
plt.title('Activity Density by Age')
plt.show()
Cool looking graph but age isn't the only variable we can use and is not the best indicator for happiness in this particular data set
Let's find strong relationships using a correlation heatmap¶
In [16]:
corr = dfs[['daily_active_minutes_instagram',
'perceived_stress_score',
'self_reported_happiness',
'exercise_hours_per_week']].corr()
sns.heatmap(corr, annot=True)
Out[16]:
<Axes: >
Now let's look more accurately into daily active minutes and stress score, and self perceived happiness.
In [17]:
#Scatter plot with regression line. Dataset is large so we sample first
sample_df = dfs.sample(30000)
sns.regplot(
x="daily_active_minutes_instagram",
y='self_reported_happiness',
data=sample_df,
scatter_kws={'s':2, 'alpha':0.05},
line_kws={'color':'red'}
)
plt.show
Out[17]:
<function matplotlib.pyplot.show(close=None, block=None)>
In [18]:
#Repeat with stress score instead of self percieved happiness
#Scatter plot with regression line. Dataset is large so we sample first
sample_df = dfs.sample(30000)
sns.regplot(
x="daily_active_minutes_instagram",
y='perceived_stress_score',
data=sample_df,
scatter_kws={'s':2, 'alpha':0.05},
line_kws={'color':'red'}
)
plt.show
Out[18]:
<function matplotlib.pyplot.show(close=None, block=None)>