Predicting Product Sales from Product Features¶
1. Import libraries¶
In [22]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
2. Load the dataset¶
In [23]:
# Load the cleaned e-commerce sales dataset and preview the first rows
DATA_PATH = "cleaned_ecommerce_sales_data.csv"
df = pd.read_csv(DATA_PATH)
df.head()
Out[23]:
| product_id | product_name | category | price | review_score | review_count | sales_month_1 | sales_month_2 | sales_month_3 | sales_month_4 | sales_month_5 | sales_month_6 | sales_month_7 | sales_month_8 | sales_month_9 | sales_month_10 | sales_month_11 | sales_month_12 | annual_sales | avg_monthly_sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Product_1 | Clothing | 190.40 | 1.7 | 220 | 479 | 449 | 92 | 784 | 604 | 904 | 446 | 603 | 807 | 252 | 695 | 306 | 6421 | 535.083333 |
| 1 | 2 | Product_2 | Home & Kitchen | 475.60 | 3.2 | 903 | 21 | 989 | 861 | 863 | 524 | 128 | 610 | 436 | 176 | 294 | 772 | 353 | 6027 | 502.250000 |
| 2 | 3 | Product_3 | Toys | 367.34 | 4.5 | 163 | 348 | 558 | 567 | 143 | 771 | 409 | 290 | 828 | 340 | 667 | 267 | 392 | 5580 | 465.000000 |
| 3 | 4 | Product_4 | Toys | 301.34 | 3.9 | 951 | 725 | 678 | 59 | 15 | 937 | 421 | 670 | 933 | 56 | 157 | 168 | 203 | 5022 | 418.500000 |
| 4 | 5 | Product_5 | Books | 82.23 | 4.2 | 220 | 682 | 451 | 649 | 301 | 620 | 293 | 411 | 258 | 854 | 548 | 770 | 257 | 6094 | 507.833333 |
3. Inspect and clean the data¶
In [24]:
# Dataset dimensions: (rows, columns)
df.shape
Out[24]:
(1000, 20)
In [25]:
# List every column name to confirm the expected schema
df.columns.tolist()
Out[25]:
['product_id', 'product_name', 'category', 'price', 'review_score', 'review_count', 'sales_month_1', 'sales_month_2', 'sales_month_3', 'sales_month_4', 'sales_month_5', 'sales_month_6', 'sales_month_7', 'sales_month_8', 'sales_month_9', 'sales_month_10', 'sales_month_11', 'sales_month_12', 'annual_sales', 'avg_monthly_sales']
In [26]:
# Column dtypes — numeric features should be int64/float64, identifiers object
df.dtypes
Out[26]:
product_id int64 product_name object category object price float64 review_score float64 review_count int64 sales_month_1 int64 sales_month_2 int64 sales_month_3 int64 sales_month_4 int64 sales_month_5 int64 sales_month_6 int64 sales_month_7 int64 sales_month_8 int64 sales_month_9 int64 sales_month_10 int64 sales_month_11 int64 sales_month_12 int64 annual_sales int64 avg_monthly_sales float64 dtype: object
In [27]:
# Count missing values per column (all zeros means no imputation is strictly needed)
df.isna().sum()
Out[27]:
product_id 0 product_name 0 category 0 price 0 review_score 0 review_count 0 sales_month_1 0 sales_month_2 0 sales_month_3 0 sales_month_4 0 sales_month_5 0 sales_month_6 0 sales_month_7 0 sales_month_8 0 sales_month_9 0 sales_month_10 0 sales_month_11 0 sales_month_12 0 annual_sales 0 avg_monthly_sales 0 dtype: int64
The dataset has no missing values in any column.
In [28]:
# Summary statistics for the key numeric columns.
# Use the frame itself as the cell's last expression so Jupyter renders a
# rich HTML table instead of the wrapped plain-text output that print() gives.
df[["price", "review_score", "review_count", "annual_sales", "avg_monthly_sales"]].describe()
price review_score review_count annual_sales \
count 1000.000000 1000.000000 1000.000000 1000.000000
mean 247.677130 3.027600 526.506000 6019.912000
std 144.607983 1.171243 282.269932 992.273787
min 7.290000 1.000000 1.000000 2972.000000
25% 121.810000 2.000000 283.750000 5392.000000
50% 250.920000 3.100000 543.000000 5992.000000
75% 373.435000 4.000000 772.000000 6680.000000
max 499.860000 5.000000 999.000000 9151.000000
avg_monthly_sales
count 1000.000000
mean 501.659333
std 82.689482
min 247.666667
25% 449.333333
50% 499.333333
75% 556.666667
max 762.583333
4. Exploratory data analysis¶
Checking for correlations among the numeric features and the sales targets.
In [29]:
# Pairwise Pearson correlations between the numeric features and the targets.
numeric_cols = ["price", "review_score", "review_count", "annual_sales", "avg_monthly_sales"]
corr_matrix = df[numeric_cols].corr(numeric_only=True)
corr_matrix
Out[29]:
| price | review_score | review_count | annual_sales | avg_monthly_sales | |
|---|---|---|---|---|---|
| price | 1.000000 | 0.028960 | 0.042189 | -0.015978 | -0.015978 |
| review_score | 0.028960 | 1.000000 | 0.027351 | -0.018186 | -0.018186 |
| review_count | 0.042189 | 0.027351 | 1.000000 | -0.069393 | -0.069393 |
| annual_sales | -0.015978 | -0.018186 | -0.069393 | 1.000000 | 1.000000 |
| avg_monthly_sales | -0.015978 | -0.018186 | -0.069393 | 1.000000 | 1.000000 |
In [30]:
# Distribution of the target variable — explicit Axes interface
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df["annual_sales"], bins=30, edgecolor="black")
ax.set_title("Distribution of Annual Sales")
ax.set_xlabel("Annual Sales")
ax.set_ylabel("Number of Products")
fig.tight_layout()
plt.show()
In [31]:
# Scatter of review volume against the target — explicit Axes interface
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df["review_count"], df["annual_sales"], alpha=0.7)
ax.set_title("Review Count vs Annual Sales")
ax.set_xlabel("Review Count")
ax.set_ylabel("Annual Sales")
fig.tight_layout()
plt.show()
In [32]:
# Heatmap of the correlation matrix — pass the Axes explicitly to seaborn
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
fig.tight_layout()
plt.show()
5. Prepare for machine learning¶
The target variable is annual_sales. The predictor variables are price, review_score, review_count, and category.
In [33]:
# Features, target, train/test split, and preprocessing definition.
feature_cols = ["price", "review_score", "review_count", "category"]
target_col = "annual_sales"

X = df[feature_cols]
y = df[target_col]

# Hold out 20% of products for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

numeric_features = ["price", "review_score", "review_count"]
categorical_features = ["category"]

# Numeric: median-impute then standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical: mode-impute then one-hot encode; ignore categories unseen in training.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)
6. Train and evaluate machine learning models¶
In [ ]:
# Candidate regressors, all evaluated through the same preprocessing pipeline.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
    ),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
}


def fit_and_score(name, estimator):
    """Fit preprocessing + estimator on the train split and score on the test split.

    Returns the fitted pipeline and a dict of evaluation metrics (MAE, RMSE, R2).
    """
    pipe = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("model", estimator),
        ]
    )
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    metrics = {
        "model": name,
        "MAE": mean_absolute_error(y_test, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "R2": r2_score(y_test, y_pred),
    }
    return pipe, metrics


results = []
fitted_pipelines = {}
for model_name, model in models.items():
    pipeline, row = fit_and_score(model_name, model)
    fitted_pipelines[model_name] = pipeline
    results.append(row)

# Rank models best-first by R2 on the held-out test set.
results_df = pd.DataFrame(results).sort_values("R2", ascending=False)
results_df
Out[ ]:
| model | MAE | RMSE | R2 | |
|---|---|---|---|---|
| 0 | Linear Regression | 777.932688 | 956.896757 | -0.006772 |
| 1 | Random Forest | 830.719017 | 1038.505538 | -0.185820 |
| 2 | Gradient Boosting | 845.317601 | 1048.366438 | -0.208446 |
In [ ]:
#Grab best result from list
best_model_name = results_df.iloc[0]["model"]
best_pipeline = fitted_pipelines[best_model_name]
best_preds = best_pipeline.predict(X_test)
print("Best model based on R2:", best_model_name)
print(results_df.iloc[0])
Best model based on R2: Linear Regression model Linear Regression MAE 777.932688 RMSE 956.896757 R2 -0.006772 Name: 0, dtype: object
In [36]:
# Actual vs predicted for the best model; the dashed y = x line marks perfect prediction.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_test, best_preds, alpha=0.7)
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], linestyle="--")
ax.set_title(f"Actual vs Predicted Annual Sales ({best_model_name})")
ax.set_xlabel("Actual Annual Sales")
ax.set_ylabel("Predicted Annual Sales")
fig.tight_layout()
plt.show()
7. Interpret the best model¶
In [37]:
# Pull the fitted preprocessing step and estimator out of the best pipeline.
# Use fresh names so this cell does not clobber the unfitted `preprocessor`
# defined earlier (re-running cells out of order would otherwise break).
fitted_preprocessor = best_pipeline.named_steps["preprocessor"]
best_model = best_pipeline.named_steps["model"]
feature_names = fitted_preprocessor.get_feature_names_out()

if hasattr(best_model, "feature_importances_"):
    # Tree ensembles expose impurity-based importances.
    interpretation_df = pd.DataFrame(
        {"feature": feature_names, "importance": best_model.feature_importances_}
    ).sort_values("importance", ascending=False)
elif hasattr(best_model, "coef_"):
    # Linear models: rank coefficients by absolute magnitude.
    interpretation_df = pd.DataFrame(
        {"feature": feature_names, "coefficient": best_model.coef_}
    ).sort_values("coefficient", key=np.abs, ascending=False)
else:
    # Model exposes neither importances nor coefficients.
    interpretation_df = pd.DataFrame()

# Only the cell's last expression is displayed — the original mid-branch
# `interpretation_df.head(10)` calls were silent no-ops and are removed.
interpretation_df
In [38]:
def plot_top10(data, value_col, title, xlabel):
    """Horizontal bar chart of the 10 largest rows of `data` by `value_col`.

    Rows are re-sorted ascending so the biggest bar appears on top.
    """
    top10 = data.head(10).sort_values(value_col)
    plt.figure(figsize=(9, 5))
    plt.barh(top10["feature"], top10[value_col])
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()


# The two original branches were copy-pasted except for the column name and
# labels — one parameterized helper replaces both.
if "importance" in interpretation_df.columns:
    plot_top10(
        interpretation_df,
        "importance",
        f"Top 10 Most Important Features ({best_model_name})",
        "Importance",
    )
elif "coefficient" in interpretation_df.columns:
    plot_top10(
        interpretation_df,
        "coefficient",
        f"Top 10 Coefficients by Magnitude ({best_model_name})",
        "Coefficient",
    )
8. Conclusion¶
The models in this notebook did not predict annual sales well. Even the best model had an R² value below 0, which suggests that price, review score, review count, and category are weak predictors of annual sales in this dataset. This is still an important result because it shows that better sales prediction likely requires more useful features, such as advertising, discounts, seasonality, or brand strength.